Python BytesParser.walkの例、email.parser.BytesParser.walk Pythonの例

コード例 #1

0

ファイルを表示

ファイル: envelope.py プロジェクト: sk1p/python-slimta

    def _encode_parts(self, header_data, msg_data, encoder):
        """Apply ``encoder`` to every leaf MIME part whose payload is not ASCII.

        The header and body are concatenated and parsed into a message. Each
        non-multipart part whose payload cannot be encoded as ASCII has its
        ``Content-Transfer-Encoding`` header deleted and is then passed to
        ``encoder``. Finally the parsed message is handed to
        ``self.parse_msg``.

        :type header_data: :py:obj:`bytes`
        :type msg_data: :py:obj:`bytes`
        :param encoder: Callable invoked with each part that is not pure ASCII.
        """
        self.headers = None
        self.message = None

        if six.PY3:
            parser, parse_method = BytesParser(), 'parsebytes'
        else:
            parser, parse_method = Parser(), 'parsestr'
        msg = getattr(parser, parse_method)(header_data + msg_data)

        for part in msg.walk():
            if part.is_multipart():
                continue
            payload = part.get_payload()
            try:
                payload.encode('ascii')
            except UnicodeError:
                # Remove the old header before handing the part to the encoder.
                del part['Content-Transfer-Encoding']
                encoder(part)

        self.parse_msg(msg)

コード例 #2

0

ファイルを表示

ファイル: server.py プロジェクト: dsynkd/iris.vim

def _get_email_content(uid, data):
    content = dict(text=None, html=None, attachments=[])
    email = BytesParser(policy=policy.default).parsebytes(data)

    for part in email.walk():
        if part.is_multipart():
            continue

        if part.is_attachment():
            content['attachments'].append(_read_attachment(part, uid))
            continue

        if part.get_content_type() == 'text/plain':
            content['text'] = _read_text(part)
            continue

        if part.get_content_type() == 'text/html':
            content['html'] = _read_html(part, uid)
            continue

    if content['html'] and not content['text']:
        tmp = open(content['html'], 'r')
        content['text'] = tmp.read()
        tmp.close()

    return content

コード例 #3

0

ファイルを表示

ファイル: FITEM_5.py プロジェクト: youssriaboelseod/omflow

def get_email(num, conn):
    """Fetch message ``num`` and return its sender, subject, body and files.

    :param num: Message identifier passed to ``conn.fetch``.
    :param conn: Connection object exposing ``fetch(num, '(RFC822)')``.
    :return: Dict with the keys ``From``, ``Subject`` and ``File`` (a list of
        ``UploadedFile`` objects, one per part that has a filename). ``Body``
        is present only when the message has a ``text/plain`` part; when
        there are several, the last one wins.
    """
    result = {}
    _typ, content = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(content[0][1])
    sub = msg.get('Subject')
    from_ = msg.get("From")
    # Body details
    result["From"] = decode_str(from_, "From")
    result["Subject"] = decode_str(sub, "Subject")
    result["File"] = []
    for part in msg.walk():
        if part.get_content_type() == "text/plain":
            body = part.get_payload(decode=True)
            # A part without a declared charset reports None, and
            # bytes.decode(None) raises TypeError, so fall back to UTF-8
            # (a superset of the us-ascii default).
            charset = part.get_content_charset() or "utf-8"
            result["Body"] = body.decode(charset)
        file_name = part.get_filename()
        if file_name is not None:
            name = decode_str(file_name, "File")
            attachment = part.get_payload(decode=True)
            content_type = part.get_content_type()
            new_file = ContentFile(attachment)
            file_obj = UploadedFile(new_file, name, content_type,
                                    new_file.size, None, None)
            result["File"].append(file_obj)

    return result

コード例 #4

0

ファイルを表示

ファイル: etm-n.py プロジェクト: 0x024/etm

def get_content(num):
    """Fetch message ``num`` and dispatch each leaf part by date format.

    The payload of every non-multipart part is decoded as GBK and passed to
    ``get_transfer_v1`` or ``get_transfer_v2`` depending on whether
    ``time_formate`` returns ``'1'`` or ``'2'`` for the message date.
    ``TypeError`` and ``UnicodeDecodeError`` raised while processing are
    reported with a printed message instead of propagating.
    """
    print(num)
    _status, data = raw_conn.fetch(num, '(RFC822)')
    email_date = get_date(email_list[int(count)])
    try:
        msg = BytesParser().parsebytes(data[0][1])
        for part in msg.walk():
            if part.is_multipart():
                continue
            content = part.get_payload(decode=True).decode('GBK')
            temp = time_formate(email_date)
            print(temp)
            if temp == '1':
                print(temp)
                get_transfer_v1(content)
            elif temp == '2':
                print(temp)
                get_transfer_v2(content)
    except TypeError:
        print('empty-email')
    except UnicodeDecodeError:
        print('hahah')

コード例 #5

0

ファイルを表示

    def fetch_and_parse(uids):
        ''' fetches and parses the emails with the given uids

        Each successfully fetched message becomes a dict holding the parsed
        ``Date`` (a ``datetime``), the ``From``, ``To``, ``Delivered-To``,
        ``Message-ID`` and ``Subject`` headers, and the ``plain`` and ``html``
        bodies (``None`` when absent). Messages whose fetch reply is not
        ``'OK'`` are skipped. No limit on the number of messages is enforced
        here.
        '''

        result = []

        for uid in uids:
            reply, email_data = imap_server.uid('fetch', uid, '(RFC822)')
            if reply != 'OK':
                continue

            raw_email = email_data[0][1]
            email = BytesParser(policy=default).parsebytes(raw_email)
            email_dict = {
                'Date': datetime.strptime(
                    email['Date'], '%a, %d %b %Y %H:%M:%S %z'),
            }
            for header in ('From', 'To', 'Delivered-To', 'Message-ID',
                           'Subject'):
                email_dict[header] = email[header]
            email_dict['plain'] = None
            email_dict['html'] = None

            for part in email.walk():
                # get_body() returns None for parts marked as attachments, so
                # a text attachment used to raise AttributeError. Attachments
                # are not message bodies; skip them.
                if part.is_attachment():
                    continue
                content_type = part.get_content_type()
                if content_type == 'text/html':
                    email_dict['html'] = part.get_content()
                elif content_type == 'text/plain':
                    email_dict['plain'] = part.get_content()
            result.append(email_dict)

        return result

コード例 #6

0

ファイルを表示

ファイル: envelope.py プロジェクト: thestick613/python-slimta

    def _encode_parts(self, header_data, msg_data, encoder):
        """Run ``encoder`` over each leaf part that contains non-ASCII data.

        The concatenated header and body are parsed, every non-multipart part
        is probed by encoding its payload as ASCII, and parts that fail the
        probe lose their ``Content-Transfer-Encoding`` header before being
        passed to ``encoder``. The message is then given to
        ``self.parse_msg``.

        :type header_data: :py:obj:`bytes`
        :type msg_data: :py:obj:`bytes`
        :param encoder: Callable applied to each part that fails the probe.
        """
        self.headers = None
        self.message = None

        if six.PY3:
            msg = BytesParser().parsebytes(header_data + msg_data)
        else:
            msg = Parser().parsestr(header_data + msg_data)

        for part in msg.walk():
            if part.is_multipart():
                continue
            payload = part.get_payload()
            try:
                payload.encode('ascii')
            except UnicodeError:
                del part['Content-Transfer-Encoding']
                encoder(part)

        self.parse_msg(msg)

コード例 #7

0

ファイルを表示

    def __init__(self, data, group=None):
        """
        Parse a raw email and keep its subject, body and single attachment.

        Raises ``InvalidMessageError`` unless exactly one attachment is found.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python
        """

        Loggable.__init__(self, group=group)

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        self.log("info", 'Importing email: "{}"'.format(self.subject))

        attachments = []
        for part in message.walk():

            disposition_header = part.get("Content-Disposition")
            if not disposition_header:
                continue

            fields = disposition_header.strip().split(";")
            if len(fields) < 2:
                continue

            # Keep parts that are declared attachments or whose second
            # disposition field mentions a filename.
            is_attachment = fields[0].lower() == "attachment"
            if not (is_attachment or "filename" in fields[1].lower()):
                continue

            decoded = b64decode(part.get_payload())
            attachments.append(
                Attachment(decoded, content_type=part.get_content_type()))

        if not attachments:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically.")

        self.attachment = attachments[0]

コード例 #8

0

ファイルを表示

def get_content(num):
    """Fetch message ``num`` and print each leaf part decoded as GBK."""
    print(num)
    _status, data = raw_conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(data[0][1])
    for part in msg.walk():
        if part.is_multipart():
            continue
        raw = part.get_payload(decode=True)
        print(raw.decode('GBK'))

コード例 #9

0

ファイルを表示

ファイル: deprecated.py プロジェクト: Prajakta16/Spam-email-classifier

def email_parser(email_file):
    """Extract a body from the email stored at ``email_file``.

    For a single-part ``text/plain`` message the message object itself is
    returned. For single-part ``text/html`` the result of ``html_parse`` on
    the rendered body is returned. For multipart messages the parts are
    walked, progress is printed, and the first ``text/plain`` payload (bytes)
    ends the search. HTML bodies found on the way are used when no plain part
    follows.

    :param email_file: Path of the message file to read.
    :return: The extracted body, or ``None`` when no body could be found
        (previously this case raised ``UnboundLocalError``).
    """
    # Default result for every path that never assigns a body.
    body = None
    with open(email_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg.is_multipart():
        if msg.get_content_maintype() == "text":
            subtype = msg.get_content_subtype()
            if subtype == "plain":
                # preferencelist is a sequence of subtypes. A bare string
                # only worked through substring matching.
                body = msg.get_body(preferencelist=('plain',))
            elif subtype == "html":
                body = msg.get_body(preferencelist=('html',))
                # Drop the first three lines of the rendered part before
                # handing the remainder to the HTML-to-text helper.
                html_body = '\n'.join(str(body).split("\n")[3:])
                body = html_parse(html_body)
            else:
                print("Don't know if html or text {}".format(subtype))
        return body

    print("Email is multipart")
    for i, part in enumerate(msg.walk(), start=1):
        print("part " + str(i))
        cdispo = str(part.get('Content-Disposition'))
        print(cdispo)
        content_type = part.get_content_type()
        print(content_type)
        print(part.get_content_subtype())
        if content_type in ('multipart/alternative', 'multipart/related'):
            body = part.get_body(preferencelist=('html',))
            print("----Body from get_body()-------")
            print(body)
            html_body = '\n'.join(str(body).split("\n")[3:])
            print("----Parsed text through beautiful soup-------")
            body = html_parse(html_body)
            print(body)
        if content_type == 'text/plain':
            body = part.get_payload(decode=True)  # decode
            print(body)
            break
    return body

コード例 #10

0

ファイルを表示

ファイル: email.py プロジェクト: larsborn/refinery

    def _get_parts_regular(self, data):
        """Yield the header parts, then one ``UnpackResult`` per decodable part.

        Parts whose decoded payload is ``None`` are skipped. Parts without a
        filename are named ``BODY.<EXT>``, where the extension is derived from
        the content subtype through ``file_extension`` (default ``"TXT"``).
        """
        msg = BytesParser().parsebytes(data)

        yield from self._get_headparts(msg.items())

        for part in msg.walk():
            name = part.get_filename()
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            if name is None:
                extension = file_extension(part.get_content_subtype(), "TXT").upper()
                name = F'BODY.{extension}'
            yield UnpackResult(name, payload)

コード例 #11

0

ファイルを表示

ファイル: getmail.py プロジェクト: 0x024/etm

def get_content(num):
    """Fetch message ``num`` and write each leaf part to ``messy.log``.

    The log file is opened in write mode for every part, so after the call
    it holds only the last part written.
    """
    print(num)
    _status, data = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(data[0][1])
    for part in msg.walk():
        if part.is_multipart():
            continue
        text = part.get_payload(decode=True).decode('GBK')
        with open("messy.log", "w") as log_file:
            log_file.write(text)

コード例 #12

0

ファイルを表示

ファイル: mail.py プロジェクト: XanderDwyl/paperless

    def __init__(self, data, verbosity=1):
        """
        Parse a raw email and keep its subject, body and single attachment.

        Raises ``InvalidMessageError`` unless exactly one attachment is found.

        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python
        """

        self.verbosity = verbosity

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        Log.info(
            'Importing email: "{}"'.format(self.subject), Log.COMPONENT_MAIL)

        attachments = []
        for part in message.walk():

            disposition_header = part.get("Content-Disposition")
            if not disposition_header:
                continue

            # Only the first disposition field decides whether to keep a part.
            kind = disposition_header.strip().split(";")[0].lower()
            if kind != "attachment":
                continue

            decoded = b64decode(part.get_payload())
            attachments.append(
                Attachment(decoded, content_type=part.get_content_type()))

        if not attachments:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]

コード例 #13

0

ファイルを表示

ファイル: api.py プロジェクト: dtrckd/iris.vim

def download_attachments(dir, uid, data):
    """Save every attachment of a raw email into ``dir``.

    The filename comes from the (untrusted) message, so only its final path
    component is used. A name such as ``../../x`` or an absolute path can no
    longer escape ``dir``. Attachments without a usable filename or without a
    decodable payload are skipped instead of raising ``TypeError``.

    :param dir: Target directory; ``~`` is expanded.
    :param uid: Unused by this function; kept for interface compatibility.
    :param data: Raw message bytes.
    :return: List of the file names written inside ``dir``.
    """
    attachments = []
    email = BytesParser(policy=policy.default).parsebytes(data)
    target_dir = os.path.expanduser(dir)

    for part in email.walk():
        if not part.is_attachment():
            continue

        raw_name = part.get_filename()
        payload = part.get_payload(decode=True)
        if raw_name is None or payload is None:
            continue

        # Normalise Windows separators, then drop any directory components.
        attachment_name = os.path.basename(raw_name.replace("\\", "/"))
        if attachment_name in ("", ".", ".."):
            continue

        # The context manager closes the file even if write() fails.
        with open(os.path.join(target_dir, attachment_name), "wb") as attachment:
            attachment.write(payload)
        attachments.append(attachment_name)

    return attachments

コード例 #14

0

ファイルを表示

def process_email(file_name):
    """Return the decoded text of the first part whose subtype is ``html``.

    The message stored in ``file_name`` is parsed and walked. The payload of
    the first matching part is decoded with the part's charset (UTF-8 when
    none is declared), replacing undecodable bytes. Returns ``None`` when no
    part matches.
    """

    with open(file_name, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
        for part in msg.walk():
            debug(f"{part=}")
            subtype = part.get_content_subtype()
            if subtype != "html":
                continue
            debug(f"part is HTML: %s" % subtype)
            charset = part.get_content_charset(failobj="utf-8")
            raw = part.get_payload(decode=True)
            return raw.decode(charset, "replace")

コード例 #15

0

ファイルを表示

def get_content(num):
    """Fetch message ``num`` and pass each leaf part to ``get_transfer_v1``.

    Each non-multipart payload is decoded as GBK first. ``TypeError`` and
    ``UnicodeDecodeError`` raised while processing are reported with a
    printed message instead of propagating.
    """
    print(num)
    _status, data = raw_conn.fetch(num, '(RFC822)')
    try:
        msg = BytesParser().parsebytes(data[0][1])
        for part in msg.walk():
            if part.is_multipart():
                continue
            text = part.get_payload(decode=True).decode('GBK')
            get_transfer_v1(text)
    except TypeError:
        print('empty-email')
    except UnicodeDecodeError:
        print('hahah')

コード例 #16

0

ファイルを表示

def decode_eml(dir, filename):  # './xxx/'
    """Print the headers and text of an .eml file and save its attachments.

    The file ``dir + filename`` is parsed twice: once in binary mode with the
    default policy to print ``From``, ``To`` and ``Subject``, and once in
    text mode to walk the MIME parts. A part that carries a ``filename``
    parameter is written to ``dir`` under that name; if the file cannot be
    created, the data is written to ``tmp`` in the current directory instead.
    Other leaf parts are printed as UTF-8 text.

    :param dir: Directory prefix, concatenated directly with ``filename``
        (so it should end with a path separator).
    :param filename: Name of the .eml file inside ``dir``.
    """
    path = dir + filename
    print(
        '-------------------------------------------------------------------')
    print('Decoding: ' + path + "\n")
    # Both handles are closed by context managers. The binary handle used to
    # leak when ``fp`` was rebound.
    with open(path, 'rb') as fp:  # b => bytes
        msg = BytesParser(policy=policy.default).parse(fp)
    # str.format tolerates a missing header (None), where '+' raised TypeError.
    print('From: {}'.format(msg.get('From')))
    print('To: {}'.format(msg.get('To')))
    print('Subject: {}\n'.format(msg.get('Subject')))

    with open(path, 'r') as fp:
        msg = email.message_from_file(fp)
    for par in msg.walk():  # for every MIME part
        if par.is_multipart():
            continue
        print('content_type: {}'.format(par.get('Content-Type')))
        name = par.get_param('filename')
        if not name:
            print(
                '文本内容: ',
                str(par.get_payload(decode=True), encoding='utf-8') + '\n')
            continue

        # Round-trip through Header/decode_header to obtain the name as
        # bytes, then decode it as UTF-8 to get the attachment name.
        fname = str(decode_header(Header(name))[0][0], encoding='utf-8')
        print('附件:', fname + '\n')
        data = par.get_payload(decode=True)
        try:
            # Attachments are usually binary, so always write in 'wb' mode.
            with open(dir + fname, 'wb') as f:
                f.write(data)
        except (OSError, ValueError):
            # The name is not usable as a file name on this system.
            print('error: 附件名含非法字符，存为tmp')
            with open('tmp', 'wb') as f:
                f.write(data)

    print(
        '--------------------------------End--------------------------------\n'
    )

コード例 #17

0

ファイルを表示

ファイル: concrete_file_preprocessors.py プロジェクト: EveryTimeIWill18/Unstructured_Pipeline_05_23_2019

    def extract_text(self, current_file) -> dict:
        """Extract the normalised text of the first HTML part of an email.

        Only multipart messages are processed. The text nodes of the first
        ``text/html`` part are joined, stripped of punctuation, lower-cased,
        whitespace-normalised with the module's ``SPACES``/``NEWLINE``/``TABS``
        patterns, and non-ASCII characters are replaced by spaces. The result
        is stored in ``self.mapping_dict`` under the file's base name.

        :param current_file: Path of the .eml file.
        :return: ``{basename: text}`` on success, otherwise ``None``. On any
            exception the base name is recorded once in ``self.error_files``
            and ``self.error_file_counter`` is incremented.
        """
        file_name = os.path.basename(current_file)
        try:
            with open(current_file, 'rb') as eml_f:
                msg = BytesParser(policy=policy.default).parse(eml_f)
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() != 'text/html':
                            continue
                        soup = BeautifulSoup(part.get_content(),
                                             'html.parser')
                        body = soup.findAll(text=True)  # extract the text

                        # process the text list into a formatted string
                        body = ' '.join(body) \
                            .translate(str.maketrans('', '', string.punctuation)) \
                            .lower()
                        body = SPACES.sub(" ", body)
                        body = NEWLINE.sub("", body)
                        body = TABS.sub(" ", body)
                        body = ''.join(
                            [i if ord(i) < 128 else ' ' for i in body])
                        #NOTE: update for dms_claims project (5/17/19)
                        # Placeholder: updating with an empty dict changes
                        # nothing.
                        if self.project == 'dms_claims':
                            self.mapping_dict.update({})
                        #NOTE: END//

                        self.mapping_dict.update({file_name: body})
                        self.file_counter += 1
                        return {file_name: body}
        except Exception:  # added: 5/1/2019 (OSError included)
            # error_files stores base names, so the duplicate check must use
            # the base name too. Comparing the full path never matched and
            # counted the same file repeatedly.
            if file_name not in self.error_files:
                self.error_file_counter += 1
                self.error_files.append(file_name)  # added: 4/16/2019

コード例 #18

0

ファイルを表示

def get_email(num, conn):
    """Fetch message ``num`` and return its first leaf part as UTF-8 text.

    Before returning, the message number and the decoded ``Subject``,
    ``X-Sender`` and ``Date`` headers are printed. Returns ``None`` when the
    message has no non-multipart part.
    """
    _typ, content = conn.fetch(num, '(RFC822)')
    msg = BytesParser().parsebytes(content[0][1])
    sub = msg.get('Subject')
    sender = msg.get('X-Sender')
    date = msg.get('Date')

    first_leaf = next(
        (part for part in msg.walk() if not part.is_multipart()), None)
    if first_leaf is None:
        return None

    print(num, decode_str(sub), decode_str(sender), decode_str(date))
    return first_leaf.get_payload(decode=True).decode('utf-8')

コード例 #19

0

ファイルを表示

ファイル: lpserver_changed.py プロジェクト: pavelvizir/zmq_playground

def parse_email(raw_email_decoded):
    ''' parse email

    Returns a dict holding the parsed ``Date`` (a ``datetime``), the
    ``From``, ``To``, ``Delivered-To``, ``Message-ID`` and ``Subject``
    headers, and the ``plain`` and ``html`` bodies (``None`` when absent).
    Parts marked as attachments are never used as a body.

    :param raw_email_decoded: The complete message as ``str``.
    '''

    raw_email = raw_email_decoded.encode()
    email = BytesParser(policy=default).parsebytes(raw_email)
    email_dict = {
        'Date': datetime.strptime(email['Date'],
                                  '%a, %d %b %Y %H:%M:%S %z'),
    }

    for header in ('From', 'To', 'Delivered-To', 'Message-ID', 'Subject'):
        email_dict[header] = email[header]
    email_dict['plain'] = None
    email_dict['html'] = None

    for part in email.walk():
        # get_body() returns None for attachment parts, so a text attachment
        # used to raise AttributeError here. Skip attachments explicitly.
        if part.is_attachment():
            continue
        content_type = part.get_content_type()
        if content_type == 'text/html':
            email_dict['html'] = part.get_content()
        elif content_type == 'text/plain':
            email_dict['plain'] = part.get_content()

    return email_dict

コード例 #20

0

ファイルを表示

ファイル: email.py プロジェクト: binref/refinery

    def _get_parts_regular(self, data):
        """Yield the header parts, then one ``UnpackResult`` per message part.

        Raises ``ValueError`` when ``data`` contains anything other than
        whitespace and printable ASCII (``!`` through ``~``). Parts with a
        filename are placed under ``attachments/``; other parts are named
        ``body.<extension>``. Parts whose content is empty or cannot be
        obtained are skipped, with a warning when an error was recorded.
        """
        if not re.match(BR'^[\s!-~]+$', data):
            raise ValueError('This is not a plaintext email message.')

        msg = BytesParser().parsebytes(data)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            # Error text to report if this part ends up without data.
            elog = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as E:
                # Decoding failed: fall back to the undecoded payload and try
                # to decode it manually below.
                try:
                    data = part.get_payload(decode=False)
                except Exception as E:
                    elog = str(E)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        # NOTE(review): presumably carves the first base64
                        # blob out of the payload and decodes it -- confirm
                        # against refinery's carve unit.
                        data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                    else:
                        # E is still the exception from the decode=True
                        # attempt here.
                        elog = str(E)
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)

コード例 #21

0

ファイルを表示

ファイル: api.py プロジェクト: dtrckd/iris.vim

def get_email_content(uid, data):
    """Collect the plain-text and HTML bodies of a raw email.

    Every non-multipart part of the parsed message is inspected.
    ``text/plain`` parts are handled by ``read_text`` and ``text/html`` parts
    by ``read_html``. When several parts share a type, the last one wins.

    :param uid: Identifier forwarded to ``read_html``.
    :param data: Raw message bytes.
    :return: Dict with the keys ``text`` and ``html``.
    """
    content = dict(text=None, html=None)
    email = BytesParser(policy=policy.default).parsebytes(data)

    for part in email.walk():
        if part.is_multipart():
            continue

        content_type = part.get_content_type()
        if content_type == "text/plain":
            content["text"] = read_text(part)
        elif content_type == "text/html":
            content["html"] = read_html(part, uid)

    if content["html"] and not content["text"]:
        # content["html"] is opened as a file path here; fall back to that
        # file's contents when the message carried no plain-text body.
        # The context manager closes the handle even if read() raises.
        with open(content["html"], "r") as html_file:
            content["text"] = html_file.read()

    return content

コード例 #22

0

ファイルを表示

ファイル: common.py プロジェクト: depeche-protocol/dede

def parse_message(db: SqliteStorage, crypto: ProviderNaCl, message: StoredMessage):
    """
    Decrypt a stored message and split it into text and protocol attachments.

    Returns a tuple ``(msg_string, address_pad_req, address_pad)``: the
    cleartext rendered with its From/To headers, the address pad request and
    the address pad found in the message (each ``None`` when absent). When no
    private key is available for the message's header address, all three
    elements are ``None``.
    """
    key_id, private_key = db.get_own_address_nacl_key(message.header_address)
    if not private_key:
        # Without a key the message cannot be decrypted; exit early.
        return (None, None, None)

    cleartext = crypto.decrypt(message.contents, private_key)

    # Cleartext is supposed to be a MIME formatted message
    msg = BytesParser(policy=policy.default).parsebytes(cleartext)
    text_parts = []
    pad = None
    pad_request = None

    for part in msg.walk():
        # Multiple address pads / requests in the same message are not
        # supported: a later one replaces an earlier one.
        if part.get_content_type() == 'application/json':
            if part['Content-Description'] == NodeIntercom.address_pad_request_description:
                pad_request = NodeIntercom.AddressPadRequest.deserialize(part.get_content())
            if part['Content-Description'] == NodeIntercom.address_pad_description:
                pad = NodeIntercom.AddressPad.deserialize(part.get_content())
            continue

        # Wrappers and other application payloads carry no message text.
        if part.get_content_maintype() in ('multipart', 'application'):
            continue

        text_parts.append(part.get_content())

    msg_string = "From: {0}\nTo: {1}\n\n{2}".format(msg['from'], msg['to'], "\n".join(text_parts))
    return (msg_string, pad_request, pad)

コード例 #23

0

ファイルを表示

ファイル: parser.py プロジェクト: pavelvizir/project

def parse_email(raw_emails):
    ''' parse email

    Takes an iterable of ``(uid, length, raw_headers, raw_email)`` tuples of
    bytes and returns one dict per message with the keys ``uid``, ``length``,
    ``Date``, ``metadata``, ``plain``, ``html`` and ``attachments``.
    '''
    wanted_headers = ('From', 'To', 'Delivered-To', 'Message-ID', 'Subject')
    emails = []

    for uid, length, raw_headers, raw_email in raw_emails:
        email = BytesParser(policy=default).parsebytes(raw_email)
        headers = BytesParser(policy=default).parsebytes(raw_headers)
        email_dict = {
            'uid': uid.decode(),
            'length': length.decode(),
            'Date': datetime.strptime(headers['Date'],
                                      '%a, %d %b %Y %H:%M:%S %z'),
            'metadata': {name: headers[name] for name in wanted_headers},
            'plain': None,
            'html': None,
            'attachments': [],
        }

        for part in email.walk():
            if part.is_attachment():
                email_dict['attachments'].append({
                    'MIME': part.get_content_type(),
                    'filename': part.get_filename(),
                    'body': part.get_content(),
                })
                continue

            content_type = part.get_content_type()
            if content_type == 'text/html':
                email_dict['html'] = part.get_body().get_content()
            elif content_type == 'text/plain':
                email_dict['plain'] = part.get_body().get_content()

        emails.append(email_dict)

    return emails

コード例 #24

0

ファイルを表示

    def extract_text(self, current_file: str) -> dict:
        """Extract normalised text from the HTML content of an .eml file.

        For a multipart message the first ``text/html`` part found by
        ``walk()`` is used; for a single-part message the message itself is
        used when its type is ``text/html``. The text nodes are joined,
        stripped of punctuation, lower-cased, cleaned with the ``SPACES``,
        ``NEWLINE`` and ``TABS`` patterns, and non-ASCII characters are
        replaced by spaces.

        Returns ``{basename: text}`` when the file is newly added to
        ``self.mapping_dict``. Despite the annotation, it returns a message
        string when the HTML has no text nodes or the file is already in the
        mapping, and ``None`` when no HTML is found or an exception was
        handled. Failures are recorded in ``self.error_files`` and
        ``self.error_file_counter``.
        """
        try:
            with open(current_file, 'rb') as eml_file:
                #logger.info(info=f'Eml file: {os.path.basename(current_file)}')
                msg = BytesParser(policy=policy.default).parse(eml_file)
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == 'text/html':
                            soup = BeautifulSoup(part.get_content(),
                                                 'html.parser')
                            body = soup.findAll(text=True)  # extract the text

                            # check if the body of the eml file is None or 0
                            if not body:
                                self.error_file_counter += 1
                                self.error_files.append(
                                    os.path.basename(current_file))
                                return f"No text body in email: {os.path.basename(current_file)}"
                            else:
                                # process the text list into a formatted string
                                body = ' '.join(body) \
                                    .translate(str.maketrans('', '', string.punctuation)) \
                                    .lower()
                                body = SPACES.sub(" ", body)
                                body = NEWLINE.sub("", body)
                                body = TABS.sub(" ", body)
                                body = ''.join(
                                    [i if ord(i) < 128 else ' ' for i in body])
                                #print(f"body := {body}")
                                # UPDATE: added 6/20/2019
                                if len(body) == 0:
                                    # no text was extracted from this file; add to error files list
                                    # (execution still continues to the mapping update below)
                                    self.error_file_counter += 1
                                    self.error_files.append(
                                        os.path.basename(current_file))
                                    logger.error(
                                        error=
                                        f"Eml file: {os.path.basename(current_file)} has no text body."
                                    )

                            # update the mapping dict if the file is not currently in the mapping dictionary
                            if os.path.basename(
                                    current_file
                            ) not in self.mapping_dict.keys():
                                self.mapping_dict[os.path.basename(
                                    current_file)] = body
                                self.file_counter += 1
                                return {os.path.basename(current_file): body}
                            else:
                                return f"Eml File: {os.path.basename(current_file)} has already been read in."
                else:
                    # UPDATE: added 6/20/2019
                    # if email is not multipart, we can extract the text directly
                    try:
                        if msg.get_content_type() == 'text/html':
                            soup = BeautifulSoup(msg.get_content(),
                                                 'html.parser')
                            body = soup.findAll(text=True)  # extract the text
                            # process the text list into a formatted string
                            body = ' '.join(body) \
                                .translate(str.maketrans('', '', string.punctuation)) \
                                .lower()
                            body = SPACES.sub(" ", body)
                            body = NEWLINE.sub("", body)
                            body = TABS.sub(" ", body)
                            body = ''.join(
                                [i if ord(i) < 128 else ' ' for i in body])
                            #print(f"body := {body}")
                            # update the mapping dict if the file is not currently in the mapping dictionary
                            if os.path.basename(
                                    current_file
                            ) not in self.mapping_dict.keys():
                                self.mapping_dict[os.path.basename(
                                    current_file)] = body
                                self.file_counter += 1
                                return {os.path.basename(current_file): body}
                            else:
                                return f"Eml File: {os.path.basename(current_file)} has already been read in."
                    except Exception as e:
                        # NOTE: *added 06/28/2019*
                        # Failures in the single-part path are recorded here
                        # and do not reach the outer handler.
                        self.error_file_counter += 1
                        self.error_files.append(os.path.basename(current_file))
                        logger.error(
                            error=
                            f'Eml file: {os.path.basename(current_file)} could not be text mined.'
                        )
                        logger.error(error=e)
        except (OSError, Exception) as e:
            # update the error file information
            # (OSError is a subclass of Exception, so this catches every Exception)
            self.error_file_counter += 1
            self.error_files.append(os.path.basename(current_file))
            logger.error(
                error=
                f'Eml file: {os.path.basename(current_file)} could not be text mined.'
            )
            logger.error(error=e)

コード例 #25

0

ファイルを表示

class EmlParser():
    """Read-only accessor for the interesting pieces of a parsed ``.eml`` file.

    The message is parsed once in ``__init__`` with ``policy.default`` and
    kept in ``self.message``; every getter derives its result from it.
    """

    def __init__(self, fileName):
        """Parse the bytes returned by ``readFile(fileName)`` into a message."""
        self.message = BytesParser(policy=policy.default).parsebytes(
            readFile(fileName))

    def getId(self):
        """Return ``getHashOfItem`` applied to the parsed message."""
        return getHashOfItem(self.message)

    def _iter_attachment_filenames(self):
        """Yield ``(filename, part)`` for each ``filename=`` parameter found
        on a part whose Content-Disposition is ``attachment``.

        The header is split on ``;`` by hand; surrounding single or double
        quotes are stripped from the file name.
        """
        for part in self.message.walk():
            if 'content-disposition' not in part:
                continue

            cdisp = [x.strip() for x in part['content-disposition'].split(';')]

            if cdisp[0].lower() != 'attachment':
                continue

            for kv in cdisp[1:]:
                if not kv.startswith('filename='):
                    continue

                val = kv.partition('=')[2]

                if val.startswith('"'):
                    val = val.strip('"')
                elif val.startswith("'"):
                    val = val.strip("'")

                yield val, part

    def getAttachmentData(self, name):
        """Return the decoded payload of the first attachment named *name*,
        or ``None`` when no attachment carries that file name."""
        for val, part in self._iter_attachment_filenames():
            if name == val:
                return part.get_payload(decode=True)

        return None

    def getAttachmentNames(self):
        """Return the file names of all attachments, in walk order."""
        return [val for val, _ in self._iter_attachment_filenames()]

    def getPayloadHtml(self):
        """Return the HTML body as text, or ``''`` when there is none."""
        # get_body expects a sequence of subtypes; a bare string only works
        # by accident through substring matching.
        body = self.message.get_body(('html',))

        if body:
            return self._decode_body(body.get_payload(decode=True))

        return ''

    def getPayloadPlain(self):
        """Return the plain-text body as text, or ``''`` when there is none."""
        body = self.message.get_body(('plain',))

        if body:
            return self._decode_body(body.get_payload(decode=True))

        return ''

    def getSender(self):
        """Return ``extractEmails`` applied to the From header text."""
        return extractEmails(str(self.message['from']))

    def getReceivers(self):
        """Return ``extractEmails`` applied to the To header text."""
        return extractEmails(str(self.message['to']))

    def getSubject(self):
        """Return the Subject header with RFC 2047 encoded words decoded."""
        return self._decode_entry(self.message['Subject'])

    def getDate(self):
        """Return the Date header formatted as ``"<date> <time>"``.

        NOTE(review): ``parse`` is defined outside this block (presumably
        ``dateutil.parser.parse``); a missing Date header is passed through
        to it unchanged.
        """
        dt = parse(self.message['Date'])
        return str(dt.date()) + " " + str(dt.time())

    def _decode_entry(self, entry):
        """Decode a header value that may contain RFC 2047 encoded words.

        ``None`` becomes ``''``.  ``decode_header`` reports unencoded chunks
        of a mixed header as bytes with charset ``None``; those (and chunks
        naming an unknown charset) are decoded as ASCII with replacement
        instead of raising.
        """
        if entry is None:
            return ''

        result = ''
        for value, charset in decode_header(entry):
            if isinstance(value, str):
                result += value
                continue
            try:
                result += value.decode(charset or 'ascii', 'replace')
            except LookupError:
                # The header names a codec Python does not know.
                result += value.decode('ascii', 'replace')

        return result

    def _decode_body(self, entry):
        """Decode body bytes as UTF-8, falling back to Latin-1."""
        try:
            entry = entry.decode('utf-8')
        except UnicodeDecodeError:
            # Latin-1 maps every byte, so this fallback cannot fail.
            entry = entry.decode('latin-1')

        return entry

コード例 #26

0

ファイルを表示

ファイル: parser.py プロジェクト: ww9rivers/c9r

    def __call__(self, content):
        '''Parse an email message in "content", which is bytes or a string.

        /content/       Standard encoded email message content.

        Returns the parsed message as a dict with the keys subject, date,
        received, body, html, from, to, cc, bcc and attachments.  "body" and
        "html" are text (str) concatenated over all text/plain and text/html
        parts, or None when the message has no such part.
        '''
        if isinstance(content, bytes):
            msgobj = BytesParser().parsebytes(content)
        else:
            msgobj = StrParser().parse(StringIO(content))
        subject = parse_header('Subject', msgobj)
        date = parse_header('Date', msgobj)
        received = []
        for part in (msgobj.get_all('Received') or []):
            # re_received.split() alternates separators and values; pair
            # each separator with the stripped text that follows it.
            lx = self.re_received.split(part)
            tmp = dict(zip(lx[1::2], [x.strip() for x in lx[2::2]]))
            tx = tmp.get(';')
            if tx:
                tmp['time'] = parse_time(tx)
            received.append(tmp)
        fromaddr = parse_addr(msgobj, 'From')
        if date:
            date = date.replace(',', '')
        logger.debug('Parsing message: Date={0}, Subject={1}'.format(date, subject))
        #-------- Parsing attachments:
        attachments = []
        body = None
        html = None
        for part in msgobj.walk():
            attachment = parse_attachment(part)
            if attachment:
                attachments.append(attachment)
                continue
            # parse text content
            content_type = part.get_content_type()
            if content_type not in ('text/plain', 'text/html'):
                logger.debug('Ignored: Content_type "{0}" in message "{1}" from {2}, Date={3}'.format(content_type, subject, fromaddr, date))
                continue
            raw = part.get_payload(decode=True) or b''
            try:
                payload = str(raw, part.get_content_charset() or 'ascii', 'replace')
            except LookupError:
                # The part declares a charset Python has no codec for.
                payload = str(raw, 'ascii', 'replace')
            # Keep the decoded text as str: wrapping re-encoded bytes in
            # str() would store the "b'...'" repr instead of the content.
            if content_type == "text/plain":
                body = (body or '') + payload
            else:
                html = (html or '') + payload
        return {
            'subject' : subject,
            'date' : date,
            'received': received,
            # 'received': sorted(received, key=lambda k: k['time']),
            'body' : body,
            'html' : html,
            'from' : fromaddr,
            'to' : parse_addr(msgobj, 'To'),
            'cc' : parse_addr(msgobj, 'CC'),
            'bcc' : parse_addr(msgobj, 'BCC'),
            'attachments': attachments
            }

コード例 #27

0

ファイルを表示

ファイル: database.py プロジェクト: KajaDuff/flask_app_test

def display_eml(eml_filepath):
    """Parse an ``.eml`` file and return a summary dict for display.

    The returned dict has the keys "Odesílatel" (From), "Příjemce" (To),
    "Datum" (Date), "Předmět" (Subject), "Text zprávy" (content of the
    preferred plain-text body, or ``None`` when the message has none) and
    "Přílohy": a list of ``(params, part)`` tuples, one per part whose
    Content-Disposition is ``attachment``, where ``params`` maps each
    disposition parameter name to its unquoted value.
    """
    with open(eml_filepath, 'rb') as eml_file:
        msg = BytesParser(policy=policy.default).parse(eml_file)

    # get_body() returns None for messages without a text/plain body.
    plain_body = msg.get_body(preferencelist=('plain',))
    text = plain_body.get_content() if plain_body is not None else None

    found = []
    for part in msg.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = [x.strip() for x in part['content-disposition'].split(';')]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            # partition() splits on the first '=' only, so values that
            # contain '=' survive; fragments without '=' are skipped.
            key, sep, val = kv.partition('=')
            if not sep:
                continue
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))

    return {
        "Odesílatel": msg.get('From'),
        "Příjemce": msg.get('To'),
        "Datum": msg.get('Date'),
        "Předmět": msg.get('Subject'),
        "Text zprávy": text,
        "Přílohy": found,
    }

コード例 #28

0

ファイルを表示

def email_analysis(filename, exclude_private_ip):
    """Analyse an ``.eml`` file and print the findings as indented JSON.

    Collected: URLs found in text/plain and text/html parts, their
    registered domains, attachment file names, selected headers (From,
    Sender, Subject, X-Originating-IP) and the relay hops taken from the
    Received headers (oldest hop first).

    :param filename: path of the ``.eml`` file to read.
    :param exclude_private_ip: when true, private relay addresses are left
        out of ``relay_ip``.
    :return: ``None``; the result is written to stdout.
    """
    urlList = []
    domainList = []
    hopList = []
    hopListIP = []
    attachList = []
    data = {}
    data["data"] = []

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if msg:
        # Identify each url or attachment reported in the eMail body
        extractor = None
        for part in msg.walk():
            if part.get_content_type() in ("text/plain", "text/html"):
                # Build the extractor once, and only when a text part exists.
                if extractor is None:
                    extractor = URLExtract()
                urlList.extend(extractor.find_urls(part.get_content()))
            elif part.get_filename():
                attachList.append(part.get_filename())

        # Identify each domain reported in the eMail body
        for url in urlList:
            analyzeddomain = tldcache(url).registered_domain
            if analyzeddomain:
                domainList.append(analyzeddomain)

        # Remove Duplicate
        urlList = list(set(urlList))
        domainList = list(set(domainList))

        # A sender obfuscation technique puts two e-mail addresses in the
        # From header; only the last one is the real one, so the last
        # match is kept.  A From header without any address yields "".
        mail_from = ""
        if msg["From"]:
            from_addresses = re.findall(
                r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}",
                msg["From"], re.IGNORECASE)
            if from_addresses:
                mail_from = from_addresses[-1]

        mail_sender = msg["Sender"] or ""
        mail_subject = msg["Subject"] or ""
        mail_xorigip = msg["X-Originating-IP"] or ""

        data["data"].append({
            "Filename": os.path.basename(filename),
            "From": mail_from,
            "Sender": mail_sender,
            "Subject": mail_subject,
            "X-Originating-IP": mail_xorigip,
            "attachments": [],
            "relay_full": [],
            "relay_ip": [],
            "urls": [],
            "domains": []
        })

        # Identify each relay
        received = msg.get_all("Received")
        if received:
            # Headers are listed newest first; walk them oldest first.
            for line in reversed(received):
                hops = re.findall(
                    r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
                    line, re.DOTALL | re.X)
                for hop in hops:

                    ipv4_address = re.findall(r"[0-9]+(?:\.[0-9]+){3}", hop[0],
                                              re.DOTALL | re.X)

                    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
                    ipv6_address = re.findall(
                        r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
                        hop[0], re.DOTALL | re.X)

                    # Only the first IPv4 and the first IPv6 candidate of a
                    # hop are considered.
                    for candidates in (ipv4_address, ipv6_address):
                        if not candidates:
                            continue
                        candidate = candidates[0]
                        try:
                            parsed_ip = ipaddress.ip_address(candidate)
                        except ValueError:
                            # The regexes are loose (e.g. "999.1.1.1");
                            # skip anything that is not a real address.
                            continue
                        if parsed_ip.is_private and exclude_private_ip:
                            continue
                        hopListIP.append(candidate)

                    if hop[0]:
                        hopList.append(hop[0])

        # Each non-empty list is stored as a single dict numbered from 0.
        if attachList:
            data["data"][0]["attachments"].append(
                dict(enumerate(attachList)))

        if hopList:
            data["data"][0]["relay_full"].append(dict(enumerate(hopList)))

        if hopListIP:
            data["data"][0]["relay_ip"].append(dict(enumerate(hopListIP)))

        if urlList:
            data["data"][0]["urls"].append(dict(enumerate(urlList)))
            data["data"][0]["domains"].append(dict(enumerate(domainList)))

        print(json.dumps(data, indent=4))

コード例 #29

0

ファイルを表示

def email_analysis(filename, exclude_private_ip, check_spf):
    """Analyse an ``.eml`` file and print the findings as indented JSON.

    Header analysis extracts addresses from From, Sender, X-Sender, To, Cc
    and Envelope-to, copies Bcc, Delivered-To, Subject and
    X-Originating-IP, and reconstructs the relay hops from the Received
    headers (oldest hop first).  Body analysis collects URLs from
    text/plain and text/html parts, their registered domains, and the
    MD5/SHA1/SHA256 hashes of every named attachment.

    :param filename: path of the ``.eml`` file to read.
    :param exclude_private_ip: when true, ``relay_ip`` lists only the
        non-private relay addresses.
    :param check_spf: when true, each non-private relay address is checked
        with ``spf.check2`` against the From address until one passes; the
        boolean outcome is stored under ``spf``.
    :return: ``None``; the result is written to stdout.
    """
    urlList = []
    hopList = []
    hopListIP = []
    domainList = []
    attachmentsList = []
    hopListIPnoPrivate = []

    email_re = re.compile(
        r"[A-Za-z0-9.!#$%&'*+\/=?^_`{|}~\-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}",
        re.IGNORECASE)

    def numbered(items):
        """Return ``{0: first, 1: second, ...}`` for *items*."""
        return dict(enumerate(items))

    def unique(items):
        """Drop duplicates from *items*, keeping first-seen order."""
        return list(dict.fromkeys(items))

    resultmeioc = {
        "filename": os.path.basename(filename),
        "from": None,
        "sender": None,
        "x-sender": None,
        "to": None,
        "cc": None,
        "bcc": None,
        "envelope-to": None,
        "delivered-to": None,
        "subject": None,
        "x-originating-ip": None,
        "relay_full": None,
        "relay_ip": None,
        "spf": None,
        "urls": None,
        "domains": None,
        "attachments": None
    }

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if msg:

        #
        # Header analysis
        #

        # A sender obfuscation technique puts two e-mail addresses in one
        # header; only the last one is the real one, so for the
        # single-address headers the last match is kept.
        for header, key in (("From", "from"), ("Sender", "sender"),
                            ("X-Sender", "x-sender")):
            if msg[header]:
                matches = email_re.findall(msg[header])
                if matches:
                    resultmeioc[key] = matches[-1]

        # Multi-address headers: remove duplicates and number the rest.
        for header, key in (("To", "to"), ("Cc", "cc"),
                            ("Envelope-to", "envelope-to")):
            if msg[header]:
                matches = email_re.findall(msg[header])
                if matches:
                    resultmeioc[key] = numbered(unique(matches))

        if msg["Bcc"]:
            resultmeioc["bcc"] = msg["Bcc"]

        if msg["Delivered-To"]:
            resultmeioc["delivered-to"] = msg["Delivered-To"]

        if msg["X-Originating-IP"]:
            # Usually the IP is in square brackets, remove them if present.
            mail_xorigip = msg["X-Originating-IP"].replace("[", "").replace(
                "]", "")
            resultmeioc["x-originating-ip"] = mail_xorigip

        if msg["Subject"]:
            resultmeioc["subject"] = msg["Subject"]

        # Identify each relay
        received = msg.get_all("Received")
        if received:
            # Headers are listed newest first; walk them oldest first.
            for line in reversed(received):
                hops = re.findall(
                    r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
                    line, re.DOTALL | re.X)
                for hop in hops:

                    ipv4_address = re.findall(r"[0-9]+(?:\.[0-9]+){3}", hop[0],
                                              re.DOTALL | re.X)

                    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
                    ipv6_address = re.findall(
                        r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
                        hop[0], re.DOTALL | re.X)

                    for candidate in list(ipv4_address) + list(ipv6_address):
                        try:
                            parsed_ip = ipaddress.ip_address(candidate)
                        except ValueError:
                            # The regexes are loose (e.g. "999.1.1.1");
                            # skip anything that is not a real address.
                            continue
                        # NOTE(review): the original IPv6 test read
                        # `... and not "6::"`, which is always false and
                        # dropped every IPv6 hop.  It is interpreted here
                        # as "skip matches starting with '6::'", which the
                        # regex can produce from text such as "IPv6::1".
                        # Confirm this was the intent.
                        if parsed_ip.version == 6 and candidate.startswith(
                                "6::"):
                            continue
                        hopListIP.append(candidate)
                        if not parsed_ip.is_private:
                            hopListIPnoPrivate.append(candidate)

                    if hop[0]:
                        hopList.append(hop[0])

        if hopList:
            resultmeioc["relay_full"] = numbered(hopList)

        if hopListIP:
            if exclude_private_ip:
                resultmeioc["relay_ip"] = numbered(hopListIPnoPrivate)
            else:
                resultmeioc["relay_ip"] = numbered(hopListIP)

        #
        # Body analysis
        #
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                # https://gist.github.com/dperini/729294
                urlList.extend(
                    re.findall(
                        r"(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?",
                        part.get_content(),
                        re.UNICODE | re.IGNORECASE | re.MULTILINE))

            if part.get_content_type() == "text/html":
                # Best effort: a body with incorrect or unencoded HTML must
                # not abort the whole analysis.
                try:
                    soup = BeautifulSoup(part.get_content(), "html.parser")
                    for tag in soup.find_all("a", href=True):
                        urlList.append(tag.get("href"))
                except Exception:
                    pass

            attachment_name = part.get_filename()
            if attachment_name:
                # Decode once and reuse the bytes for all three digests.
                payload = part.get_payload(decode=True)
                if payload:
                    attachmentsList.append({
                        "filename": attachment_name,
                        "MD5": hashlib.md5(payload).hexdigest(),
                        "SHA1": hashlib.sha1(payload).hexdigest(),
                        "SHA256": hashlib.sha256(payload).hexdigest()
                    })

        # Identify each domain reported in the eMail body
        for url in urlList:
            analyzeddomain = tldcache(url).registered_domain
            if analyzeddomain:
                domainList.append(analyzeddomain)

        # Remove Duplicate
        urlList = unique(urlList)
        domainList = unique(domainList)

        if urlList:
            resultmeioc["urls"] = numbered(urlList)
            resultmeioc["domains"] = numbered(domainList)

        if attachmentsList:
            resultmeioc["attachments"] = attachmentsList

        #
        # Verify the SPF record if requested
        #
        if check_spf:
            testspf = False
            sender = resultmeioc["from"]
            if sender:
                sender_domain = sender.split("@")[1]
                for ip in hopListIPnoPrivate:
                    try:
                        resultspf = spf.check2(ip, sender, sender_domain)[0]
                    except Exception:
                        # A failed lookup counts as "not passed"; try the
                        # next relay.
                        continue
                    if resultspf == "pass":
                        testspf = True
                        break

            resultmeioc["spf"] = testspf

        print(json.dumps(resultmeioc, indent=4))

コード例 #30

0

ファイルを表示

ファイル: email.py プロジェクト: binref/refinery-test

 def _get_parts_regular(self, data):
     """Parse *data* as an email and wrap every walked part in an EmailPart.

     Each EmailPart receives the part's file name (as reported by
     ``get_filename()``) and its transfer-decoded payload.
     """
     message = BytesParser().parsebytes(data)
     collected = []
     for section in message.walk():
         name = section.get_filename()
         body = section.get_payload(decode=True)
         collected.append(EmailPart(name, body))
     return collected

コード例 #31

0

ファイルを表示

ファイル: pop_test.py プロジェクト: mmxhxj/NA_PYTHON

# Fetch the content of one message (the total count is passed in, so this
# retrieves the last message).
# Equivalent to sending the POP3 RETR command.
# resp holds the server's response code
# data holds the content of the message
resp, data, octets = conn.retr(len(mails))
# Join all of data (originally a list of bytes) into one bytes object
msg_data = b'\r\n'.join(data)
# Parse the content into an email message; policy=default must be given here
msg = BytesParser(policy=default).parsebytes(msg_data)  #①
print(type(msg))
print('发件人:' + msg['from'])
print('收件人:' + msg['to'])
print('主题:' + msg['subject'])
print('第一个收件人名字:' + msg['to'].addresses[0].username)
print('第一个发件人名字:' + msg['from'].addresses[0].username)
for part in msg.walk():
    # NOTE(review): counter is reset to 1 on every iteration and is not used
    # in the visible part of this loop - confirm against the rest of the script.
    counter = 1
    # If maintype is multipart, the part is a container (holding the body,
    # attachments, etc.), so skip it
    if part.get_content_maintype() == 'multipart':
        continue
    # If maintype is text, the part is the message body
    elif part.get_content_maintype() == 'text':
        print(part.get_content())
    # Handle attachments
    else:
        # Get the attachment's file name
        filename = part.get_filename()
        # If there is no file name, the program has to generate one
        if not filename:
            # Guess the extension from the attachment's content type
            ext = mimetypes.guess_extension(part.get_content_type())

コード例 #32

0

ファイルを表示

class EmailReader:
    """Creates an object for email parsing"""
    def __init__(self):
        self.emailPath = ""
        self.subjectField = ""
        self.fromField = ""
        self.toField = ""
        self.htmlBody = ""
        self.textBody = ""
        self.replyTo = ""
        self.returnPath = ""

    def readEmail(self, emailPath):
        """Reads an email for parsing.

        The parsed message is stored in ``self.msg``; every getter below
        requires this method to have been called first.
        """
        # The context manager closes the file even when parsing fails.
        with open(emailPath, "rb") as f:
            self.msg = BytesParser(policy=policy.default).parse(f)

    def getFrom(self, mode="address"):
        """Gets the from field.
        :param mode: what type of way in getting the from field
        address -> Returns only the address
        name -> Returns only the name
        full -> Returns both the name and address

        For "address" and "name" an empty string is returned when the
        header is missing or contains no "<".  Any other mode raises
        ``Exception``.
        """
        fromField = self.msg["From"]
        if mode == "full":
            return fromField
        elif mode == "address":
            if fromField and "<" in fromField:
                # Text after the last "<", minus the final character
                # (expected to be the closing ">").
                return fromField.split("<")[-1][:-1]
            return ""
        elif mode == "name":
            if fromField and "<" in fromField:
                return fromField.split("<")[0].strip()
            return ""
        else:
            raise Exception(
                "Parameter is undefined!\nAvailable options are only: \"address\", \"name\", and \"full\""
            )

    def getSubject(self):
        """Gets the subject field"""
        return self.msg["Subject"]

    def getReplyTo(self):
        """Gets the Reply-To field"""
        return self.msg["Reply-To"]

    def getReturnPath(self):
        """Gets the Return-Path field"""
        return self.msg["Return-Path"]

    def getHeader(self, header=""):
        """Gets any header; returns "" for an empty name or on any error."""
        if header == "":
            return ""
        else:
            try:
                return self.msg[header]
            except Exception:
                # Best effort: callers get "" instead of an exception.
                return ""

    @staticmethod
    def _decode_part(part):
        """Return the text of *part*, decoded with its declared charset.

        Falls back to ISO-8859-1 when no charset is declared or Python has
        no codec for it; undecodable bytes are replaced, never raised.
        """
        raw = part.get_payload(decode=True)
        if raw is None:
            return ""
        charset = part.get_content_charset() or "ISO-8859-1"
        try:
            return raw.decode(charset, "replace")
        except LookupError:
            return raw.decode("ISO-8859-1")

    def getBody(self, mode="all"):
        """Gets the body.
        :param mode: what type of way in getting the email's body.
        all -> Returns both html and text
        html -> Returns only the html
        text -> Returns only the text

        When several parts of the same kind exist, the last one wins.  The
        decoded bodies are also stored in ``self.htmlBody`` and
        ``self.textBody``.
        """
        htmlBody = ""
        textBody = ""
        # walk() also yields the message itself, so single-part messages
        # are handled by the same loop as multipart ones.
        for part in self.msg.walk():
            if part.get_content_disposition() == "attachment":
                continue
            content_type = part.get_content_type()
            if content_type == "text/html":
                self.htmlBody = self._decode_part(part)
                htmlBody = self.htmlBody
            elif content_type == "text/plain":
                self.textBody = self._decode_part(part)
                textBody = self.textBody

        if mode == "all":
            return htmlBody, textBody
        elif mode == "html":
            return htmlBody
        elif mode == "text":
            return textBody