Python _charset_decoderの例、lostphotosfound.utils._charset_decoder Pythonの例

コード例 #1

0

ファイルを表示

ファイル: server.py プロジェクト: caio1982/Lost-Photos-Found

    def lostphotosfound(self):
        """Fetch the filtered mails and save every image attachment found.

        For each message returned by ``self._filter_messages()`` the
        ``X-GM-MSGID`` item is fetched first.  When ``self._use_index`` is
        set and that id is already in ``self._index`` the message is
        skipped.  Otherwise the full ``RFC822`` body is fetched and parsed,
        and every ``image/*`` part of a multipart mail is handed to
        ``self._save_part()``.  ``self._cleanup()`` runs once at the end.

        Raises:
            Exception: if the message id could not be fetched from the server.
        """
        messages = self._filter_messages()

        for msg in messages:
            try:
                idfetched = self._server.fetch([msg], ['X-GM-MSGID'])
            except Exception:
                # not a bare except: Ctrl-C / SystemExit must propagate
                # instead of being reported as a server failure
                raise Exception('Could not fetch the message ID, server did not respond')

            # a single message was requested, so take the first (only) entry
            msgid = str(list(idfetched.values())[0]['X-GM-MSGID'])

            # mail has been processed in the past, skip it
            if self._use_index and msgid in self._index:
                print('Skipping X-GM-MSGID %s' % (msgid))
                continue

            # if it hasn't, fetch it and iterate through its parts
            msgdata = self._server.fetch([msg], ['RFC822'])

            for data in msgdata:
                raw = msgdata[data]['RFC822']
                try:
                    mail = message_from_string(raw.encode('utf-8'))
                except UnicodeDecodeError:
                    # fall back to parsing the data exactly as received
                    print("Warning: can't encode message data to UTF-8")
                    mail = message_from_string(raw)

                # NOTE(review): non-multipart mails are skipped before the
                # index update below, so they are fetched again on every run
                if mail.get_content_maintype() != 'multipart':
                    continue

                # logging
                header_from = _charset_decoder(mail['From'])
                header_subject = _charset_decoder(mail['Subject'])
                print('[%s]: %s' % (header_from, header_subject))

                # use raw header, header_from sometimes excludes the email address
                sender = email.utils.parseaddr(mail['From'])[1] or 'unknown_sender'

                for part in mail.walk():
                    # multipart containers are only wrappers around other parts
                    if part.get_content_maintype() == 'multipart':
                        continue
                    # parts without a Content-Disposition header are kept on
                    # purpose: the original check for it never skipped them
                    # (per its comment, such parts may be inline attachments)
                    # if non-graphic data
                    if 'image/' not in part.get_content_type():
                        continue

                    # only then we can save this mail part
                    self._save_part(part, mail, sender)

                # all parts of mail processed, add it to the index
                self._index[msgid] = msgid

        self._cleanup()

コード例 #2

0

ファイルを表示

ファイル: server.py プロジェクト: thomasi/Lost-Photos-Found

    def lostphotosfound(self):
        """Fetch the filtered mails and save every image attachment found.

        For each message returned by ``self._filter_messages()`` the
        ``X-GM-MSGID`` item is fetched first; messages whose id is already
        in ``self._index`` are skipped.  Otherwise the full ``RFC822`` body
        is fetched and parsed, and every ``image/*`` part of a multipart
        mail is handed to ``self._save_part()``.  ``self._cleanup()`` runs
        once at the end.

        Raises:
            Exception: if the message id could not be fetched from the server.
        """
        messages = self._filter_messages()

        for msg in messages:
            try:
                idfetched = self._server.fetch([msg], ['X-GM-MSGID'])
            except Exception:
                # not a bare except: Ctrl-C / SystemExit must propagate
                # instead of being reported as a server failure
                raise Exception(
                    'Could not fetch the message ID, server did not respond')

            # a single message was requested, so take the first (only) entry
            msgid = str(list(idfetched.values())[0]['X-GM-MSGID'])

            # mail has been processed in the past, skip it
            if msgid in self._index:
                print('Skipping X-GM-MSGID %s' % (msgid))
                continue

            # if it hasn't, fetch it and iterate through its parts
            msgdata = self._server.fetch([msg], ['RFC822'])

            for data in msgdata:
                raw = msgdata[data]['RFC822']
                try:
                    mail = message_from_string(raw.encode('utf-8'))
                except UnicodeDecodeError:
                    # a byte string with non-ASCII data cannot be re-encoded
                    # under Python 2; parse it as received instead of
                    # aborting the whole run
                    print("Warning: can't encode message data to UTF-8")
                    mail = message_from_string(raw)

                # NOTE(review): non-multipart mails are skipped before the
                # index update below, so they are fetched again on every run
                if mail.get_content_maintype() != 'multipart':
                    continue

                # logging
                header_from = _charset_decoder(mail['From'])
                header_subject = _charset_decoder(mail['Subject'])
                print('[%s]: %s' % (header_from, header_subject))

                for part in mail.walk():
                    # multipart containers are only wrappers around other parts
                    if part.get_content_maintype() == 'multipart':
                        continue
                    # parts without a Content-Disposition header are kept on
                    # purpose: the original check for it never skipped them
                    # (per its comment, such parts may be inline attachments)
                    # if non-graphic data
                    if 'image/' not in part.get_content_type():
                        continue

                    # only then we can save this mail part
                    self._save_part(part, mail)

                # all parts of mail processed, add it to the index
                self._index[msgid] = msgid

        self._cleanup()

コード例 #3

0

ファイルを表示

ファイル: server.py プロジェクト: B-Rich/Lost-Photos-Found

    def _save_part(self, part, mail):
        """
        Internal function to decode an attachment filename and save the part

        The file goes to ~/LostPhotosFound/<self._username>/ and is named
        <date>_<filename>, the date being taken from the mail's Date header.
        Nothing is written when a file with that name already exists or when
        the payload's SHA-1 is already recorded in self._hashes.

        @param part: the part object after a mail.walk() to get multiple attachments
        @param mail: the mail object from message_from_string so it can check its date
        @raise Exception: if the payload cannot be decoded or written to disk
        """

        if not hasattr(self, "seq"):
            self.seq = 0

        # we check if None in filename instead of just if it is None
        # due to the type of data the decoder returns to us
        # NOTE(review): a real filename containing 'None' also takes the
        # Content-Type fallback below -- confirm what _charset_decoder
        # returns for a missing filename before tightening this check
        header_filename = _charset_decoder(part.get_filename())

        # i.e. some inline attachments have no filename field in the header
        # so we have to hack around it and get the name field
        if 'None' in header_filename:
            header_filename = part.get('Content-Type').split('name=')[-1].replace('"', '')
        elif not header_filename:
            # header_filename is a string here, so test it for emptiness
            # directly; indexing it ([0][0]) raised IndexError on an empty
            # name and made this fallback unreachable
            header_filename = 'attachment-%06d.data' % (self.seq)
            self.seq += 1

        # sanitize it (the backslash is escaped explicitly, same characters)
        punct = '!"#$&\'*+/;<>?[\\]^`{|}~'
        header_filename = ''.join(ch for ch in header_filename if ch not in punct)

        # Y-M-D_H-M-S taken from the Date header; the fields are not
        # zero-padded (e.g. 2012-1-5_9-5-2)
        raw_date = mail['date']
        parsed_date = parsedate(raw_date) if raw_date else None
        if parsed_date is None:
            # missing or unparseable Date header: keep the attachment
            # instead of crashing on the None result
            header_date = 'unknown-date_'
        else:
            header_date = '%s-%s-%s_%s-%s-%s_' % tuple(parsed_date[:6])
        filename = header_date + header_filename

        # files are stored per account under ~/LostPhotosFound
        userdir = os.path.expanduser('~/LostPhotosFound')
        savepath = os.path.join(userdir, self._username)
        if not os.path.isdir(savepath):
            os.makedirs(savepath)

        # logging complement
        print('\t...%s' % (filename))

        saved = os.path.join(savepath, filename)
        if os.path.isfile(saved):
            return

        # decode and hash BEFORE touching the disk, so neither a failed
        # decode nor a duplicate leaves an empty file behind
        try:
            payload = part.get_payload(decode=True)
        except Exception:
            message = 'Failed when downloading attachment: %s' % (saved)
            raise Exception(message)

        payload_hash = hashlib.sha1(payload).hexdigest()
        # gmail loves to duplicate attachments in replies
        if payload_hash in self._hashes:
            print('Duplicated attachment %s (%s)' % (saved, payload_hash))
            return

        with open(saved, 'wb') as imagefile:
            try:
                imagefile.write(payload)
            except Exception:
                message = 'Failed writing attachment to file: %s' % (saved)
                raise Exception(message)
        self._hashes[payload_hash] = payload_hash

コード例 #4

0

ファイルを表示

ファイル: server.py プロジェクト: thomasi/Lost-Photos-Found

    def _save_part(self, part, mail):
        """
        Internal function to decode an attachment filename and save the part

        The file goes to ~/LostPhotosFound/<self._username>/ and is named
        <date>_<filename>, the date being taken from the mail's Date header.
        Nothing is written when a file with that name already exists or when
        the payload's SHA-1 is already recorded in self._hashes.

        @param part: the part object after a mail.walk() to get multiple attachments
        @param mail: the mail object from message_from_string so it can check its date
        @raise Exception: if the payload cannot be decoded or written to disk
        """

        if not hasattr(self, "seq"):
            self.seq = 0

        # we check if None in filename instead of just if it is None
        # due to the type of data the decoder returns to us
        # NOTE(review): a real filename containing 'None' also takes the
        # Content-Type fallback below -- confirm what _charset_decoder
        # returns for a missing filename before tightening this check
        header_filename = _charset_decoder(part.get_filename())

        # i.e. some inline attachments have no filename field in the header
        # so we have to hack around it and get the name field
        if 'None' in header_filename:
            header_filename = part.get('Content-Type').split(
                'name=')[-1].replace('"', '')
        elif not header_filename:
            # header_filename is a string here, so test it for emptiness
            # directly; indexing it ([0][0]) raised IndexError on an empty
            # name and made this fallback unreachable
            header_filename = 'attachment-%06d.data' % (self.seq)
            self.seq += 1

        # sanitize it (the backslash is escaped explicitly, same characters)
        punct = '!"#$&\'*+/;<>?[\\]^`{|}~'
        header_filename = ''.join(
            ch for ch in header_filename if ch not in punct)

        # Y-M-D_H-M-S taken from the Date header; the fields are not
        # zero-padded (e.g. 2012-1-5_9-5-2)
        raw_date = mail['date']
        parsed_date = parsedate(raw_date) if raw_date else None
        if parsed_date is None:
            # missing or unparseable Date header: keep the attachment
            # instead of crashing on the None result
            header_date = 'unknown-date_'
        else:
            header_date = '%s-%s-%s_%s-%s-%s_' % tuple(parsed_date[:6])
        filename = header_date + header_filename

        # files are stored per account under ~/LostPhotosFound
        userdir = os.path.expanduser('~/LostPhotosFound')
        savepath = os.path.join(userdir, self._username)
        if not os.path.isdir(savepath):
            os.makedirs(savepath)

        # logging complement
        print('\t...%s' % (filename))

        saved = os.path.join(savepath, filename)
        if os.path.isfile(saved):
            return

        # decode and hash BEFORE touching the disk, so neither a failed
        # decode nor a duplicate leaves an empty file behind
        try:
            payload = part.get_payload(decode=True)
        except Exception:
            message = 'Failed when downloading attachment: %s' % (
                saved)
            raise Exception(message)

        payload_hash = hashlib.sha1(payload).hexdigest()
        # gmail loves to duplicate attachments in replies
        if payload_hash in self._hashes:
            print('Duplicated attachment %s (%s)' % (saved,
                                                     payload_hash))
            return

        with open(saved, 'wb') as imagefile:
            try:
                imagefile.write(payload)
            except Exception:
                message = 'Failed writing attachment to file: %s' % (
                    saved)
                raise Exception(message)
        self._hashes[payload_hash] = payload_hash