def lostphotosfound(self):
    """Fetch the matching mails and save every image part they contain.

    For each message id returned by ``self._filter_messages()``:

    1. the Gmail message id (``X-GM-MSGID``) is fetched; when
       ``self._use_index`` is truthy and that id is already a key of
       ``self._index`` the message is skipped;
    2. otherwise the full ``RFC822`` body is fetched and parsed, and every
       non-multipart part whose content type contains ``image/`` is passed
       to ``self._save_part`` together with the parsed mail and the sender
       address;
    3. the message id is recorded in ``self._index``.

    ``self._cleanup()`` is called once after the last message.

    Raises:
        Exception: if the server call that fetches the message id fails.
    """
    messages = self._filter_messages()

    for msg in messages:
        try:
            idfetched = self._server.fetch([msg], ['X-GM-MSGID'])
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit are not rewritten into a generic failure.
            raise Exception(
                'Could not fetch the message ID, server did not respond')

        # The response is keyed by message id; only one id was requested.
        # ``list(mapping)[0]`` behaves the same on Python 2 and 3.
        first_key = list(idfetched)[0]
        msgid = str(idfetched[first_key]['X-GM-MSGID'])

        # mail has been processed in the past, skip it
        # (direct membership test avoids building a key list per message)
        if self._use_index and msgid in self._index:
            print('Skipping X-GM-MSDID %s' % (msgid))
            continue

        # if it hasn't, fetch it and iterate through its parts
        msgdata = self._server.fetch([msg], ['RFC822'])

        for data in msgdata:
            raw = msgdata[data]['RFC822']
            try:
                mail = message_from_string(raw.encode('utf-8'))
            except UnicodeDecodeError:
                # Fall back to parsing the data exactly as it was received.
                print("Warning: can't encode message data to UTF-8")
                mail = message_from_string(raw)

            # only multipart mails can carry attachments
            if mail.get_content_maintype() != 'multipart':
                continue

            # logging
            header_from = _charset_decoder(mail['From'])
            header_subject = _charset_decoder(mail['Subject'])
            print('[%s]: %s' % (header_from, header_subject))

            # use raw header, header_from sometimes excludes the email address
            # (``or ''`` keeps parseaddr from receiving None when the header
            # is absent; an empty result falls through to the placeholder)
            sender = email.utils.parseaddr(mail['From'] or '')[1]
            if not sender:
                sender = 'unknown_sender'

            for part in mail.walk():
                # container parts hold no payload of their own
                if part.get_content_maintype() == 'multipart':
                    continue
                # Parts without a Content-Disposition header are not
                # filtered out, so inline images are saved as well.
                # if non-graphic inline data
                if 'image/' not in part.get_content_type():
                    continue

                # only then we can save this mail part
                self._save_part(part, mail, sender)

            # all parts of mail processed, add it to the index
            # NOTE(review): placed inside the per-mail loop, so mails skipped
            # as non-multipart are not indexed -- confirm this is intended.
            self._index[msgid] = msgid

    self._cleanup()
def lostphotosfound(self):
    """Fetch the matching mails and save every image part they contain.

    For each message id returned by ``self._filter_messages()`` the Gmail
    message id (``X-GM-MSGID``) is fetched first; messages whose id is
    already a key of ``self._index`` are skipped.  Otherwise the full
    ``RFC822`` body is fetched and parsed, every non-multipart part whose
    content type contains ``image/`` is passed to ``self._save_part``, and
    the message id is recorded in ``self._index``.

    ``self._cleanup()`` is called once after the last message.

    Raises:
        Exception: if the server call that fetches the message id fails.
    """
    messages = self._filter_messages()

    for msg in messages:
        try:
            idfetched = self._server.fetch([msg], ['X-GM-MSGID'])
        except Exception:
            # Deliberately not a bare ``except:`` so that KeyboardInterrupt
            # and SystemExit are not rewritten into a generic failure.
            raise Exception(
                'Could not fetch the message ID, server did not respond')

        # The response is keyed by message id; only one id was requested.
        # ``list(mapping)[0]`` behaves the same on Python 2 and 3.
        first_key = list(idfetched)[0]
        msgid = str(idfetched[first_key]['X-GM-MSGID'])

        # mail has been processed in the past, skip it
        # (direct membership test avoids building a key list per message)
        if msgid in self._index:
            print('Skipping X-GM-MSDID %s' % (msgid))
            continue

        # if it hasn't, fetch it and iterate through its parts
        msgdata = self._server.fetch([msg], ['RFC822'])

        for data in msgdata:
            raw = msgdata[data]['RFC822']
            try:
                mail = message_from_string(raw.encode('utf-8'))
            except UnicodeDecodeError:
                # Non-ASCII data cannot always be re-encoded; parse it
                # exactly as received instead of aborting the whole run.
                print("Warning: can't encode message data to UTF-8")
                mail = message_from_string(raw)

            # only multipart mails can carry attachments
            if mail.get_content_maintype() != 'multipart':
                continue

            # logging
            header_from = _charset_decoder(mail['From'])
            header_subject = _charset_decoder(mail['Subject'])
            print('[%s]: %s' % (header_from, header_subject))

            for part in mail.walk():
                # container parts hold no payload of their own
                if part.get_content_maintype() == 'multipart':
                    continue
                # Parts without a Content-Disposition header are not
                # filtered out, so inline images are saved as well.
                # if non-graphic inline data
                if 'image/' not in part.get_content_type():
                    continue

                # only then we can save this mail part
                self._save_part(part, mail)

            # all parts of mail processed, add it to the index
            # NOTE(review): placed inside the per-mail loop, so mails skipped
            # as non-multipart are not indexed -- confirm this is intended.
            self._index[msgid] = msgid

    self._cleanup()
def _save_part(self, part, mail):
    """
    Internal function to decode an attachment filename and save the
    attachment under ~/LostPhotosFound/<self._username>/.

    The file is named '<Y-M-D_H-M-S>_<sanitized filename>' using the mail's
    Date header.  Nothing is written when a file with that name already
    exists or when the SHA-1 of the payload is already in self._hashes.

    @param part: the part object after a mail.walk() to get multiple attachments
    @param mail: the mail object from message_from_string so it can checks its date
    @raise Exception: when the payload cannot be decoded or written
    """
    # counter used to name attachments that carry no usable filename
    if not hasattr(self, "seq"):
        self.seq = 0

    # we check if None in filename instead of just if it is None
    # due to the type of data decode_header returns to us
    header_filename = _charset_decoder(part.get_filename())

    # i.e. some inline attachments have no filename field in the header
    # so we have to hack around it and get the name field
    if 'None' in header_filename:
        content_type = part.get('Content-Type')
        header_filename = content_type.split('name=')[-1].replace('"', '')
    elif not header_filename or not header_filename[0][0]:
        # we should hopefully never reach this, attachments would be 'noname' in gmail
        # (the emptiness check comes first so an empty name gets the
        # generated fallback instead of failing on the index lookup)
        header_filename = 'attachment-%06d.data' % (self.seq)
        self.seq += 1

    # sanitize it: drop characters that are unsafe in file names, including
    # both path separators.  A join-based filter works for byte and unicode
    # strings alike, unlike the two-argument str.translate.
    punct = '!"#$&\'*+/;<>?[\\]^`{|}~'
    header_filename = ''.join(
        char for char in header_filename if char not in punct)

    # 2012-10-28_19-15-22 (Y-M-D_H-M-S)
    raw_date = mail['date']
    header_date = parsedate(raw_date) if raw_date else None
    if header_date is None:
        # missing or unparsable Date header: keep the name layout with zeros
        header_date = (0, 0, 0, 0, 0, 0)
    header_date = '%s-%s-%s_%s-%s-%s_' % tuple(header_date[:6])
    filename = header_date + header_filename

    # we should create it in the documents folder
    username = self._username
    userdir = os.path.expanduser('~/LostPhotosFound')
    savepath = os.path.join(userdir, username)
    if not os.path.isdir(savepath):
        os.makedirs(savepath)

    # logging complement
    print('\t...%s' % (filename))

    saved = os.path.join(savepath, filename)
    if os.path.isfile(saved):
        # already saved on a previous run
        return

    # Decode and hash the payload BEFORE touching the file system so that a
    # decoding failure or a duplicate never leaves an empty file behind.
    try:
        payload = part.get_payload(decode=True)
    except Exception:
        message = 'Failed when downloading attachment: %s' % (saved)
        raise Exception(message)

    payload_hash = hashlib.sha1(payload).hexdigest()

    # gmail loves to duplicate attachments in replies
    if payload_hash in self._hashes:
        print('Duplicated attachment %s (%s)' % (saved, payload_hash))
        return

    with open(saved, 'wb') as imagefile:
        try:
            imagefile.write(payload)
        except Exception:
            message = 'Failed writing attachment to file: %s' % (saved)
            raise Exception(message)

    # remember the content so later copies of the same image are skipped
    self._hashes[payload_hash] = payload_hash
def _save_part(self, part, mail):
    """
    Internal function to decode an attachment filename and save the
    attachment under ~/LostPhotosFound/<self._username>/.

    The file is named '<Y-M-D_H-M-S>_<sanitized filename>' using the mail's
    Date header.  Nothing is written when a file with that name already
    exists or when the SHA-1 of the payload is already in self._hashes.

    @param part: the part object after a mail.walk() to get multiple attachments
    @param mail: the mail object from message_from_string so it can checks its date
    @raise Exception: when the payload cannot be decoded or written
    """
    # counter used to name attachments that carry no usable filename
    if not hasattr(self, "seq"):
        self.seq = 0

    # we check if None in filename instead of just if it is None
    # due to the type of data decode_header returns to us
    header_filename = _charset_decoder(part.get_filename())

    # i.e. some inline attachments have no filename field in the header
    # so we have to hack around it and get the name field
    if 'None' in header_filename:
        name_field = part.get('Content-Type').split('name=')[-1]
        header_filename = name_field.replace('"', '')
    elif not header_filename or not header_filename[0][0]:
        # we should hopefully never reach this, attachments would be 'noname' in gmail
        # (testing for emptiness first lets an empty name take the
        # generated fallback rather than raising on the index lookup)
        header_filename = 'attachment-%06d.data' % (self.seq)
        self.seq += 1

    # sanitize it: remove characters that are unsafe in file names (both
    # path separators included).  Filtering with join accepts byte and
    # unicode strings, which the two-argument str.translate does not.
    punct = '!"#$&\'*+/;<>?[\\]^`{|}~'
    header_filename = ''.join(
        char for char in header_filename if char not in punct)

    # 2012-10-28_19-15-22 (Y-M-D_H-M-S)
    raw_date = mail['date']
    date_fields = parsedate(raw_date) if raw_date else None
    if date_fields is None:
        # missing or unparsable Date header: keep the name layout with zeros
        date_fields = (0, 0, 0, 0, 0, 0)
    header_date = '%s-%s-%s_%s-%s-%s_' % tuple(date_fields[:6])
    filename = header_date + header_filename

    # we should create it in the documents folder
    username = self._username
    userdir = os.path.expanduser('~/LostPhotosFound')
    savepath = os.path.join(userdir, username)
    if not os.path.isdir(savepath):
        os.makedirs(savepath)

    # logging complement
    print('\t...%s' % (filename))

    saved = os.path.join(savepath, filename)
    if os.path.isfile(saved):
        # a file with this name was written earlier; leave it untouched
        return

    # The payload is decoded and hashed before the file is created, so a
    # failed decode or a duplicate image never leaves an empty file on disk.
    try:
        payload = part.get_payload(decode=True)
    except Exception:
        message = 'Failed when downloading attachment: %s' % (
            saved)
        raise Exception(message)

    payload_hash = hashlib.sha1(payload).hexdigest()

    # gmail loves to duplicate attachments in replies
    if payload_hash in self._hashes:
        print('Duplicated attachment %s (%s)' % (saved, payload_hash))
        return

    with open(saved, 'wb') as imagefile:
        try:
            imagefile.write(payload)
        except Exception:
            message = 'Failed writing attachment to file: %s' % (
                saved)
            raise Exception(message)

    # record the content hash so identical images are skipped from now on
    self._hashes[payload_hash] = payload_hash