def check_mailbox(account):
    """Poll one IMAP mailbox forever and queue every unseen letter.

    Opens an SSL IMAP connection to ``account['imap_hostname']``, logs in
    with ``account['address']`` / ``account['password']`` and calls
    ``select()`` with no arguments. Then, once every ``sleep_seconds``
    seconds, it searches for unseen messages; each unseen UID is passed to
    ``put_to_queue('LETTERS', (mail_uid, account))`` and then flagged as
    seen on the server.

    Any exception is logged as a warning, the connection is closed and,
    after a pause of ``sleep_seconds`` seconds, a fresh connection is
    attempted. The function never returns normally.

    Args:
        account: Mapping with the keys ``'address'``, ``'password'`` and
            ``'imap_hostname'``.
    """
    address = account['address']
    password = account['password']
    imap_hostname = account['imap_hostname']

    while True:
        # Reset on every attempt so the cleanup below never touches a
        # connection object left over from a previous iteration.
        mb = None
        try:
            logger.debug(
                'Trying to connect to {} via IMAP'.format(imap_hostname)
            )
            mb = imaplib.IMAP4_SSL(imap_hostname)
            mb.login(address, password)
            logger.debug(
                'Logged into IMAP server {hostname} as {address}'.format(
                    hostname=imap_hostname, address=address
                )
            )
            mb.select()

            while True:
                _result, data = mb.uid('search', None, '(Unseen)')
                if data[0]:
                    logger.debug(
                        'We have some unseen letters in {} mailbox'.format(address)
                    )
                    for mail_uid in data[0].split():
                        # Queue first, mark as seen second: if queuing fails
                        # the letter stays unseen and is retried later.
                        put_to_queue('LETTERS', (mail_uid, account))
                        mb.uid('store', mail_uid, '+FLAGS', '\\Seen')
                        logger.debug(
                            'Latter {uid} ({address}) have been putted into the queue and marked as seen'.format(
                                uid=mail_uid, address=address
                            )
                        )
                time.sleep(sleep_seconds)
        except Exception as e:
            logger.warning(
                "Exception: {}\nType of error: {}\n".format(e, e.__class__)
            )
        finally:
            # Release the socket before reconnecting; otherwise every failed
            # cycle would leak one connection.
            if mb is not None:
                try:
                    mb.logout()
                except Exception as e:
                    # The connection is most likely already broken; there is
                    # nothing more to release.
                    logger.debug(
                        'Could not log out from {hostname} cleanly: {error}'.format(
                            hostname=imap_hostname, error=e
                        )
                    )

        # Back off before reconnecting so that a permanent failure (server
        # down, wrong password) does not turn into a busy loop.
        time.sleep(sleep_seconds)
def process(in_item):
    """Convert one queued input file and publish the result to the OUT queue.

    If ``in_item`` has no ``'output_format'`` key, it is filled in from
    ``format_to_format`` using the extension of ``in_item['input_file']``
    (``None`` when the extension is not listed). Note that this mutates
    ``in_item``.

    When a processor is registered for the output format and the input file
    exists, ``processors[output_format](in_item)`` produces the output item.
    Otherwise the output item is a copy of ``in_item`` with
    ``'processed_successfully'`` set to ``False``.

    In both cases ``'process_time'`` is stamped on the output item, the item
    is passed to ``put_to_queue('OUT', ...)`` and the input file is removed
    if it is still present.

    Args:
        in_item: Mapping describing the work; must contain ``'input_file'``
            and may contain ``'output_format'``.
    """
    input_file = in_item['input_file']

    if 'output_format' not in in_item:
        input_ext = os.path.splitext(input_file)[1]
        if input_ext in format_to_format:
            in_item['output_format'] = format_to_format[input_ext]
        else:
            in_item['output_format'] = None

    if in_item['output_format'] in processors and os.path.exists(in_item['input_file']):
        logger.debug(
            'Start processing {input} to {output_format}'.format(
                input=input_file, output_format=in_item['output_format']
            )
        )
        out_item = processors[in_item['output_format']](in_item)
        logger.debug(
            '{input} converted to {output}'.format(
                input=input_file, output=out_item['output_file']
            )
        )
    else:
        out_item = in_item.copy()
        out_item['processed_successfully'] = False
        logger.warning(
            "{input} cannot be converted: there isn't processor for {output_format} or file doesn't exist".format(
                input=input_file, output_format=in_item['output_format']
            )
        )

    out_item['process_time'] = time.time()
    put_to_queue('OUT', out_item)

    # The failure branch above is also taken when the input file is missing,
    # so removal must tolerate a file that is not there instead of raising
    # after the item has already been queued.
    try:
        os.remove(in_item['input_file'])
    except FileNotFoundError:
        logger.debug(
            'Work was putted to "OUT" queue; {input} did not exist, so nothing was deleted'.format(
                input=in_item['input_file']
            )
        )
    else:
        logger.debug(
            'Work was putted to "OUT" queue and {input} was deleted'.format(
                input=in_item['input_file']
            )
        )
# Seconds to wait for a linked server to connect / send data before giving up.
_URL_DOWNLOAD_TIMEOUT = 30


def _decode_text_part(part):
    """Return the text of a non-multipart message part as ``str``.

    The transfer encoding (base64, quoted-printable) is undone and the bytes
    are decoded with the part's declared charset, falling back to UTF-8.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except LookupError:
        # The letter declares a charset Python does not know.
        return payload.decode("utf-8", errors="replace")


def _download_url(url, target_dir):
    """Download ``url`` into ``target_dir`` and return the local file path.

    The file name comes from the ``filename`` parameter of the
    Content-Disposition response header when present, otherwise from the
    last path segment of the URL, otherwise ``index.html``. Names without an
    extension get ``.html`` appended. Raises ``requests.RequestException``
    when the request or the streamed download fails.
    """
    response = requests.get(url, stream=True, timeout=_URL_DOWNLOAD_TIMEOUT)
    try:
        filename = ""
        # The header usually looks like: attachment; filename="report.pdf"
        # so the parameter must be searched for anywhere in the value.
        disposition = response.headers.get("content-disposition", "")
        match = re.search(r'filename="?([^";]+)"?', disposition)
        if match is not None:
            # os.path.basename is for security reason
            filename = os.path.basename(match.group(1).strip())
        if not filename:
            # Get filename from url
            unquoted_url = urllib.parse.unquote_plus(url)
            filename = urllib.parse.urlparse(unquoted_url).path.split("/")[-1]
        logger.debug("filename = {}".format(filename))

        if filename == "":
            filename = "index.html"
        # splitext returns a (root, ext) tuple; only the extension tells
        # whether a default one has to be appended.
        if not os.path.splitext(filename)[1]:
            filename += ".html"

        local_file = os.path.join(target_dir, filename)
        try:
            with open(local_file, "wb") as fh:
                for chunk in response.iter_content(4096):  # 4096 is an arbitrary chunk size
                    fh.write(chunk)
        except requests.RequestException:
            # Do not leave a truncated download behind.
            if os.path.exists(local_file):
                os.remove(local_file)
            raise
        return local_file
    finally:
        # With stream=True the connection stays open until it is closed.
        response.close()


def process_letter(mail_uid, account):
    """Fetch one letter and queue its attachments and linked files.

    The letter is obtained with ``download_letter(mail_uid, account)`` and
    parsed. A generic item is built from ``get_user_by_email(sender)`` plus
    the ``collector``, ``email_To``, ``email_From``, ``email_Subject`` and
    ``email_Message-ID`` fields, and ``output_format`` when it can be derived
    from ``account['address']``. Then:

    * every part that has a file name is written into the directory returned
      by ``in_user_local_path(item)`` and passed to ``put_to_queue('IN', ...)``;
    * every http(s) URL found in the first text part is downloaded into the
      same directory and queued the same way. A URL that cannot be
      downloaded is logged and skipped.

    Args:
        mail_uid: UID of the letter, passed through to ``download_letter``.
        account: Mapping describing the mailbox; ``account['address']`` is
            read here.

    Raises:
        ValueError: If no sender address can be extracted from the ``From``
            header.
    """
    # Local import keeps this block self-contained.
    from email.utils import parseaddr

    raw_mail = download_letter(mail_uid, account)
    message = email.message_from_bytes(raw_mail)

    # parseaddr splits 'Name <user@host>' properly; a greedy regex would keep
    # the display name and the closing '>' in the address.
    address = parseaddr(str(message["From"] or ""))[1]
    if "@" not in address:
        raise ValueError(
            "Cannot extract sender address from From header: {!r}".format(message["From"])
        )

    user = get_user_by_email(address)
    item_generic = user.copy()  # Base for item object that will be added to IN queue
    item_generic["collector"] = "imap"
    item_generic["email_To"] = account["address"]
    item_generic["email_From"] = address
    item_generic["email_Subject"] = decode_simple_header(message["Subject"]) if "Subject" in message else ""
    item_generic["email_Message-ID"] = message["Message-ID"]

    # NOTE(review): with this pattern the lazy `\w*?` and optional `2?` match
    # nothing, so group(2) is the whole local part (minus a leading "test_").
    # If "x2fmt@..." was meant to yield "fmt", the pattern needs changing -
    # confirm the intended mailbox naming scheme.
    format_match = re.search(r"^(test_)?\w*?2?(\w+)@", account["address"])
    if format_match is not None:
        item_generic["output_format"] = "." + format_match.group(2)

    user_local_path = in_user_local_path(item_generic)

    logger.debug(
        "Look for attachments in letter {uid} from {address} to {acc_address}".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )
    body = ""
    # Helps to find the main text body of the message: the first part with
    # the text maintype is assumed to be the necessary one.
    skip_text_maintype = False
    for part in message.walk():
        if not skip_text_maintype and part.get_content_maintype() == "text":
            body += _decode_text_part(part)
            skip_text_maintype = True
            continue
        if part.get_content_maintype() == "multipart" or not part.get_filename():
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing decodable to save (e.g. the part is itself a container).
            continue

        filename = os.path.basename(
            decode_simple_header(part.get_filename())
        )  # os.path.basename is for security reason
        local_file = os.path.join(user_local_path, filename)
        with open(local_file, "wb") as fh:
            fh.write(payload)

        item = item_generic.copy()
        item["collect_time"] = time.time()
        item["input_file"] = local_file
        put_to_queue("IN", item)
        logger.debug(
            'File {input_file} saved from email attachment and putted to the "IN" queue (letter {uid} from {address} to {acc_address}'.format(
                input_file=item["input_file"], uid=mail_uid, address=address, acc_address=account["address"]
            )
        )

    logger.debug(
        "Look for urls in letter {uid} from {address} to {acc_address}".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )
    # Strip markup when the body is well-formed XML/HTML so that URLs inside
    # tag attributes (namespaces, images, ...) are not treated as links.
    # NOTE(review): the body is untrusted input parsed by xml.etree.
    try:
        body_in_xmltag = "<nothing>" + body + "</nothing>"
        body_without_xmltags = " ".join(ElementTree.fromstring(body_in_xmltag).itertext())
    except ElementTree.ParseError:
        body_without_xmltags = body

    # Stop at any whitespace (not only spaces) and at characters that cannot
    # occur unescaped in a URL, so that "http://a\nnext line" or "<http://a>"
    # do not drag trailing text into the address.
    urls = set(re.findall(r'https?://[^\s<>"]+', body_without_xmltags))
    for url in urls:
        # SECURITY NOTE: the URL comes from an untrusted letter, so this
        # request can be pointed at internal hosts; restrict if that matters.
        try:
            local_file = _download_url(url, user_local_path)
        except requests.RequestException as e:
            # One dead link must not abort the remaining URLs of the letter.
            logger.warning(
                "Cannot download {url} from letter {uid} (from {address} to {acc_address}): {error}".format(
                    url=url, uid=mail_uid, address=address, acc_address=account["address"], error=e
                )
            )
            continue

        item = item_generic.copy()
        item["collect_time"] = time.time()
        item["input_file"] = local_file
        put_to_queue("IN", item)
        logger.debug(
            'File {input_file} downloaded through URL from letter {uid} (from {address} to {acc_address}) and putted to the "IN" queue'.format(
                input_file=item["input_file"], uid=mail_uid, address=address, acc_address=account["address"]
            )
        )

    logger.debug(
        "Processing of letter {uid} is completed (from {address} to {acc_address})".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )