Beispiel #1
0
def check_mailbox(account):
    """Poll an IMAP mailbox forever and enqueue every unseen letter.

    Connects to ``account['imap_hostname']`` over SSL, logs in with
    ``account['address']`` / ``account['password']`` and selects the default
    mailbox. Every ``sleep_seconds`` it searches for unseen messages; each
    UID found is put on the ``LETTERS`` queue as ``(mail_uid, account)`` and
    then flagged ``\\Seen`` on the server.

    Any exception is logged as a warning, the connection is released and,
    after a pause of ``sleep_seconds``, a fresh connection is attempted.
    The function never returns normally.
    """
    address = account['address']
    password = account['password']
    imap_hostname = account['imap_hostname']
    while True:
        mb = None  # stays None if the connection itself could not be opened
        try:
            logger.debug('Trying to connect to {} via IMAP'.format(imap_hostname))
            mb = imaplib.IMAP4_SSL(imap_hostname)
            mb.login(address, password)
            logger.debug('Logged into IMAP server {hostname} as {address}'.format(
                hostname=imap_hostname,
                address=address
            ))
            mb.select()
            while True:
                result, data = mb.uid('search', None, '(Unseen)')
                if data[0]:
                    logger.debug('We have some unseen letters in {} mailbox'.format(address))
                    for mail_uid in data[0].split():
                        # Enqueue first, mark as seen second: if enqueueing
                        # fails the letter stays unseen and is retried.
                        put_to_queue('LETTERS', (mail_uid, account))
                        mb.uid('store', mail_uid, '+FLAGS', '\\Seen')
                        logger.debug(
                            'Latter {uid} ({address}) have been putted into the queue and marked as seen'.format(
                                uid=mail_uid,
                                address=address
                            )
                        )
                time.sleep(sleep_seconds)
        except Exception as e:
            logger.warning("Exception: {}\nType of error: {}\n".format(e, e.__class__))
        finally:
            # Release the old connection before a new one is opened; without
            # this every reconnect leaks a socket / server-side session.
            if mb is not None:
                try:
                    mb.logout()
                except Exception as logout_error:
                    # The connection is most likely already broken.
                    logger.debug('Could not log out from {}: {}'.format(imap_hostname, logout_error))
        # Back off before reconnecting so a persistent failure (server down,
        # wrong password) does not turn into a busy loop.
        time.sleep(sleep_seconds)
Beispiel #2
0
def process(in_item):
    """Convert one input item and publish the result on the ``OUT`` queue.

    If ``in_item`` has no ``'output_format'`` key, one is derived from the
    extension of ``in_item['input_file']`` via ``format_to_format`` (``None``
    when the extension is unknown) and stored back into ``in_item``.

    When a processor is registered for the output format and the input file
    exists, the processor is called with ``in_item`` and its return value is
    used as the result. Otherwise the result is a copy of ``in_item`` with
    ``'processed_successfully'`` set to ``False``.

    In both cases ``'process_time'`` is stamped on the result, the result is
    put on the ``OUT`` queue and the input file is removed if it is present.
    """
    input_file = in_item['input_file']
    if 'output_format' not in in_item:
        input_ext = os.path.splitext(input_file)[1]
        if input_ext in format_to_format:
            in_item['output_format'] = format_to_format[input_ext]
        else:
            in_item['output_format'] = None

    output_format = in_item['output_format']
    if output_format in processors and os.path.exists(input_file):
        logger.debug('Start processing {input} to {output_format}'.format(
            input=input_file,
            output_format=output_format
        ))
        out_item = processors[output_format](in_item)
        logger.debug('{input} converted to {output}'.format(
            input=input_file,
            output=out_item['output_file']
        ))
    else:
        out_item = in_item.copy()
        out_item['processed_successfully'] = False
        logger.warning("{input} cannot be converted: there isn't processor for {output_format} or file doesn't exist".format(
            input=input_file,
            output_format=output_format
        ))

    out_item['process_time'] = time.time()

    put_to_queue('OUT', out_item)

    # The failure branch above is reached when the file does not exist, so
    # an unconditional os.remove() would raise right after reporting that.
    try:
        os.remove(input_file)
    except FileNotFoundError:
        logger.debug('Work was putted to "OUT" queue; {input} was already missing, nothing to delete'.format(
            input=input_file
        ))
    else:
        logger.debug('Work was putted to "OUT" queue and {input} was deleted'.format(
            input=input_file
        ))
Beispiel #3
0
# Seconds a stalled HTTP connection may block before a URL is given up on.
_URL_FETCH_TIMEOUT = 30


def _decoded_text_payload(part):
    """Return the content of a non-multipart MIME part as ``str``.

    The content transfer encoding (base64 / quoted-printable) is undone and
    the bytes are decoded with the charset declared on the part, falling back
    to UTF-8 when none is declared or Python does not know it.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:  # charset name unknown to Python
        return raw.decode("utf-8", errors="replace")


def _download_filename(response, url):
    """Choose a local file name for the HTTP ``response`` fetched from ``url``.

    Preference order: ``filename`` parameter of the Content-Disposition
    header, last path segment of the URL, ``index.html``. A name without an
    extension gets ``.html`` appended.
    """
    disposition = response.headers.get("content-disposition", "")
    # Real headers look like: attachment; filename="report.pdf"
    match = re.search(r'filename="?([^";]+)"?', disposition)
    # os.path.basename is for security reason
    filename = os.path.basename(match.group(1).strip()) if match else ""
    if not filename:
        unquoted_url = urllib.parse.unquote_plus(url)
        filename = urllib.parse.urlparse(unquoted_url).path.split("/")[-1]  # Get filename from url
        logger.debug("filename = {}".format(filename))
        if filename == "":
            filename = "index.html"

    # splitext() returns a (root, ext) tuple; only the extension matters here.
    if not os.path.splitext(filename)[1]:
        filename += ".html"
    return filename


def process_letter(mail_uid, account):
    """Download one letter and enqueue its attachments and linked files.

    The letter ``mail_uid`` is fetched for ``account`` with
    ``download_letter`` and parsed. The sender address is taken from the
    ``From`` header and resolved with ``get_user_by_email``; a copy of that
    user record, extended with ``collector`` and ``email_*`` keys, is the
    template for every queued item. ``output_format`` is added to the
    template when the local part of ``account["address"]`` matches the
    expected pattern.

    Every MIME part that has a file name is saved under the directory
    returned by ``in_user_local_path`` and put on the ``IN`` queue. The first
    ``text/*`` part is treated as the message body; every http(s) URL found
    in it is downloaded to the same directory and queued as well. A URL that
    cannot be fetched is logged and skipped.

    Raises:
        AttributeError: if the ``From`` header contains no e-mail address.
        TypeError: if the letter has no ``From`` header at all.
    """
    raw_mail = download_letter(mail_uid, account)
    message = email.message_from_bytes(raw_mail)

    # Match only the addr-spec, so "Name <user@host>" yields "user@host"
    # rather than the whole header. No match -> AttributeError, as before.
    address = re.search(r"<?([^\s<>]+@[^\s<>]+)>?", message["From"]).group(1)
    user = get_user_by_email(address)

    item_generic = user.copy()  # Base for item object that will be added to INqueue
    item_generic["collector"] = "imap"
    item_generic["email_To"] = account["address"]
    item_generic["email_From"] = address
    item_generic["email_Subject"] = decode_simple_header(message["Subject"]) if "Subject" in message else ""
    item_generic["email_Message-ID"] = message["Message-ID"]

    # NOTE(review): because of the lazy \w*? this captures the whole local
    # part (e.g. "pdf2epub", not "epub") unless it starts with "test_" or
    # "2" - confirm that this is what the processors table expects.
    format_match = re.search(r"^(test_)?\w*?2?(\w+)@", account["address"])
    if format_match:
        item_generic["output_format"] = "." + format_match.group(2)

    user_local_path = in_user_local_path(item_generic)

    logger.debug(
        "Look for attachments in letter {uid} from {address} to {acc_address}".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )
    body = ""
    # Help to find main text body of the message: the first part with the
    # text maintype is assumed to be the necessary one.
    body_found = False
    for part in message.walk():
        if not body_found and part.get_content_maintype() == "text":
            body += _decoded_text_payload(part)
            body_found = True
            continue

        if part.get_content_maintype() == "multipart" or not part.get_filename():
            continue

        attachment = part.get_payload(decode=True)
        if attachment is None:  # container part (e.g. an embedded message): nothing to save
            continue

        filename = os.path.basename(
            decode_simple_header(part.get_filename())
        )  # os.path.basename is for security reason
        local_file = os.path.join(user_local_path, filename)
        with open(local_file, "wb") as fh:
            fh.write(attachment)

        item = item_generic.copy()
        item["collect_time"] = time.time()
        item["input_file"] = local_file
        put_to_queue("IN", item)
        logger.debug(
            'File {input_file} saved from email attachment and putted to the "IN" queue (letter {uid} from {address} to {acc_address}'.format(
                input_file=item["input_file"], uid=mail_uid, address=address, acc_address=account["address"]
            )
        )

    logger.debug(
        "Look for urls in letter {uid} from {address} to {acc_address}".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )
    try:
        # NOTE(security): the body is untrusted input and xml.etree is not
        # hardened against entity-expansion attacks; consider defusedxml.
        body_in_xmltag = "<nothing>" + body + "</nothing>"
        body_without_xmltags = " ".join(ElementTree.fromstring(body_in_xmltag).itertext())
    except ElementTree.ParseError:
        body_without_xmltags = body
    # Search the tag-stripped text; a URL ends at whitespace, a quote or an
    # angle bracket so that surrounding markup is not glued onto it.
    urls = set(re.findall(r"https?://[^\s<>\"]+", body_without_xmltags))
    for url in urls:
        # NOTE(security): URLs come from the letter and are fetched as-is;
        # restrict reachable hosts if this runs inside a private network.
        try:
            response = requests.get(url, stream=True, timeout=_URL_FETCH_TIMEOUT)
            try:
                filename = _download_filename(response, url)
                local_file = os.path.join(user_local_path, filename)
                with open(local_file, "wb") as fh:
                    for chunk in response.iter_content(4096):  # 4096 is a random count of bytes
                        fh.write(chunk)
            finally:
                response.close()  # stream=True keeps the connection open otherwise
        except requests.RequestException as error:
            # One unreachable link must not abort the rest of the letter.
            logger.warning(
                "Cannot download {url} from letter {uid}: {error}".format(url=url, uid=mail_uid, error=error)
            )
            continue

        item = item_generic.copy()
        item["collect_time"] = time.time()
        item["input_file"] = local_file
        put_to_queue("IN", item)
        logger.debug(
            'File {input_file} downloaded through URL from letter {uid} (from {address} to {acc_address}) and putted to the "IN" queue'.format(
                input_file=item["input_file"], uid=mail_uid, address=address, acc_address=account["address"]
            )
        )

    logger.debug(
        "Processing of letter {uid} is completed (from {address} to {acc_address})".format(
            uid=mail_uid, address=address, acc_address=account["address"]
        )
    )