Esempio n. 1
0
def main(loop: asyncio.AbstractEventLoop) -> None:
    """Start every service, wire the handler modules and run the bot.

    The four ``start_*`` coroutines are awaited together on ``loop``; their
    results are then published as module attributes on ``queuehandlers`` and
    ``messagehandlers`` before the loop is run forever.  A
    ``KeyboardInterrupt`` triggers the matching ``exit_*`` coroutines.

    Args:
        loop: Event loop used for startup, serving and shutdown.
    """
    startup = asyncio.gather(
        start_telethon(),
        start_pytonisadb(),
        start_rabbitmq(loop),
        start_pytonisa_file_storage(),
    )
    telegram_client, database, broker, file_storage = \
        loop.run_until_complete(startup)

    # Only the queue handlers receive the telegram client.
    queuehandlers.telegram = telegram_client

    # Both handler modules share the remaining service handles.
    for handlers in (queuehandlers, messagehandlers):
        handlers.rabbitmq = broker
        handlers.pytonisadb = database
        handlers.pytonisa_files = file_storage

    log.info('Bot initiated')

    try:
        loop.run_forever()
    except KeyboardInterrupt:
        shutdown = asyncio.gather(
            exit_telethon(telegram_client),
            exit_rabbitmq(broker),
            exit_pytonisadb(database),
            exit_pytonisa_file_storage(file_storage),
        )
        loop.run_until_complete(shutdown)
Esempio n. 2
0
async def on_document_error(message: IncomingMessage):
    """Tell the requesting chat that OCR failed for a document.

    The message body carries the id of an OCR request.  The stored request is
    loaded from ``pytonisadb``, then two Telegram messages are sent: a header
    replying to the original message and a bulleted list of the recorded
    errors.  The queue message is acked once both were sent.

    If ``pytonisadb`` has not been set yet, the handler waits 10 seconds and
    nacks the message instead of processing it.

    Args:
        message: Incoming queue message whose body is the OCR request id.
    """
    if pytonisadb is None:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported
        # spelling.
        log.warning(
            'on_document_error called before database is ready, sleeping 10 seconds')
        await asyncio.sleep(10)
        await message.nack()
        return

    ocr_request_id = message.body.decode()
    log.info('Sending error for document of id ' + ocr_request_id)

    document = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message = QueueMessage(**document)
    # The stored ``ocr_args`` is a plain mapping; rebuild the args object.
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    await telegram.send_message(
        entity=queue_message.chat_id,
        message='Infelizmente não foi possível reconhecer seu pdf. O(s) seguinte(s) erro(s) ocorreu(ram):',
        reply_to=queue_message.message_id,
    )
    await telegram.send_message(
        entity=queue_message.chat_id,
        message='- ' + '\n- '.join(queue_message.errors)
    )

    await message.ack()
Esempio n. 3
0
async def pdf_to_ocr(event: events.newmessage.NewMessage.Event) -> None:
    """Handle a message asking for OCR to be applied to an attached pdf.

    This function handles incoming new messages that respect the
    pattern '(^-)|(^$)' (messages that are empty or start with -)
    and have an attached pdf file.

    The attachment is downloaded, uploaded through ``pytonisa_files``, a
    request document is stored in ``pytonisadb`` and the id of that document
    is published to the ``Queues.TO_PROCESS`` queue.

    Args:
        event (`events.newmessage.NewMessage.Event`):
            The new message event (from telethon)
    """

    message_obj: custom.message.Message = event.message

    log.info('-' * 20 + 'pdf_to_ocr called' + '-' * 20)
    await message_obj.reply('Arquivo recebido!')

    default_args = OcrMyPdfArgs(arg_string=message_obj.message)
    log.info('Language set to: ' + ' '.join(default_args.language))

    # The file name comes from the sender, so it is untrusted: keep only its
    # last path component so it cannot escape the download directory (an
    # absolute name would otherwise replace the directory in os.path.join),
    # and fall back to a generated name when none usable is provided.
    file_name = os.path.basename(message_obj.file.name or '')
    if file_name in ('', '.', '..'):
        file_name = f'{message_obj.id}.pdf'

    file_path: str = os.path.join(pytonisa_files.get_valid_path(), file_name)
    file_path = await message_obj.download_media(file=file_path)
    pytonisa_files.upload_file(file_path)

    channel: Channel = rabbitmq['channel']
    queue_message = QueueMessage(os.path.basename(file_path),
                                 message_obj.chat_id, message_obj.id,
                                 default_args)

    # Build the document from copies so that ``queue_message`` keeps its
    # ``ocr_args`` object instead of having it replaced by a plain dict.
    document: dict = dict(vars(queue_message))
    document['ocr_args'] = dict(vars(queue_message.ocr_args))
    result: dict = pytonisadb.ocr_requests.put_item(document)

    object_id: str = result['_id']

    log.info(f'document of id {object_id} created')

    # Only the document id travels through the queue.
    encoded_id = bytes(object_id, 'utf-8')
    await channel.default_exchange.publish(Message(encoded_id),
                                           routing_key=Queues.TO_PROCESS.value)

    log.info('Arquivo inserido na fila para processamento')
    await message_obj.respond('Arquivo inserido na fila para processamento')
    log.info('Finalizado')
Esempio n. 4
0
async def on_document_processed(message: IncomingMessage):
    """Send a processed document back to the chat that requested it.

    The message body carries the id of an OCR request.  The stored request is
    loaded from ``pytonisadb``, the file named by the request is fetched
    through ``pytonisa_files`` and sent to the chat as a reply to the original
    message.  The queue message is acked after the file was sent.

    If ``pytonisadb`` has not been set yet, the handler waits 10 seconds and
    nacks the message instead of processing it.

    Args:
        message: Incoming queue message whose body is the OCR request id.
    """
    log.info('-'*20 + 'on_document_processed called' + '-'*20)

    if pytonisadb is None:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported
        # spelling.
        log.warning(
            'on_document_processed called before database is ready, sleeping 10 seconds')
        await asyncio.sleep(10)
        await message.nack()
        return

    ocr_request_id = message.body.decode()
    log.info('Sending processed document of id ' + ocr_request_id)

    document: dict = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message = QueueMessage(**document)
    # The stored ``ocr_args`` is a plain mapping; rebuild the args object.
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    output_file = pytonisa_files.download_file(queue_message.file_name)

    await telegram.send_message(
        entity=queue_message.chat_id,
        message='OCR feito! Estamos fazendo upload do seu arquivo',
    )
    # Keep the file open only for the duration of the upload.
    with open(output_file, 'rb') as file:
        await telegram.send_message(
            entity=queue_message.chat_id,
            message='Aqui está!',
            reply_to=queue_message.message_id,
            file=file,
        )

    log.info('File sent')

    await message.ack()
Esempio n. 5
0
def main() -> None:
    """Start the services, wire ``queuehandler`` and consume the queue.

    The handles returned by the ``start_*`` functions are published as
    attributes of ``queuehandler``; the RabbitMQ channel then consumes until a
    ``KeyboardInterrupt`` arrives, at which point the services are shut down
    and every thread in ``threads`` is joined.
    """
    broker = start_rabbitmq()
    database = start_pytonisadb()
    file_storage = start_pytonisa_file_storage()

    queuehandler.rabbitmq = broker
    queuehandler.pytonisadb = database
    queuehandler.pytonisa_files = file_storage

    log.info('ocrmypdf processor initiated')

    try:
        consumer_channel: BlockingChannel = broker['channel']
        consumer_channel.start_consuming()
    except KeyboardInterrupt:
        log.info('ending ocrmypdf')

        # Shut the services down in the same order as before.
        exit_rabbitmq(broker)
        exit_pytonisadb(database)
        exit_pytonisa_file_storage(file_storage)

        # Wait for the threads registered in ``threads`` to finish.
        for worker in threads:
            worker.join()
        log.info('ocrmypdf ended')
Esempio n. 6
0
def on_document_to_process(channel: BlockingChannel, method: Basic.Deliver,
                           properties: BasicProperties, body: Union[str,
                                                                    bytes]):
    """Run OCR on the document referenced by a queue message.

    The body carries the id of an OCR request stored in ``pytonisadb``.  The
    request is marked as started, its file is downloaded, processed in place
    with ``ocrmypdf`` and uploaded again.  On success the request is marked as
    processed and ``ack_message`` is scheduled on the RabbitMQ connection.
    Any failure is reported through ``handle_error``.

    Args:
        channel: Channel the message was delivered on; passed on to
            ``handle_error``.
        method: Delivery information; only ``delivery_tag`` is used.
        properties: Message properties (unused).
        body: Encoded OCR request id.  It is decoded with ``.decode()``, so a
            bytes value is expected.
    """
    connection: BlockingConnection = rabbitmq['connection']

    ocr_request_id = body.decode()

    # Pre-bind everything ``handle_error`` needs apart from the message and
    # the optional exception.
    handle_error_partial: partial = partial(handle_error,
                                            channel=channel,
                                            ocr_request_id=ocr_request_id,
                                            delivery_tag=method.delivery_tag)

    log.info('-' * 20 + ocr_request_id + '-' * 20)
    log.info('Processing document of id ' + ocr_request_id)

    document: dict = pytonisadb.ocr_requests.get_item(ocr_request_id)
    queue_message: QueueMessage = QueueMessage(**document)
    # The stored ``ocr_args`` is a plain mapping; rebuild the args object.
    queue_message.ocr_args = OcrMyPdfArgs(**queue_message.ocr_args)

    # A request that was already started is reported as an error rather than
    # processed a second time.
    if queue_message.started_processing:
        handle_error_partial(
            message=
            'Tentando processar um item repetido, provavelmente o servidor crashou no reconhecimento OCR anterior'
        )
        return

    queue_message.started_processing = True
    pytonisadb.ocr_requests.update_item(ocr_request_id,
                                        {'started_processing': True})

    input_file = pytonisa_files.download_file(queue_message.file_name)
    # The result is written over the downloaded file.
    output_file = input_file

    log.info('Iniciando processamento OCR')

    try:
        try:
            ocrmypdf.ocr(input_file=input_file,
                         output_file=output_file,
                         **vars(queue_message.ocr_args))
        except ocrmypdf.PriorOcrFoundError:
            log.info('Arquivo já possui OCR')

            queue_message.ocr_args.set_force_ocr()
            pytonisadb.ocr_requests.update_item(
                ocr_request_id, {'ocr_args': vars(queue_message.ocr_args)})

            # The retry stays inside the outer ``try`` so that a failure here
            # is reported through ``handle_error`` as well, instead of
            # escaping the callback.
            ocrmypdf.ocr(input_file=input_file,
                         output_file=output_file,
                         **vars(queue_message.ocr_args))
    except ocrmypdf.MissingDependencyError as mde:
        handle_error_partial(
            message='Não foi possível processar alguma das línguas solicitadas',
            e=mde,
        )
        return
    except Exception as e:
        handle_error_partial(
            message='Ocorreu um erro desconhecido',
            e=e,
        )
        return

    pytonisa_files.upload_file(output_file)

    queue_message.processed = True
    pytonisadb.ocr_requests.update_item(ocr_request_id, {
        'processed': True,
        'ocr_args': vars(queue_message.ocr_args)
    })

    log.info('Processamento OCR finalizado com sucesso!')

    # The ack is handed to the connection through ``add_callback_threadsafe``
    # instead of being called directly from here.
    cb = partial(ack_message,
                 delivery_tag=method.delivery_tag,
                 routing_key=Queues.PROCESSED.value,
                 message=body)
    connection.add_callback_threadsafe(cb)