Example #1
    def test_rotating_log(self):
        """
            test rotating via proxy
        """
        open("/tmp/lg.log", "w").close()
        args = {}
        args["file_name"] = "/tmp/lg.log"
        args["rotating"] = True
        args["maxBytes"] = 20000
        args["backupCount"] = 10
        # args["level"] = logging.INFO
        (my_log,
         logging_mutex) = make_shared_logger_and_proxy(setup_std_shared_logger,
                                                       "my_logger", args)
        with logging_mutex:
            my_log.debug('This is a debug message')
            my_log.info('This is an info message')
            my_log.warning('This is a warning message')
            my_log.error('This is an error message')
            my_log.critical('This is a critical error message')
            my_log.log(logging.ERROR, 'This is a debug message')
        with open("/tmp/lg.log") as ii:
            self.assertEqual(ii.read(), """This is a warning message
This is an error message
This is a critical error message
This is a debug message
""")
Example #2
    def test_rotating_log(self):
        """
            test rotating via proxy
        """
        open("/tmp/lg.log", "w").close()
        args = {}
        args["file_name"] = "/tmp/lg.log"
        args["rotating"] = True
        args["maxBytes"] = 20000
        args["backupCount"] = 10
        args["level"]= logging.INFO
        (my_log, logging_mutex) = make_shared_logger_and_proxy(
            setup_std_shared_logger, "my_logger", args)

        with logging_mutex:
            my_log.debug('This is a debug message')
            my_log.info('This is an info message')
            my_log.warning('This is a warning message')
            my_log.error('This is an error message')
            my_log.critical('This is a critical error message')
            my_log.log(logging.ERROR, 'This is a debug message')

        with open("/tmp/lg.log") as ii:
            data = ii.readlines()

            self.assertEqual(data,
                             ["This is an info message\n",
                              "This is a warning message\n",
                              "This is an error message\n",
                              "This is a critical error message\n",
                              "This is a debug message\n"])
Example #3
def ruffus_logger(options=None, module_name='pipeline'):
    'Create a shared logger and mutex.'
    if options is None:
        options = DefaultLog()
    logger = logging.getLogger(module_name)
    _setup_std_logging(logger, options.log_file, options.verbose)
    def get_logger(logger_name, args):
        return logger

    (logger_proxy,
     logging_mutex) = make_shared_logger_and_proxy(get_logger, module_name, {})
    logger_proxy.log_file = options.log_file
    return logger_proxy, logging_mutex
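A hedged usage sketch for the helper above (DefaultLog and _setup_std_logging are project-specific names assumed to be defined in the same module):

# Build the shared logger once, then log from any job under the mutex.
logger_proxy, logging_mutex = ruffus_logger()
with logging_mutex:
    logger_proxy.info("pipeline started, logging to %s" % logger_proxy.log_file)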
Example #4
def configure(config, args):
    """
    Setup runtime from config module/dict and command line args

    Parameters
    ----------
    config: dict or Namespace
        Hold configurations used to initialize ApusConfig object
    args: list
        list of arguments to be passed to Ruffus.cmdline module

    Returns
    -------
    apusconf: ApusConfig
        Hold configurations of the Apus
    option: Namespace
        Hold parsed command line arguments
    """
    if isinstance(config, dict):
        apusconf = ApusConfig(**config)
    else:
        apusconf = ApusConfig(config=config)

    parser = cmdline.get_argparse(description="""
+- Astronomy Pipeline Using ruffuS, specifically tweaked for PostCalib -+
""",
                                  version=ruffus.__version__,
                                  prog='postcalib run ... -a ')
    parser.add_argument('-r',
                        '--redo-all',
                        action='store_true',
                        help='force redo all tasks')
    parser.add_argument('-l',
                        '--list-tasks',
                        action='store_true',
                        help='list the task names and exit')

    parser.set_defaults(
        verbose=['0'],
        log_file=os.path.join(apusconf.logdir, apusconf.log_file),
        history_file=os.path.join(apusconf.logdir, apusconf.history_file))
    option = parser.parse_args(args)
    # handle logger
    logger, logger_mutex = make_shared_logger_and_proxy(
        logger_factory, apusconf.jobkey, [option.log_file, option.verbose])
    apusconf.logger = logger
    apusconf.logger_mutex = logger_mutex
    return apusconf, option
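A hypothetical call to configure() above. The dict keys mirror the attributes the function reads from the resulting ApusConfig object (jobkey, logdir, log_file, history_file); the exact ApusConfig constructor is assumed:

apusconf, option = configure(
    {'jobkey': 'job0', 'logdir': '.', 'log_file': 'apus.log',
     'history_file': '.ruffus_history.sqlite'},
    ['--list-tasks'])
# configure() attaches the shared logger and mutex to the config object,
# so downstream tasks can log safely across processes.
with apusconf.logger_mutex:
    apusconf.logger.info('configured job %s' % apusconf.jobkey)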
Example #5
def ruffus_logger(options=None, module_name='pipeline'):
    'Create a shared logger and mutex.'
    if options is None:
        options = DefaultLog()
    logger = logging.getLogger(module_name)
    _setup_std_logging(logger, options.log_file, options.verbose)

    def get_logger(logger_name, args):
        return logger

    (logger_proxy,
     logging_mutex) = make_shared_logger_and_proxy(get_logger, module_name, {})
    logger_proxy.log_file = options.log_file
    return logger_proxy, logging_mutex
Example #6
    def get_logger(self, logger_name, log_file):
        '''
        Return a shared logger proxy and its mutex.
        '''
        # the log file should be in this format:
        # '/<project_path>/<pipeline_name>_<run_id>.log'

        logger_args = {}
        logger_args["file_name"] = log_file
        logger_args["level"] = logging.DEBUG
        logger_args["rotating"] = True
        logger_args["maxBytes"] = 10000000
        logger_args["backupCount"] = 10
        logger_args["formatter"] = \
            "[%(asctime)s] [%(name)s] [%(levelname)s]:\t%(message)s"

        logger_proxy, logger_mutex = make_shared_logger_and_proxy(
            setup_std_shared_logger, logger_name, logger_args)

        return [logger_proxy, logger_mutex]
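A brief usage sketch (the receiver object and file path are illustrative). The method returns a two-element list, so it unpacks like a tuple:

logger_proxy, logger_mutex = runner.get_logger(
    "my_pipeline", "/project/my_pipeline_run1.log")
with logger_mutex:
    logger_proxy.info("task complete")  # serialized across worker processes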
Example #7
    def get_logger(self, logger_name, log_file):
        '''
        Return a shared logger proxy and its mutex.
        '''
        # the log file should be in this format:
        # '/<project_path>/<pipeline_name>_<run_id>.log'

        logger_args = {}
        logger_args["file_name"] = log_file
        logger_args["level"] = logging.DEBUG
        logger_args["rotating"] = True
        logger_args["maxBytes"] = 10000000
        logger_args["backupCount"] = 10
        logger_args["formatter"] = \
            "[%(asctime)s] [%(name)s] [%(levelname)s]:\t%(message)s"

        logger_proxy, logger_mutex = make_shared_logger_and_proxy(
            setup_std_shared_logger, logger_name, logger_args)

        return [logger_proxy, logger_mutex]
Example #8
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())

    check_options(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(textwrap.dedent("""\
                Output file location is not writable."""))
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)
        manager.start()

        context = manager.JobContext()
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exceptions to 5-element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well,
        # and ruffus doesn't support it because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file != '-':
        if options.output_type == 'pdfa':
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))

                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf
    else:
        _log.info("Output sent to stdout")

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))
    direction = {0: 'n', 90: 'e',
                 180: 's', 270: 'w'}
    orientations = []
    for n, page in enumerate(pdfinfo):
        angle = page.rotation or 0
        if angle != 0:
            orientations.append('{0}{1}'.format(
                n + 1,
                direction.get(angle, '')))
    if orientations:
        _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
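The long comment in the except block above is the key detail: ruffus hands back flattened (task, job, exc, value, stack) tuples rather than live exception objects. A hedged sketch of a traversal in that spirit; this is not ocrmypdf's actual traverse_ruffus_exception, and the nested [[...]] shape it tolerates is the ruffus <= 2.6.3 quirk described in the comment:

def walk_ruffus_exceptions(exc_args):
    """Yield (task, exc, value) from flattened ruffus exception tuples."""
    for item in exc_args:
        if isinstance(item, (list, tuple)):
            if len(item) == 5 and isinstance(item[0], str):
                task, job, exc, value, stack = item
                yield task, exc, value
            else:
                # recurse into the [[(task, job, exc, value, stack)]] wrapping
                yield from walk_ruffus_exceptions(item)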
Example #9
    root_logger = logging.getLogger(logger_name)
    root_logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(sys.stderr)
    formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
    handler.setFormatter(formatter_)
    if verbose:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    root_logger.addHandler(handler)
    return root_logger


_logger, _logger_mutex = proxy_logger.make_shared_logger_and_proxy(
    logging_factory, __name__, [None, options.verbose])


class WrappedLogger:
    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def log(self, *args, **kwargs):
        with self.mutex:
            self.logger.log(*args, **kwargs)

    def debug(self, *args, **kwargs):
        with self.mutex:
            self.logger.debug(*args, **kwargs)
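A sketch of how the wrapper above might be used (assumed, not shown in the source). Each call acquires the shared mutex internally, so call sites no longer need an explicit "with _logger_mutex:" block:

import logging

wrapped = WrappedLogger(_logger, _logger_mutex)
wrapped.debug("debug message, serialized across worker processes")
wrapped.log(logging.INFO, "info message via the generic log() method")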
Example #10
    log_file_name, verbose = listargs

    root_logger = logging.getLogger(logger_name)
    root_logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(sys.stderr)
    formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
    handler.setFormatter(formatter_)
    if verbose:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    root_logger.addHandler(handler)
    return root_logger

_logger, _logger_mutex = proxy_logger.make_shared_logger_and_proxy(
    logging_factory, __name__, [None, options.verbose])


class WrappedLogger:

    def __init__(self, my_logger, my_mutex):
        self.logger = my_logger
        self.mutex = my_mutex

    def log(self, *args, **kwargs):
        with self.mutex:
            self.logger.log(*args, **kwargs)

    def debug(self, *args, **kwargs):
        with self.mutex:
            self.logger.debug(*args, **kwargs)
Example #11
from ruffus import pipeline_run, pipeline_printout, Pipeline, parallel, proxy_logger


def logging_factory(logger_name, listargs):
    root_logger = logging.getLogger(logger_name)
    root_logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(sys.stderr)
    formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
    handler.setFormatter(formatter_)
    handler.setLevel(logging.INFO)
    root_logger.addHandler(handler)
    return root_logger


log, log_mutex = proxy_logger.make_shared_logger_and_proxy(
    logging_factory, __name__, [])

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks

#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888


@parallel([['A', 1], ['B', 3], ['C', 3], ['D', 4], ['E', 4], ['F', 4]])
def parallel_task(name, param1):
    sys.stderr.write("    Parallel task %s: \n" % name)
    #raise task.JobSignalledBreak("Oops! I did it again!")
    with log_mutex:
        log.info("    Raising exception")
    raise Exception("new")
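Running this task raises inside the worker processes; ruffus collects those failures and re-raises them in the parent as a RethrownJobError, which is exactly what the run_pipeline() examples elsewhere on this page unpack. A hedged sketch of driving the task (older ruffus flattens the job exceptions into e.args, while newer releases also expose e.job_exceptions, as Example #12 uses):

from ruffus import ruffus_exceptions

try:
    pipeline_run([parallel_task], multiprocess=2)
except ruffus_exceptions.RethrownJobError as e:
    with log_mutex:
        log.info("pipeline failed with %d job exception(s)" % len(e.args))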
Example #12
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1
    print("Inside of options is: " + options)

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    preamble(_log)
    check_options(options, _log)

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In
    # tests this gives better throughput than letting a smaller number of
    # Tesseract jobs run multithreaded. Same story for pngquant. Tesseract <4
    # ignores this variable, but it is harmless to set even when ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file,
                                options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #13
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    _log.debug('qpdf ' + qpdf.version())

    check_options(options, _log)

    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(
                    textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error("Output file location (" + options.output_file + ") " +
                       "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exceptions to 5-element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well,
        # and ruffus doesn't support it because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #14
def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    _log.debug('qpdf ' + qpdf.version())

    check_options(options, _log)

    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    if qpdf.version() < '7.0.0' and not os.environ.get('PYTEST_CURRENT_TEST'):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf.version()))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal.  Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(
                "Output file location (" + options.output_file + ") " +
                "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exceptions to 5-element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well,
        # and ruffus doesn't support it because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #15
def run_pipeline(args=None):
    options = parser.parse_args(args=args)
    options.verbose_abbreviated_path = 1
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_options(options, _log)
    check_dependency_versions(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In
    # tests this gives better throughput than letting a smaller number of
    # Tesseract jobs run multithreaded. Same story for pngquant. Tesseract <4
    # ignores this variable, but it is harmless to set even when ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
Example #16
    The only way around this is to make calls into multiprocessing
    (i.e. make_shared_logger_and_proxy(...)) only after the import phase
    of module loading.

    This Python bug is triggered only on Python 3.2, and only when your
    make_shared_logger_and_proxy() call is at global scope in a module
    other than __main__.

    888888888888888888888888888888888888888888888888888888888888888888888888888

""")
    sys.exit()

(logger_proxy,
 logging_mutex) = make_shared_logger_and_proxy(setup_std_shared_logger,
                                               "my_logger", args)


#
#    task1
#
@originate(input_file_names, logger_proxy, logging_mutex)
def task1(outfile, logger_proxy, logging_mutex):
    write_input_output_filenames_to_output(None, outfile, logger_proxy,
                                           logging_mutex)


#
#    task2
#
@transform(task1, suffix(".1"), ".2", logger_proxy, logging_mutex)
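One hedged sketch of the workaround the note above describes, for importable modules where the call cannot simply sit at global scope (args as defined earlier in the excerpt): create the proxy lazily, on first use, so the multiprocessing call happens only after module import has finished.

_logger_state = None

def get_shared_logger():
    # Created on first call, i.e. after the import phase, which sidesteps
    # the Python 3.2 issue described in the warning text above.
    global _logger_state
    if _logger_state is None:
        _logger_state = make_shared_logger_and_proxy(
            setup_std_shared_logger, "my_logger", args)
    return _logger_state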
Example #17
    The only way around this is to make calls into multiprocessing
    (i.e. make_shared_logger_and_proxy(...)) only after the import phase
    of module loading.

    This Python bug is triggered only on Python 3.2, and only when your
    make_shared_logger_and_proxy() call is at global scope in a module
    other than __main__.

    888888888888888888888888888888888888888888888888888888888888888888888888888

""")
    sys.exit()

(logger_proxy,
 logging_mutex) = make_shared_logger_and_proxy(setup_std_shared_logger,
                                               "my_logger", args)


#
#    task1
#
@originate(input_file_names, logger_proxy, logging_mutex)
def task1(outfile, logger_proxy, logging_mutex):
    write_input_output_filenames_to_output(
        None, outfile, logger_proxy, logging_mutex)


#
#    task2
#
@transform(task1, suffix(".1"), ".2", logger_proxy, logging_mutex)
Example #18
import ruffus
from ruffus import pipeline_run, pipeline_printout, Pipeline, parallel, proxy_logger


def logging_factory(logger_name, listargs):
    root_logger = logging.getLogger(logger_name)
    root_logger.setLevel(logging.DEBUG)

    handler = logging.StreamHandler(sys.stderr)
    formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
    handler.setFormatter(formatter_)
    handler.setLevel(logging.INFO)
    root_logger.addHandler(handler)
    return root_logger

log, log_mutex = proxy_logger.make_shared_logger_and_proxy(
    logging_factory, __name__, [])


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

#   Tasks


#88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888

@parallel([['A', 1], ['B', 3], ['C', 3], ['D', 4], ['E', 4], ['F', 4]])
def parallel_task(name, param1):
    sys.stderr.write("    Parallel task %s: \n" % name)
    #raise task.JobSignalledBreak("Oops! I did it again!")
    with log_mutex:
        log.info("    Raising exception")
    raise Exception("new")