def run_pipeline():
    """Run the ruffus pipeline and translate the outcome into an ExitCode.

    Reads the module-level ``options``.  A job count that is unset or equal
    to 1 is replaced by ``available_cpu_count()`` before the run.

    Returns:
        ``ExitCode.ok`` when the run succeeds and ``validate_pdfa`` accepts
        the output file, ``ExitCode.invalid_output_pdfa`` when validation
        fails, the value recovered from a job's ``SystemExit`` when a job
        exited deliberately, and ``ExitCode.other_error`` for any other
        ``RethrownJobError``.
    """
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()

    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as job_error:
        if options.verbose:
            print(job_error)

        # Yuck.  Hunt through the ruffus exception to find out what the
        # return code is supposed to be.  Each entry is unpacked as
        # (task_name, job_name, exc_name, exc_value, exc_stack).
        for _task, _job, kind, value, _stack in job_error.args:
            if kind != 'builtins.SystemExit':
                continue
            # SECURITY NOTE(review): eval() executes whatever text the job's
            # exception carried.  Builtins are still reachable because the
            # globals dict does not override ``__builtins__``.  Consider a
            # plain attribute lookup on ExitCode instead.
            return eval(
                value,
                {'ExitCode': ExitCode},
                {'exc_value': value})
        return ExitCode.other_error

    if validate_pdfa(options.output_file, _log):
        return ExitCode.ok

    _log.warning('Output file: The generated PDF/A file is INVALID')
    return ExitCode.invalid_output_pdfa
def bootstrap(config=None, option=None):
    """Entry point: parse the command line, build the pipeline and run it.

    Args:
        config: configuration object handed to ``configure``; defaults to
            the ``__main__`` module.
        option: list of command-line arguments; defaults to ``sys.argv[1:]``.

    When ``option.list_tasks`` is set, the task names are printed and the
    process exits with status 0 without building or running the pipeline.
    """
    print("+- Apus powered by Ruffus ver {0} -+".format(ruffus.__version__))

    config = sys.modules['__main__'] if config is None else config
    option = sys.argv[1:] if option is None else option
    config, option = configure(config, option)

    if option.list_tasks:
        task_names = config.get_task_names()
        if task_names:
            for task_name in task_names:
                print("{0}".format(task_name))
        else:
            print("no tasks found")
        sys.exit(0)

    # set up astromatic config
    config.am = am.AmConfig(**config.env_overrides)
    build_pipeline(config)

    # handle redo-all: every task known to ruffus becomes a forced task
    if option.redo_all:
        option.forced_tasks.extend(ruffus.pipeline_get_task_names())

    if len(option.forced_tasks) > 0:
        for forced in option.forced_tasks:
            config.logger.info(
                "forced redo: {0}".format(utils.alert(forced)))

    cmdline.run(option, checksum_level=1)
def main():
    '''Initialise the pipeline, then run it.

    Parses the command line, sets up logging and a DRMAA session, loads and
    validates the configuration, builds the pipeline and hands control to
    ``cmdline.run``.  If the DRMAA library cannot be imported or the session
    cannot be initialised, an error is printed and the process exits with
    ``error_codes.DRMAA_ERROR``.  The DRMAA session is shut down even when a
    later step raises.
    '''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))

    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(
            progname=program_name), file=sys.stdout)
        # Format the exception itself: Python 3 exceptions have no
        # ``.message`` attribute, and ``file=`` belongs to print(), not to
        # str.format().
        print("Error message: {msg}".format(msg=e), file=sys.stdout)
        exit(error_codes.DRMAA_ERROR)

    try:
        # Parse the configuration file, and initialise global state
        config = Config(options.config)
        config.validate()
        state = State(options=options, config=config, logger=logger,
                      drmaa_session=drmaa_session)
        # Build the pipeline workflow
        pipeline = make_pipeline(state)
        # Run (or print) the pipeline
        cmdline.run(options)
    finally:
        if drmaa_session is not None:
            # Shut down the DRMAA session, also on failure
            drmaa_session.exit()
def main():
    '''Initialise the pipeline, then run it.

    Parses the command line, sets up logging and a DRMAA session, loads and
    validates the configuration, builds the pipeline and hands control to
    ``cmdline.run``.  If the DRMAA library cannot be imported or the session
    cannot be initialised, an error is printed and the process exits with
    ``error_codes.DRMAA_ERROR``.  The DRMAA session is shut down even when a
    later step raises.
    '''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))

    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(
            progname=program_name), file=sys.stdout)
        # Format the exception itself: Python 3 exceptions have no
        # ``.message`` attribute, and ``file=`` belongs to print(), not to
        # str.format().
        print("Error message: {msg}".format(msg=e), file=sys.stdout)
        exit(error_codes.DRMAA_ERROR)

    try:
        # Parse the configuration file, and initialise global state
        config = Config(options.config)
        config.validate()
        state = State(options=options, config=config, logger=logger,
                      drmaa_session=drmaa_session)
        # Build the pipeline workflow
        pipeline = make_pipeline(state)
        # Run (or print) the pipeline
        cmdline.run(options)
    finally:
        if drmaa_session is not None:
            # Shut down the DRMAA session, also on failure
            drmaa_session.exit()
def run_pipeline():
    """Run the ruffus pipeline, validate the output and report rotations.

    Reads the module-level ``options``, ``work_folder``, ``_log``,
    ``_pdfinfo`` and ``_pdfinfo_lock``.

    Returns:
        ``ExitCode.ok`` on success, ``ExitCode.invalid_output_pdfa`` when
        ``validate_pdfa`` rejects the output, the code produced by
        ``traverse_ruffus_exception`` for a ``RethrownJobError`` (or
        ``ExitCode.other_error`` when it yields ``None``), and
        ``ExitCode.other_error`` for any other exception.
    """
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as job_error:
        if options.verbose:
            # stringify exception so logger doesn't have to
            _log.debug(str(job_error))

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.
        code = traverse_ruffus_exception(job_error.args)
        if code is not None:
            return code
        _log.error("Unexpected ruffus exception: " + str(job_error))
        _log.error(repr(job_error))
        return ExitCode.other_error
    except Exception as error:
        _log.error(error)
        return ExitCode.other_error

    output_is_valid = validate_pdfa(options.output_file, _log)
    if not output_is_valid:
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        compass = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        rotated_pages = []
        for index, _page in enumerate(_pdfinfo):
            rotation = _pdfinfo[index].get('rotated', 0)
            if rotation == 0:
                continue
            # 1-based page number followed by the compass letter
            rotated_pages.append(
                '{0}{1}'.format(index + 1, compass.get(rotation, '')))
        if rotated_pages:
            _log.info(
                'Page orientations detected: ' + ' '.join(rotated_pages))

    return ExitCode.ok
def main():
    """Parse the ruffus command line and run the pipeline.

    When target tasks were given on the command line the parsed arguments
    are handed to ``cmdline.run``; otherwise ``pipeline_run`` is called
    with ``publish_data``.
    """
    arg_parser = cmdline.get_argparse(description="Trench Run pipeline")
    parsed = arg_parser.parse_args()

    if not parsed.target_tasks:
        pipeline_run(publish_data)
        return

    cmdline.run(parsed)
def run_pipeline():
    """Run the ruffus pipeline, validate the output and report rotations.

    Reads the module-level ``options``, ``work_folder``, ``_log``,
    ``_pdfinfo`` and ``_pdfinfo_lock``.

    Returns:
        ``ExitCode.ok`` on success, ``ExitCode.invalid_output_pdfa`` when
        ``validate_pdfa`` rejects the output, the code produced by
        ``traverse_ruffus_exception`` for a ``RethrownJobError`` (or
        ``ExitCode.other_error`` when it yields ``None``), and
        ``ExitCode.other_error`` for any other exception.
    """
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as job_error:
        if options.verbose:
            # stringify exception so logger doesn't have to
            _log.debug(str(job_error))

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.
        code = traverse_ruffus_exception(job_error.args)
        if code is not None:
            return code
        _log.error("Unexpected ruffus exception: " + str(job_error))
        _log.error(repr(job_error))
        return ExitCode.other_error
    except Exception as error:
        _log.error(error)
        return ExitCode.other_error

    output_is_valid = validate_pdfa(options.output_file, _log)
    if not output_is_valid:
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        compass = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        rotated_pages = []
        for index, _page in enumerate(_pdfinfo):
            rotation = _pdfinfo[index].get('rotated', 0)
            if rotation == 0:
                continue
            # 1-based page number followed by the compass letter
            rotated_pages.append(
                '{0}{1}'.format(index + 1, compass.get(rotation, '')))
        if rotated_pages:
            _log.info(
                'Page orientations detected: ' + ' '.join(rotated_pages))

    return ExitCode.ok
def main(program_name, program_version, make_pipeline):
    '''Initialise the pipeline, then run it.

    Args:
        program_name: name used in the DRMAA error message.
        program_version: passed to ``parse_command_line``.
        make_pipeline: callable that receives the ``State`` object and
            builds the pipeline workflow.

    If the DRMAA library cannot be imported or the session cannot be
    initialised, an error is printed and the process exits with
    ``error_codes.DRMAA_ERROR``.  The DRMAA session is shut down even when
    a later step raises.
    '''
    # Parse command line arguments
    options = parse_command_line(program_version)

    # Initialise the logger
    # logger = Logger(__name__, options.log_file, options.verbose)
    if options.log_file:
        logging.basicConfig(filename=options.log_file,
                            level=LOGGING_LEVEL,
                            filemode="a",
                            format="%(asctime)s %(levelname)s - %(message)s",
                            datefmt="%m-%d-%Y %H:%M:%S")
    # Created unconditionally so the calls below work without a log file.
    logger = logging.getLogger(__name__)

    # Log the command line used to run the pipeline
    logger.info("*** rnapipe ***")
    logger.info(' '.join(sys.argv))

    drmaa_session = None
    try:
        # Set up the DRMAA session for running cluster jobs
        import drmaa
        drmaa_session = drmaa.Session()
        drmaa_session.initialize()
    except Exception as e:
        print("{progname} error using DRMAA library".format(
            progname=program_name), file=sys.stdout)
        # Format the exception itself: Python 3 exceptions have no
        # ``.message`` attribute, and ``file=`` belongs to print(), not to
        # str.format().
        print("Error message: {msg}".format(msg=e), file=sys.stdout)
        exit(error_codes.DRMAA_ERROR)

    try:
        # Parse the configuration file, and initialise global state
        config = Config(options.config)
        config.validate()
        state = State(options=options, config=config, logger=logger,
                      drmaa_session=drmaa_session)
        # Build the pipeline workflow
        pipeline = make_pipeline(state)
        # Run (or print) the pipeline
        cmdline.run(options)
    finally:
        if drmaa_session is not None:
            # Shut down the DRMAA session, also on failure
            drmaa_session.exit()
def main():
    '''Initialise the pipeline, then run it.

    Parses the command line, sets up logging and a DRMAA session, loads and
    validates the configuration, builds the pipeline and hands control to
    ``cmdline.run``.  The DRMAA session is shut down even when one of the
    later steps raises.
    '''
    # Parse command line arguments
    options = parse_command_line()
    # Initialise the logger
    logger = Logger(__name__, options.log_file, options.verbose)
    # Log the command line used to run the pipeline
    logger.info(' '.join(sys.argv))

    # Set up the DRMAA session for running cluster jobs
    drmaa_session = drmaa.Session()
    drmaa_session.initialize()
    try:
        # Parse the configuration file, and initialise global state
        config = Config(options.config)
        config.validate()
        state = State(options=options, config=config, logger=logger,
                      drmaa_session=drmaa_session)
        # Build the pipeline workflow
        pipeline = make_pipeline(state)
        # Run (or print) the pipeline
        cmdline.run(options)
    finally:
        # Shut down the DRMAA session, also when the run fails
        drmaa_session.exit()
def run_pipeline():
    """Run the ruffus pipeline and translate the outcome into an ExitCode.

    Reads the module-level ``options``, ``work_folder`` and ``_log``.

    Returns:
        ``ExitCode.ok`` when the run succeeds and ``validate_pdfa`` accepts
        the output, ``ExitCode.invalid_output_pdfa`` when validation fails,
        and for a ``RethrownJobError`` the code derived from the first
        recognised job exception, or ``ExitCode.other_error``.
    """
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck.  Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # The regex captures the member name between the first '.'
                # and the following ')'.
                match = re.search(r"\.(.+?)\)", exc_value)
                if match is None:
                    # Unrecognised SystemExit text: report a generic error
                    # instead of failing on ``None.groups()``.
                    return ExitCode.other_error
                # Fall back to the ExitCode member, not to a bare string.
                return getattr(
                    ExitCode, match.group(1), ExitCode.other_error)
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
def run_pipeline():
    """Run the ruffus pipeline and translate the outcome into an ExitCode.

    Reads the module-level ``options`` and ``_log``.  A job count that is
    unset or equal to 1 is replaced by ``available_cpu_count()``.

    Returns:
        ``ExitCode.ok`` when the run succeeds and ``validate_pdfa`` accepts
        the output, ``ExitCode.invalid_output_pdfa`` when validation fails,
        and for a ``RethrownJobError`` the code derived from the first
        recognised job exception, or ``ExitCode.other_error``.
    """
    if not options.jobs or options.jobs == 1:
        options.jobs = available_cpu_count()

    try:
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            print(e)

        # Yuck.  Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # The regex captures the member name between the first '.'
                # and the following ')'.
                match = re.search(r"\.(.+?)\)", exc_value)
                if match is None:
                    # Unrecognised SystemExit text: report a generic error
                    # instead of failing on ``None.groups()``.
                    return ExitCode.other_error
                # Fall back to the ExitCode member, not to a bare string.
                return getattr(
                    ExitCode, match.group(1), ExitCode.other_error)
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                print(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    print("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file

        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    return ExitCode.ok
def run_pipeline():
    """Run the ruffus pipeline, validate the output and report rotations.

    Reads the module-level ``options``, ``work_folder``, ``_log``,
    ``_pdfinfo`` and ``_pdfinfo_lock``.

    Returns:
        ``ExitCode.ok`` on success, ``ExitCode.invalid_output_pdfa`` when
        ``validate_pdfa`` rejects the output, a specific code for the first
        recognised job exception inside a ``RethrownJobError``, and
        ``ExitCode.other_error`` for anything else.
    """
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(e)

        # Yuck.  Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        # Ruffus flattens the exception to a string, throwing away all kinds
        # of helpful details
        #   task_name, job_name - ruffus status
        #   exc_name - class name of exception
        #   exc_value - irritating string that makes impossible to recover
        #               exception object
        #   exc_stack - string that contains traceback of exception
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # The regex captures the member name between the first '.'
                # and the following ')'.
                match = re.search(r"\.(.+?)\)", exc_value)
                if match is None:
                    # Unrecognised SystemExit text: report a generic error
                    # instead of failing on ``None.groups()``.
                    return ExitCode.other_error
                # Fall back to the ExitCode member, not to a bare string.
                return getattr(
                    ExitCode, match.group(1), ExitCode.other_error)
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                _log.error(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    _log.error("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file
            elif exc_name == 'subprocess.CalledProcessError':
                # It's up to the subprocess handler to report something useful
                msg = "Error occurred while running this command:"
                _log.error(msg + '\n' + exc_value)
                return ExitCode.child_process_error
            elif not options.verbose:
                _log.error(e)

        # Nothing recognised: always leave the handler with an error code.
        return ExitCode.other_error
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                # 1-based page number followed by the compass letter
                orientations.append('{0}{1}'.format(
                    n + 1, direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' +
                      ' '.join(orientations))

    return ExitCode.ok
# Parse the command line, then resolve and check the run parameters.
args = parser.parse_args()
params = get_params(args, args.params)
check_params(args, params)

# Make sure <outdir>/logs exists before a log file is opened in it.
logs_dir = args.outdir + '/logs'
if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

# One log file per invocation, named after the current timestamp.
log_file = '%s/log.%s.txt' % (logs_dir, datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
logger, logging_mutex = cmdline.setup_logging(__name__, log_file, args.verbose)
# NOTE: print statement syntax -- this script targets Python 2.
print 'log_file:', log_file

cmdline.run(args)

# Read pairs come either directly from --fq or from a list file.
read_pairs = []
if args.fq:
    read_pairs = format_read_pairs(fqs=args.fq)
elif args.fq_list:
    read_pairs = format_read_pairs(list_file=args.fq_list)

history_file = '%s/.ruffus_history.sqlite' % args.outdir

# Output directories carry the version reported by get_version() per tool.
bbt_outdir = '%s/bbt_%s' % (args.outdir, get_version('bbt'))
assembly_outdir = '%s/rnabloom_%s' % (args.outdir, get_version('rnabloom'))
pv_outdir = '%s/pv_%s' % (args.outdir, get_version('pv'))
bbt_prefix = bbt_outdir + '/' + args.sample

# for determining how many procs/threads to give to each analysis
num_analysis = 2
def run_pipeline():
    """Run the pipeline through ``cmdline.run`` with one process per CPU.

    The value of ``available_cpu_count()`` is passed as the ``multiprocess``
    option; the module-level ``options`` supplies everything else.
    """
    worker_count = available_cpu_count()
    cmdline.run(options, multiprocess=worker_count)
except error_drmaa_job as err: raise Exception("\n".join(map(str, ["Failed to run:", cmd, err, stdout_res, stderr_res]))) with logger_mutex: logger.debug("kallisto worked") if __name__ == '__main__': cmdline.run (options, multithread = options.jobs) drmaa_session.exit() pipeline_printout_graph ("bulk_rna-seq.jpg", "jpg", [trim_fastq,hisat2,star,kallisto,cufflinks,qorts], no_key_legend=True, ignore_upstream_of_target=True, pipeline_name="bulk RNA-seq", user_colour_scheme = { "colour_scheme_index" :2, "Bulk RNA-seq" :{"fontcolor" : '"#FF3232"' }, "Task to run" :{"linecolor" : '"#0044A0"' }, "Final target" :{"fillcolor" : '"#EFA03B"', "fontcolor" : "black", "dashed" : 0 } }) pipeline_printout()
def run_pipeline(args=None):
    """Parse arguments, build and run the ruffus pipeline, check the output.

    Args:
        args: argument list forwarded to ``parser.parse_args``.

    Returns:
        An exit code: ``ExitCode.ok`` on success, another ``ExitCode``
        member for the failures handled below, the value returned by
        ``traverse_ruffus_exception`` for a ruffus job error, or the
        ``exit_code`` carried by an ``ExitCodeException``.
    """
    options = parser.parse_args(args=args)
    options.verbose_abbreviated_path = 1
    # The _OCRMYPDF_THREADS environment variable switches on use_threads.
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_options(options, _log)
    check_dependency_versions(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        # All intermediate files, including the ruffus history database,
        # live in a fresh temporary folder.
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        # The job context is created through a manager and handed to
        # build_pipeline below.
        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # Register removal of the working files for interpreter exit.
        atexit.register(cleanup_working_files, work_folder, options)
        # os.nice is not available on every platform.
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        # A regular output file: check the PDF/A claim (when a pdfa output
        # type was requested) and then the file's validity.
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
def run_pipeline():
    """Parse arguments, build and run the ruffus pipeline, check the output.

    Returns:
        An exit code: ``ExitCode.ok`` on success, another ``ExitCode``
        member for the failures handled below, the value returned by
        ``traverse_ruffus_exception`` for a ruffus job error, or the
        ``exit_code`` carried by an ``ExitCodeException``.
    """
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())

    check_options(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        # All intermediate files, including the ruffus history database,
        # live in a fresh temporary folder.
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            # Refuse to write the output to an interactive terminal.
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout is connected to a terminal.
                    Please redirect stdout to a file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(textwrap.dedent("""\
                Output file location is not writable."""))
            return ExitCode.file_access_error

        # The job context is created through a manager and handed to
        # build_pipeline below.
        manager = JobContextManager()
        manager.register('JobContext', JobContext)
        manager.start()

        context = manager.JobContext()
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # Register removal of the working files for interpreter exit.
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.
        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        else:
            return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file != '-':
        # A regular output file: check the PDF/A claim (when 'pdfa' output
        # was requested) and then the file's validity.
        if options.output_type == 'pdfa':
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf
    else:
        _log.info("Output sent to stdout")

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))
    # Report every page with a non-zero rotation as "<page><letter>",
    # using 1-based page numbers.
    direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
    orientations = []
    for n, page in enumerate(pdfinfo):
        angle = pdfinfo[n].rotation or 0
        if angle != 0:
            orientations.append('{0}{1}'.format(
                n + 1,
                direction.get(angle, '')))
    if orientations:
        _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
(outfile, infiles[0], infiles[1])) out.close() @follows(exampleCombinations, examplePermutations, exampleProduct) def advancedRuffus(): ''' This is a dummy function to demonstrate the use of dummy functions to run subsections of the pipeline. Running the pipeline as make advancedRuffus will update, if needed, exampleCombinations, examplePermutations and exampleProduct, plus any prior steps they depend upon - these are exampleOriginate, exampleTransform and exampleSubdivide. exampleMerge, exampleSplit and exampleCollate will not be run. ''' @follows(basicRuffus, advancedRuffus) def full(): ''' All cgat pipelines should end with a full() function which updates, if needed, all branches of the pipeline. The @follows statement should ensure that all functions are covered, either directly or as prerequisites. ''' # this is essential to run the pipeline with ruffus cmdline.run(options)
dfm = dfo.join(dft) # Merge in preparation for comparison assert len(dfo) == len(dft) == len(dfm) results.append({ 'numerator': info['numerator'], 'denominator': info['denominator'], 'pearson': dfm.corr(method='pearson').loc['orig']['xform'], 'spearman': dfm.corr(method='spearman').loc['origRnk']['xformRnk'], 'kendall': dfm.corr(method='kendall').loc['origRnk']['xformRnk'] }) lg.info("xform::powall_cmp path::%s ::done" % xfpath) newk = 'comparison' ous[newk] = pd.DataFrame(results).set_index( ['numerator', 'denominator']) ous.get_storer(newk).attrs.info = ins.get_storer('orig').attrs.info finally: ins.close() ous.close() cmdline.run(options, checksum_level=rf.ruffus_utility.CHECKSUM_HISTORY_TIMESTAMPS, logger=lg)
pipe.transform( name="convert_csv_files_to_tsv", task_func=csv_to_tsv, input=output_from("create_three_new_files"), filter=suffix(".csv"), output=".tsv", ) pipe.transform( name="calculate_md5", task_func=md5, input=output_from("convert_csv_files_to_tsv"), filter=suffix(".tsv"), output=".md5sum", ) return pipe if __name__ == "__main__": parser = cmdline.get_argparse(description="CNV Calling", ignored_args=["jobs"]) options = parser.parse_args() options.history_file = os.path.join(WORK_DIR, ".ruffus_history.sqlite") pipeline = build_pipeline() cmdline.run(options, multithead=3)
# parser.add_argument('--pipeline', "-p",
#                     type=str,
#                     choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'],
#                     help="Defining which pipeline to run")
parser.add_argument(
    '--config_file', "-cf",
    type=str,
    #metavar="config_file",
    help="yaml file with pipeline parameters")

options = parser.parse_args()

## standard python logger which can be synchronised across concurrent Ruffus tasks
## define logging output with --log_file log_file_name
logger, logger_mutex = cmdline.setup_logging(__name__, options.log_file, options.verbose)

# Only when actually running, i.e. not printing, drawing a flowchart or
# merely touching files.
if not options.just_print and \
        not options.flowchart and \
        not options.touch_files_only:
    # open() works on Python 2 and 3 (file() is Python 2 only) and the
    # context manager closes the handle.  safe_load() replaces the bare
    # yaml.load(), which can construct arbitrary Python objects and needs
    # an explicit Loader on current PyYAML releases.
    with open(options.config_file, 'r') as config_file:
        config = yaml.safe_load(config_file)
    pipeline1a = make_sipp(org_list=config['org_list'], config=config)
    cmdline.run(options, logger=logger)
    sys.exit()
def run_pipeline():
    """Run the ruffus pipeline, validate the output and report rotations.

    Reads the module-level ``options``, ``work_folder``, ``_log``,
    ``_pdfinfo`` and ``_pdfinfo_lock``.

    Returns:
        ``ExitCode.ok`` on success, ``ExitCode.invalid_output_pdfa`` when
        ``validate_pdfa`` rejects the output, a specific code for the first
        recognised job exception inside a ``RethrownJobError``, and
        ``ExitCode.other_error`` for anything else.
    """
    if not options.jobs:
        options.jobs = available_cpu_count()

    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(e)

        # Yuck.  Hunt through the ruffus exception to find out what the
        # return code is supposed to be.
        # Ruffus flattens the exception to a string, throwing away all kinds
        # of helpful details
        #   task_name, job_name - ruffus status
        #   exc_name - class name of exception
        #   exc_value - irritating string that makes impossible to recover
        #               exception object
        #   exc_stack - string that contains traceback of exception
        for exc in e.args:
            task_name, job_name, exc_name, exc_value, exc_stack = exc
            if exc_name == 'builtins.SystemExit':
                # The regex captures the member name between the first '.'
                # and the following ')'.
                match = re.search(r"\.(.+?)\)", exc_value)
                if match is None:
                    # Unrecognised SystemExit text: report a generic error
                    # instead of failing on ``None.groups()``.
                    return ExitCode.other_error
                # Fall back to the ExitCode member, not to a bare string.
                return getattr(
                    ExitCode, match.group(1), ExitCode.other_error)
            elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
                _log.error(cleanup_ruffus_error_message(exc_value))
                return ExitCode.input_file
            elif exc_name == 'builtins.TypeError':
                # Even though repair_pdf will fail, ruffus will still try
                # to call split_pages with no input files, likely due to a bug
                if task_name == 'split_pages':
                    _log.error("Input file '{0}' is not a valid PDF".format(
                        options.input_file))
                    return ExitCode.input_file
            elif exc_name == 'subprocess.CalledProcessError':
                # It's up to the subprocess handler to report something useful
                msg = "Error occurred while running this command:"
                _log.error(msg + '\n' + exc_value)
                return ExitCode.child_process_error
            elif not options.verbose:
                _log.error(e)

        # Nothing recognised: always leave the handler with an error code.
        return ExitCode.other_error
    except Exception as e:
        _log.error(e)
        return ExitCode.other_error

    if not validate_pdfa(options.output_file, _log):
        _log.warning('Output file: The generated PDF/A file is INVALID')
        return ExitCode.invalid_output_pdfa

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                # 1-based page number followed by the compass letter
                orientations.append('{0}{1}'.format(
                    n + 1, direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' +
                      ' '.join(orientations))

    return ExitCode.ok
@follows(merge_hf_vcf)
@transform(
    (
        filtrate_low_qual,
        select_snp_variants,
        select_indel_variants,
        hardfilter_indel_variants,
        hardfilter_snp_variants,
    ),
    suffix(vcf_ext),
    vcf_ext+zeroed_ext,
)
def remove_intermediate_vcfs(in_vcf, out):
    """Zero an intermediate VCF, delete its '.idx' file, create the marker."""
    zero_file(in_vcf)
    index_path = in_vcf+'.idx'
    os.remove(index_path)
    # Create (or truncate) the empty output marker file.
    with open(out, 'w'):
        pass


@follows(merge_hf_vcf)
@transform(
    realign_indel,
    suffix(realignedbam_ext),
    realignedbam_ext+zeroed_ext,
)
def remove_realigned_bam(in_fn, out_fn):
    """Zero a realigned BAM, delete its companion file, create the marker."""
    zero_file(in_fn)
    # The companion file's name is the input name with its last character
    # replaced by 'i'.
    companion_path = in_fn[:-1]+'i'
    os.remove(companion_path)
    with open(out_fn, 'w'):
        pass


@follows(merge_hf_vcf)
@transform(
    get_recal_group,
    suffix(recal_ext),
    recal_ext+zeroed_ext,
)
def remove_read_group_file(in_fn, out_fn):
    """Zero the first input file and create the empty marker file."""
    first_input = in_fn[0]
    zero_file(first_input)
    with open(out_fn, 'w'):
        pass


options.history_file = '.gatk_exome_pipeline.ruffus_history.sqlite'
# NOTE(review): 'touch_file_only' looks like a misspelling of ruffus'
# 'touch_files_only' option and is presumably ignored as written -- confirm
# the intent before renaming, since enabling it would only touch outputs.
cmdline.run(
    options,
    gnu_make_maximal_rebuild_mode=True,
    checksum_level=1,
    touch_file_only=True,
)
def _version_tuple(version_text):
    """Turn a dotted version string such as '10.0.1' into a tuple of ints."""
    numbers = []
    for piece in version_text.split('.'):
        # Keep only the leading digits of each component ('2rc1' -> 2).
        leading = ''
        for ch in piece:
            if not ch.isdigit():
                break
            leading += ch
        numbers.append(int(leading) if leading else 0)
    return tuple(numbers)


def run_pipeline():
    """Parse arguments, build and run the ruffus pipeline, check the output.

    Returns:
        An exit code: ``ExitCode.ok`` on success, another ``ExitCode``
        member for the failures handled below, the value returned by
        ``traverse_ruffus_exception`` for a ruffus job error, or the
        ``exit_code`` carried by an ``ExitCodeException``.
    """
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    # Diagnostic dump of the parsed options.  The options object must be
    # stringified before concatenation, and the text goes to the debug log
    # rather than stdout, which may carry the output PDF.
    _log.debug("Inside of options is: " + str(options))
    preamble(_log)
    check_options(options, _log)

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    qpdf_version = qpdf.version()
    # Compare numerically: as plain strings '10.0.0' sorts before '7.0.0'.
    if (_version_tuple(qpdf_version) < (7, 0, 0)
            and not os.environ.get('PYTEST_CURRENT_TEST')):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs.  Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf_version))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        # All intermediate files, including the ruffus history database,
        # live in a fresh temporary folder.
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # Register removal of the working files for interpreter exit.
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        # A regular output file: check the PDF/A claim (when a pdfa output
        # type was requested) and then the file's validity.
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file,
                                options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
def _version_tuple(version):
    """Parse a dotted version string into a tuple of at least three ints.

    Only the leading ASCII digits of each dot-separated field are used and
    parsing stops at the first field that has none, so ``'8.0.2'`` becomes
    ``(8, 0, 2)``, ``'10.1'`` becomes ``(10, 1, 0)`` and ``'7.0.0-rc1'``
    becomes ``(7, 0, 0)``.
    """
    numbers = []
    for field in version.strip().split('.'):
        digits = ''
        for char in field:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)


def run_pipeline():
    """Parse the command line, run the ruffus pipeline and report the result.

    The function validates the options, stages the input file inside a fresh
    temporary work folder, builds and runs the pipeline, and finally checks
    the output file.

    Returns:
        ExitCode.ok on success, otherwise one of:
        ExitCode.bad_args (closed standard streams, or output set to a
        terminal), ExitCode.input_file (input not found),
        ExitCode.file_access_error (output not writable),
        ExitCode.invalid_output_pdf (output failed the PDF/A or qpdf check),
        ExitCode.other_error (unexpected exception), or whatever exit code is
        carried by an ExitCodeException / extracted from a ruffus
        RethrownJobError by traverse_ruffus_exception().
    """
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    # Query qpdf once and reuse the answer for logging and the version check.
    qpdf_version = qpdf.version()
    _log.debug('qpdf ' + qpdf_version)

    check_options(options, _log)

    # A limit of zero megapixels means "no limit", which PIL spells as None.
    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    # Compare numerically: as plain strings '10.0.0' sorts before '7.0.0'.
    if (_version_tuple(qpdf_version) < (7, 0, 0)
            and not os.environ.get('PYTEST_CURRENT_TEST')):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf_version))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(
            work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal. Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error(
                "Output file location (" + options.output_file + ") " +
                "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # NOTE(review): cleanup is only registered here, so the early returns
        # above leave the freshly created work folder behind - confirm that
        # this is intended.
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        # Log text only, for the marshalling reason given above.
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
# parser.add_argument('--pipeline', "-p", # type=str, # choices = ['sim_pure', 'sim_contam','real_pure','test_pipe'], # help="Defining which pipeline to run") parser.add_argument('--config_file', "-cf", type=str, #metavar="config_file", help="yaml file with pipeline parameters") options = parser.parse_args() ## standard python logger which can be synchronised across concurrent Ruffus tasks ## define logging output with --log_file log_file_name logger, logger_mutex = cmdline.setup_logging (__name__, options.log_file, options.verbose) # if we are printing only if not options.just_print and \ not options.flowchart and \ not options.touch_files_only: config_file= file(options.config_file, 'r') config = yaml.load(config_file) pipeline1a = make_sipp(org_list = config['org_list'], config = config) cmdline.run (options, logger = logger) sys.exit()
continue genome = line.rstrip().split("\t")[2] scores[genome] += 1 sorted_scores = sorted(scores.items(), reverse=True, key=operator.itemgetter(1)) file_root = sam_file.replace(".sorted.sam", "") fastq_file = file_root + ".fastq" bam_file = file_root + ".bam" sbam_file = file_root + ".sorted.bam" lines = wcl(fastq_file) num_lines = int(lines.split()[0]) num_reads = num_lines / 4 with open(data_file, "w+") as fh: fh.write("### DATA REPORT FOR {} ###\n".format()) fh.write("fastq lines: \n{}\n".format(lines)) fh.write("reads: {}\n".format(str(num_reads))) fh.write("\n### Genome Hit Data ###") for genome,score in sorted_scores: fh.write("{}\t{}\n".format(genome, str(score))) os.unlink(fastq_file) os.unlink(bam_file) os.unlink(sbam_file) # run the pipelined cmdline.run(options)
def run_pipeline():
    """Run the ruffus pipeline on the module-level options and check output.

    Relies on the module-level ``options``, ``work_folder``, ``_log``,
    ``_pdfinfo`` and ``_pdfinfo_lock`` objects. The input file (or standard
    input when the input is '-') is staged in the work folder as 'origin'
    before the pipeline is started.

    Returns:
        ExitCode.ok on success, ExitCode.input_file when the input file does
        not exist, ExitCode.invalid_output_pdf when the output fails the
        PDF/A or qpdf check, ExitCode.other_error for unexpected exceptions,
        or the exit code extracted from a ruffus RethrownJobError by
        traverse_ruffus_exception().
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    global options

    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        options.history_file = os.path.join(
            work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except Exception as e:
        # Log text only, for the marshalling reason given above.
        _log.error(str(e))
        return ExitCode.other_error

    if options.output_type == 'pdfa':
        pdfa_info = file_claims_pdfa(options.output_file)
        if pdfa_info['pass']:
            msg = 'Output file is a {} (as expected)'
            _log.info(msg.format(pdfa_info['conformance']))
        else:
            msg = 'Output file was generated but is not PDF/A (seems to be {})'
            _log.warning(msg.format(pdfa_info['conformance']))
            return ExitCode.invalid_output_pdf

    if not qpdf.check(options.output_file, _log):
        _log.warning('Output file: The generated PDF is INVALID')
        return ExitCode.invalid_output_pdf

    with _pdfinfo_lock:
        _log.debug(_pdfinfo)
        # Map each non-zero rotation angle to a compass letter and report
        # pages as "<1-based page number><letter>".
        direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
        orientations = []
        for n, page in enumerate(_pdfinfo):
            angle = _pdfinfo[n].get('rotated', 0)
            if angle != 0:
                orientations.append('{0}{1}'.format(
                    n + 1,
                    direction.get(angle, '')))
        if orientations:
            _log.info('Page orientations detected: ' + ' '.join(orientations))

    return ExitCode.ok
def _version_tuple(version):
    """Parse a dotted version string into a tuple of at least three ints.

    Only the leading ASCII digits of each dot-separated field are used and
    parsing stops at the first field that has none, so ``'8.0.2'`` becomes
    ``(8, 0, 2)``, ``'10.1'`` becomes ``(10, 1, 0)`` and ``'7.0.0-rc1'``
    becomes ``(7, 0, 0)``.
    """
    numbers = []
    for field in version.strip().split('.'):
        digits = ''
        for char in field:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)


def run_pipeline():
    """Parse the command line, run the ruffus pipeline and report the result.

    The function validates the options, stages the input file inside a fresh
    temporary work folder, builds and runs the pipeline, and finally checks
    the output file.

    Returns:
        ExitCode.ok on success, otherwise one of:
        ExitCode.bad_args (closed standard streams, or output set to a
        terminal), ExitCode.input_file (input not found),
        ExitCode.file_access_error (output not writable),
        ExitCode.invalid_output_pdf (output failed the PDF/A or qpdf check),
        ExitCode.other_error (unexpected exception), or whatever exit code is
        carried by an ExitCodeException / extracted from a ruffus
        RethrownJobError by traverse_ruffus_exception().
    """
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args)
    _log.debug('ocrmypdf ' + VERSION)
    _log.debug('tesseract ' + tesseract.version())
    # Query qpdf once and reuse the answer for logging and the version check.
    qpdf_version = qpdf.version()
    _log.debug('qpdf ' + qpdf_version)

    check_options(options, _log)

    # A limit of zero megapixels means "no limit", which PIL spells as None.
    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1000000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None

    # Complain about qpdf version < 7.0.0
    # Suppress the warning if in the test suite, since there are no PPAs
    # for qpdf 7.0.0 for Ubuntu trusty (i.e. Travis)
    # Compare numerically: as plain strings '10.0.0' sorts before '7.0.0'.
    if (_version_tuple(qpdf_version) < (7, 0, 0)
            and not os.environ.get('PYTEST_CURRENT_TEST')):
        complain(
            "You are using qpdf version {0} which has known issues including "
            "security vulnerabilities with certain malformed PDFs. Consider "
            "upgrading to version 7.0.0 or newer.".format(qpdf_version))

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder,
                                            'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        if options.input_file == '-':
            # stdin
            _log.info('reading file from standard input')
            with open(start_input_file, 'wb') as stream_buffer:
                from shutil import copyfileobj
                copyfileobj(sys.stdin.buffer, stream_buffer)
        else:
            try:
                re_symlink(options.input_file, start_input_file, _log)
            except FileNotFoundError:
                _log.error("File not found - " + options.input_file)
                return ExitCode.input_file

        if options.output_file == '-':
            if sys.stdout.isatty():
                _log.error(
                    textwrap.dedent("""\
                    Output was set to stdout '-' but it looks like stdout
                    is connected to a terminal. Please redirect stdout to a
                    file."""))
                return ExitCode.bad_args
        elif not is_file_writable(options.output_file):
            _log.error("Output file location (" + options.output_file + ") " +
                       "is not a writable file.")
            return ExitCode.file_access_error

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # NOTE(review): cleanup is only registered here, so the early returns
        # above leave the freshly created work folder behind - confirm that
        # this is intended.
        atexit.register(cleanup_working_files, work_folder, options)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to

        # Ruffus flattens exception to 5 element tuples. Because of a bug
        # in <= 2.6.3 it may present either the single:
        #   (task, job, exc, value, stack)
        # or something like:
        #   [[(task, job, exc, value, stack)]]
        #
        # Generally cross-process exception marshalling doesn't work well
        # and ruffus doesn't support because BaseException has its own
        # implementation of __reduce__ that attempts to reconstruct the
        # exception based on e.__init__(e.args).
        #
        # Attempting to log the exception directly marshalls it to the logger
        # which is probably in another process, so it's better to log only
        # data from the exception at this point.

        exitcode = traverse_ruffus_exception(e.args, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        # Log text only, for the marshalling reason given above.
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info("Flowchart saved to {}".format(options.flowchart))
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = 'Output file is a {} (as expected)'
                _log.info(msg.format(pdfa_info['conformance']))
            else:
                msg = 'Output file is okay but is not PDF/A (seems to be {})'
                _log.warning(msg.format(pdfa_info['conformance']))
                return ExitCode.invalid_output_pdf
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
def f_alogFamily(inputFiles, outputFiles):
    """Marker task: does no work of its own, only touches ``outputFiles``."""
    touch(outputFiles)


#---------------------------------------------------------------
# homeobox figure
#
@merge(homeobox_R, "ruffus/figure.f_hb")
def f_hb(inputFiles, outputFiles):
    """Marker task: does no work of its own, only touches ``outputFiles``."""
    touch(outputFiles)


#---------------------------------------------------------------
# data s1
#
@merge([calculateTpm_R, downloadGenomes_sh], "ruffus/datas1")
def datas1_R(inputFiles, outputFiles):
    """Submit the 'src/R/datas1.R' job script, then touch ``outputFiles``.

    The script is handed to submit_job() with one task and one CPU per task
    (both passed as strings) under the job name 'datas1_R'.
    """
    label = 'datas1_R'
    submitted_id = submit_job('src/R/datas1.R', '1', '1', label)

    # update ruffus flag
    stamp = print_now()
    report = "".join([": Job ", label, " run with JobID ", submitted_id, " ]"])
    print("[", stamp, report)
    touch(outputFiles)


# options for visualising
pipeline_printout()
pipeline_printout_graph(
    "".join(("ruffus/flowchart.", slurm_jobid, ".pdf")),
    "pdf")

# run the pipeline
# NOTE(review): the original comment said "(disabled for now)", but this call
# is live - confirm which one is intended.
cmdline.run(options, multithread=8)
hardfilter_indel_variants, hardfilter_snp_variants), suffix(vcf_ext), vcf_ext + zeroed_ext) def remove_intermediate_vcfs(in_vcf, out): zero_file(in_vcf) os.remove(in_vcf + '.idx') open(out, 'w').close() @follows(merge_hf_vcf) @transform(realign_indel, suffix(realignedbam_ext), realignedbam_ext + zeroed_ext) def remove_realigned_bam(in_fn, out_fn): zero_file(in_fn) os.remove(in_fn[:-1] + 'i') open(out_fn, 'w').close() @follows(merge_hf_vcf) @transform(get_recal_group, suffix(recal_ext), recal_ext + zeroed_ext) def remove_read_group_file(in_fn, out_fn): zero_file(in_fn[0]) open(out_fn, 'w').close() options.history_file = '.gatk_exome_pipeline.ruffus_history.sqlite' cmdline.run(options, gnu_make_maximal_rebuild_mode=True, checksum_level=1, touch_file_only=True)