def ruffus_main(options, args):
    'Main entry point for ruffus pipelines'
    if options.just_print:
        pipeline_printout(sys.stdout, options.target_tasks,
                          options.forced_tasks, verbose=options.verbose)
    elif options.flowchart:
        pipeline_printout_graph(
            open(options.flowchart, "w"),
            # use flowchart file name extension to decide flowchart format
            # e.g. svg, jpg etc.
            os.path.splitext(options.flowchart)[1][1:],
            options.target_tasks,
            options.forced_tasks,
            no_key_legend=not options.key_legend_in_graph)
    else:
        pipeline_run(options.target_tasks,
                     options.forced_tasks,
                     multiprocess=options.jobs,
                     logger=main_logger,
                     verbose=options.verbose,
                     touch_files_only=options.touch_only)
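The options object read by ruffus_main comes from an optparse-style parser defined elsewhere in the original script. A minimal sketch of the wiring, with every option name inferred from the attributes the function reads (none of these names or defaults are confirmed by the source, and main_logger is an assumed module-level logger):

import sys
import os
import logging
from optparse import OptionParser
from ruffus import pipeline_printout, pipeline_printout_graph, pipeline_run

main_logger = logging.getLogger("ruffus_pipeline")  # assumed global

if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("--just-print", dest="just_print", action="store_true",
                      default=False, help="print the pipeline, do not run it")
    parser.add_option("--flowchart", dest="flowchart", default=None,
                      help="write a flowchart to this file")
    parser.add_option("--key-legend-in-graph", dest="key_legend_in_graph",
                      action="store_true", default=False)
    parser.add_option("--jobs", dest="jobs", type="int", default=1)
    parser.add_option("--verbose", dest="verbose", type="int", default=1)
    parser.add_option("--touch-only", dest="touch_only", action="store_true",
                      default=False)
    options, args = parser.parse_args()
    options.target_tasks = args   # remaining arguments name the target tasks
    options.forced_tasks = []     # assumed empty unless set by another option
    ruffus_main(options, args)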
def test_graphviz_dot(self):
    """Make sure annotations from graphviz appear in dot
    """
    if sys.hexversion >= 0x03000000:
        # everything is unicode in python3
        s = BytesIO()
    else:
        s = StringIO()

    pipeline_printout_graph(
        s,
        # use flowchart file name extension to decide flowchart format
        # e.g. svg, jpg etc.
        "dot",
        [Final_target, Up_to_date_final_target],
        pipeline="main")
    self.assertTrue('[URL="http://cnn.com", color="#FF0000", fillcolor="#FFCCCC", fontcolor="#4B6000", height=1.5, label=<What is this?<BR/> What <FONT COLOR="red">is</FONT>this???>, pencolor="#FF0000", peripheries=5, shape=component, style=dashed]' in s.getvalue().decode())
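The attribute string being asserted is what ruffus's @graphviz decorator emits for a task node in dot output. As a sketch of how Final_target might plausibly be decorated to produce it (attribute values are copied from the assertion; the exact decoration in the original test suite may differ):

from ruffus import graphviz, originate

@graphviz(URL='"http://cnn.com"', fillcolor='"#FFCCCC"', color='"#FF0000"',
          pencolor='"#FF0000"', fontcolor='"#4B6000"',
          label_prefix='What is this?<BR/> ',
          label='<What <FONT COLOR="red">is</FONT>this>',
          label_suffix='???',
          shape='component', height=1.5, peripheries=5, style='dashed')
@originate('final.target')
def Final_target(output_file):
    # touch the output so the task counts as complete
    open(output_file, 'w').close()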
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """
    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommitted changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set parameter values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums "
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run "
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser, add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = \
            options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = \
            options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()
                elif not options.without_cluster and not HAS_DRMAA:
                    E.critical("DRMAA API not found so cannot talk to a cluster.")
                    E.critical("Please use --local to run the pipeline"
                               " on this host: {}".format(os.uname()[1]))
                    sys.exit(-1)

                #
                # make sure we are not logging at the same time in
                # different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobJerror
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError(
                    "pipeline failed with %i errors" % len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
def main():
    # prepare the ruffus pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(description='UV-B analysis pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    # need a dictionary of species to genome URL and species to gff.
    # supply this in a text file
    fasta_urls = {}
    annotation_urls = {}
    with open('data/genomeUrls.txt') as tsv:
        genome_urls = csv.reader(tsv, delimiter='\t')
        next(genome_urls, None)
        for row in genome_urls:
            fasta_urls[row[0]] = row[1]
            annotation_urls[row[0]] = row[2]

    # iterate over fasta_urls keys to run jobs
    for species in fasta_urls.keys():
        # call download script
        main_pipeline.originate(
            name=species + "_genome",
            task_func=download_genome,
            output="data/genome/" + species + "/METADATA.csv",
            extras=[species, fasta_urls[species], annotation_urls[species],
                    jgi_logon, jgi_password])

        # generate a star genome for each species
        main_pipeline.transform(
            name=species + "_index",
            task_func=generate_index,
            input=ruffus.output_from(species + "_genome"),
            filter=ruffus.regex(r"data/genome/(.*)/METADATA.csv"),
            output=r"output/\1/star-index/METADATA.csv",
            extras=[r"\1"])

        # define the reads
        main_pipeline.originate(
            name=species + "_reads",
            task_func=define_reads,
            output="ruffus/" + species + "_reads",
            extras=[species])

        # first mapping step
        main_pipeline.collate(
            name=species + "_mapped_reads",
            task_func=star,
            input=[[ruffus.output_from(species + "_reads"),
                    ruffus.output_from(species + "_index")]],
            filter=ruffus.formatter(),
            output=["output/{subdir[1][1]}/star/METADATA.csv"],
            extras=["{subdir[1][1]}"])

    # FOR LOOP ENDS

    # parse the mapping stats
    mapping_stats = main_pipeline.merge(
        task_func=parse_star_stats_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        output="output/mapping_stats/SessionInfo.txt")

    # generate plots for mapping
    mapping_plots = main_pipeline.transform(
        task_func=plot_reads_in_genes_R,
        input=mapping_stats,
        filter=ruffus.formatter(),
        output="{subpath[0][0]}/Figure S1.pdf")

    # use generator in the input field to collate the previous results
    deseq_results = main_pipeline.transform(
        task_func=deseq2_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        filter=ruffus.formatter(),
        output=[r"output/{subdir[0][1]}/deseq2/SessionInfo.txt"],
        extras=[r"{subdir[0][1]}"])

    # combine the deseq results
    de_lists = main_pipeline.merge(
        task_func=list_de_genes_R,
        input=deseq_results,
        output="output/merged/deseq2/SessionInfo.de_genes.txt")

    # run clustering
    mfuzz_results = main_pipeline.transform(
        task_func=mfuzz_R,
        input=deseq_results,
        filter=ruffus.formatter(),
        output='output/{subdir[0][1]}/mfuzz/SessionInfo.mfuzz.txt',
        extras=['{subdir[0][1]}'])

    # combine mfuzz_results
    mfuzz_plot = main_pipeline.merge(
        task_func=combine_mfuzz_results_R,
        input=mfuzz_results,
        output='output/merged/mfuzz/SessionInfo.mfuzz.txt')

    # compare flavonoid synthesis genes
    flavonoid_genes = main_pipeline.transform(
        task_func=compare_saito_genes_R,
        input=de_lists,
        filter=ruffus.formatter(),
        output='{path[0]}/SessionInfo.flavonoid_synthesis.txt')

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="UV-B analysis pipeline")
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    pipeline: object
        pipeline to run. If not given, all ruffus pipelines are run.

    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted while its contents should
    # be cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and not HAS_DRMAA and
                            not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for
                        # pipeline controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of
                        # multiprocessing.
                        opts = {"multiprocess": options.multiprocess,
                                "pool_manager": "gevent"}

                    # create the session proxy
                    start_session()

                    logger.info("current directory is {}".format(os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:
            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobJerror
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
def test_ruffus(self):
    print(" Run pipeline normally...")
    if self.graph_viz_present:
        pipeline_printout_graph(tempdir + "flowchart.dot",
                                pipeline="main")
        pipeline_printout_graph(tempdir + "flowchart.jpg",
                                target_tasks=[subdivide_start],
                                forcedtorun_tasks=[split_start],
                                no_key_legend=True)
        pipeline_printout_graph(tempdir + "flowchart.svg",
                                no_key_legend=False,
                                pipeline="main")
        # Unknown format
        try:
            pipeline_printout_graph(tempdir + "flowchart.unknown",
                                    no_key_legend=False,
                                    pipeline="main")
            raise Exception("Failed to throw exception for "
                            "pipeline_printout_graph unknown extension ")
        except CalledProcessError as err:
            pass
        pipeline_printout_graph(tempdir + "flowchart.unknown", "svg",
                                no_key_legend=False,
                                pipeline="main")
    else:
        pipeline_printout_graph(tempdir + "flowchart.dot",
                                target_tasks=[subdivide_start],
                                forcedtorun_tasks=[split_start],
                                no_key_legend=True,
                                pipeline="main")
def main():

    #########
    # SETUP #
    #########

    # test function for checking input/output passed to job_script and parsing
    # by src/sh/io_parser
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test',
        verbose=True)

    # parse email etc. here?
    parser = ruffus.cmdline.get_argparse(
        description='ASW genome assembly pipeline.')
    parser.add_argument('--blast-db-folder',
                        help='Path to BLAST db folder',
                        type=str,
                        dest='blast_db_folder')
    # parser.add_argument('--email', '-e',
    #                     help='Logon email address for JGI',
    #                     type=str,
    #                     dest='jgi_logon')
    # parser.add_argument('--password', '-p',
    #                     help='JGI password',
    #                     type=str,
    #                     dest='jgi_password')
    options = parser.parse_args()
    # jgi_logon = options.jgi_logon
    # jgi_password = options.jgi_password
    if options.blast_db_folder:
        os.environ['BLASTDB'] = options.blast_db_folder

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # find fastq.gz files
    dir_listing = [x[0] for x in os.walk(top='data', followlinks=True)]
    fastq_file_list = []
    for directory in dir_listing:
        file_list = os.scandir(directory)
        fastq_file_list.append([x.path for x in file_list
                                if (x.name.endswith('fastq.gz')
                                    or x.name.endswith('.fastq'))
                                and x.is_file()])

    fastq_files = list(tompytools.flatten_list(fastq_file_list))

    # extract only MH gDNA fastq data, i.e.
    # 2125-06-11-1 = MH PE
    # 2125-06-06-1 = MH MP
    active_fq_files = [x for x in fastq_files
                       if ('2125-06-11-1' in x
                           or '2125-06-06-1' in x)]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(
        name='raw_fq_files',
        task_func=os.path.isfile,
        output=active_fq_files)

    # merge libraries
    merged_fq_files = main_pipeline.collate(
        name='merged_fq_files',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/merge_fq',
            job_name='merge_fq'),
        input=raw_fq_files,
        filter=ruffus.formatter(
            r'data/NZGL02125/.*/'
            r'[^-]+-(?P<LIB>[^_]+).+_R(?P<RN>\d)_.*.fastq.gz'),
        output=[r'output/fq_merged/{LIB[0]}_R{RN[0]}_merged.fastq.gz'])

    # make pairs and send to cutadapt for trimming external adaptors
    trim_cutadapt = main_pipeline.collate(
        name='trim_cutadapt',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/cutadapt_pe',
            job_name='cutadapt'),
        input=merged_fq_files,
        filter=ruffus.formatter(
            r'.+/(?P<LIB>[^_]+)_R(?P<RN>\d)_merged.fastq.gz'),
        output=[['output/cutadapt/{LIB[0]}_R1_trimmed.fastq.gz',
                 'output/cutadapt/{LIB[0]}_R2_trimmed.fastq.gz']])

    # send trimmed reads to splitnextera
    mp_splitnextera = main_pipeline.subdivide(
        name='splitnextera',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/splitnextera',
            job_name='splitnextera'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-06-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=['output/splitnextera/2125-06-06-1.pe.fastq.gz',
                'output/splitnextera/2125-06-06-1.se.fastq.gz',
                'output/splitnextera/2125-06-06-1.mp.fastq.gz',
                'output/splitnextera/2125-06-06-1.unknown.fastq.gz'])

    # decontaminate PhiX (other?) sequences
    decon_mp = main_pipeline.transform(
        name='decon_mp',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_mp'),
        input=mp_splitnextera,
        filter=ruffus.formatter(
            r'.+/2125-06-06-1\.(?P<VL>[^.]+)\.fastq.gz'),
        output=['output/decon/2125-06-06-1_{VL[0]}.fastq.gz'])

    decon_pe = main_pipeline.transform(
        name='decon_pe',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_pe'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-11-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=[r'output/decon/2125-06-11-1.fastq.gz'])

    decon = [decon_mp, decon_pe]

    # digital normalisation and error correction w/ bbnorm
    bbnorm = main_pipeline.subdivide(
        name='bbnorm',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/bbnorm',
            job_name='bbnorm',
            mem_per_cpu=7000,
            cpus_per_task=8),
        input=decon,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/bbnorm/{LN[0]}{VL[0]}.fastq.gz'])

    # download NCBI databases for taxonomy data
    download_taxonomy_databases = main_pipeline.originate(
        name='download_taxonomy_databases',
        task_func=tompltools.generate_job_function(
            job_script='src/r/download_taxonomy_databases.R',
            job_name='download_taxonomy_databases',
            job_type='originate'),
        output=[['data/ncbi/nucl_gb.accession2taxid.Rds',
                 'data/ncbi/nodes.dmp.Rds',
                 'data/ncbi/names.dmp.Rds']])

    # subsample reads, blast with biopython and parse results
    fq_subsample = main_pipeline.subdivide(
        name='fq_subsample',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fq_subsample',
            job_name='fq_subsample'),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/blastqc/{LN[0]}{VL[0]}_R1.fastq.gz',
                r'output/blastqc/{LN[0]}{VL[0]}_R2.fastq.gz'])

    blast_reads = main_pipeline.transform(
        name='blast_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/py/blast_reads.py',
            job_name='blast_reads',
            cpus_per_task=4),
        input=fq_subsample,
        filter=ruffus.suffix('.fastq.gz'),
        output=['.xml'])

    parse_blast_results = main_pipeline.transform(
        name='parse_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/py/parse_blast_results.py',
            job_name='parse_blast_results'),
        input=blast_reads,
        filter=ruffus.suffix('.xml'),
        output=['.table'])

    main_pipeline.merge(
        name='plot_blast_resuts',
        task_func=tompltools.generate_job_function(
            job_script='src/r/extract_blast_hits_per_taxid.R',
            job_name='plot_blast_resuts'),
        input=[parse_blast_results, download_taxonomy_databases],
        output=['output/blastqc/plots.pdf'])

    # trim reads to 100 bp for edena?
    clip_to_100b = main_pipeline.subdivide(
        name='clip_to_100b',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/clip_to_100b',
            job_name='clip_to_100b'),
        input=bbnorm,
        # filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        filter=ruffus.regex(r'.+/2125-06-11-1.fastq.gz'),
        output=[r'output/trunc_100/2125-06-11-1_R1.fastq.gz',
                r'output/trunc_100/2125-06-11-1_R2.fastq.gz'])

    # print raw and normalised kmer distribution plots
    main_pipeline.merge(
        name='kmer_distribution_plots',
        task_func=tompltools.generate_job_function(
            job_script='src/r/kmer_distribution_plots.R',
            job_name='kmer_distribution_plots'),
        input=bbnorm,
        output=['output/bbnorm/plots.pdf',
                'output/bbnorm/plot_data.Rds'])

    # run fastqc on decontaminated and normalised libraries
    main_pipeline.transform(
        name='fastqc',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fastqc',
            job_name='fastqc',
            cpus_per_task=1),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/fastqc/{LN[0]}{VL[0]}_fastqc.html'])

    # overlap step with edena
    # edena_overlaps = main_pipeline.collate(
    #     name='edena_overlaps',
    #     task_func=tompltools.generate_job_function(
    #         job_script='src/sh/edena_overlaps',
    #         job_name='edena_overlaps'),
    #     input=clip_to_100b,
    #     filter=ruffus.formatter(r'.+/(?P<LN>[^_]+)_R\d.fastq.gz'),
    #     output=[r'output/edena/{LN[0]}.ovc'])

    # prepare files with velveth
    # set threads for velvet to 1 !!!
    min_kmer = 71
    max_kmer = 87
    step = 8
    kmer_lengths = [x for x in range(min_kmer, max_kmer + 1, step)]
    velveth_output = list(
        tompytools.flatten_list(
            [('output/velveth_' + str(x) + '/Sequences')
             for x in kmer_lengths]))
    # velveth = main_pipeline.merge(
    #     name='hash_files',
    #     task_func=test_job_function,
    #     # task_func=tompltools.generate_job_function(
    #     #     job_script='src/sh/hash_files',
    #     #     job_name='hash_files'),
    #     input=bbnorm,
    #     output=velveth_output)

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="ASW genome assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
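tompltools.generate_job_function is a project-specific factory whose implementation is not shown in this excerpt. As a rough, hypothetical sketch of the shape such a factory must have for ruffus (a named callable that receives the input and output file names and invokes the job script), under the assumption that the script takes the file names as arguments:

import subprocess

def generate_job_function(job_script, job_name, verbose=False, **resources):
    # hypothetical sketch only, not the real tompltools helper: the real
    # one presumably also uses the resource keywords (cpus_per_task,
    # mem_per_cpu, job_type) to submit to a scheduler
    def job_function(input_files, output_files, *extras):
        cmd = [job_script, str(input_files), str(output_files)]
        if verbose:
            print('[%s] %s' % (job_name, ' '.join(cmd)))
        subprocess.check_call(cmd)
    # ruffus uses __name__ to label the task in printouts and flowcharts
    job_function.__name__ = job_name
    return job_function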
def run_pipeline():
    # ==========================================================================
    # Config command line parser
    # ==========================================================================
    parser = OptionParser('nextgen_pipeline [option]... pipeline...')
    parser.add_option('-s', '--stage', dest='stage',
                      help='stage of pipeline to run')
    parser.add_option('-t', '--threads', action='store', type='int',
                      dest='threads', default=None,
                      help='number of threads to use')
    parser.add_option('-j', '--jobs', action='store', type='int',
                      dest='jobs', default=None,
                      help='number of jobs to use')
    parser.add_option('-f', '--force', action='store_true', dest='force_run',
                      default=False, help='Force pipeline to run stage')
    parser.add_option('-c', '--config', dest='config', default='pipeline.cfg',
                      help='config file, defaults to pipeline.cfg')
    parser.add_option('-l', '--log', dest='log', default=None,
                      help='path to log file')
    parser.add_option('--graph', dest='print_graph', default=False,
                      action='store_true',
                      help='Print a graph of the pipeline rather than running it')
    (options, args) = parser.parse_args()

    if len(args) < 1:
        args = pipelines

    # Configuration parsing
    if not os.path.isfile(options.config):
        parser.error('Could not find config file: %s' % options.config)
    config = SafeConfigParser()
    config.read(options.config)
    for section in config.sections():
        for option in config.options(section):
            CMD_DICT[option] = config.get(section, option)

    # get number of cpus on machine
    ncpus = multiprocessing.cpu_count()

    # load the pipeline(s) requested by the user
    pipeline_stages = {}
    for arg in args:
        try:
            pipeline = __import__(PIPELINE_PATH + arg, globals(), locals(),
                                  ['*'])
        except (ImportError, TypeError):
            # either no pipeline was requested or a missing/non-existent
            # pipeline was chosen
            print(parser.usage + '\n')
            show_pipeline_help(pipelines)
        pipeline_stages.update({arg: pipeline.stages_dict})

        # did the user specify a stage
        if options.stage:
            if options.stage not in pipeline_stages[arg].keys():
                # missing or non-existent stage chosen
                show_pipeline_stage_help()
            else:
                start_stage = pipeline_stages[arg][options.stage]
        else:
            # user did not specify a stage, use default
            start_stage = pipeline_stages[arg]['default']

        # user specified job count, capped at the number of cpus
        if options.jobs:
            NUM_JOBS = options.jobs if options.jobs <= ncpus else ncpus
        else:
            NUM_JOBS = config.getint('Threading', 'jobs') \
                if config.has_option('Threading', 'jobs') else ncpus // 2

        # user specified log file
        if options.log:
            (head, _) = os.path.split(options.log)
            if os.path.exists(head):
                log_fn = options.log
            else:
                print("Unable to write to that log file.")
                sys.exit(1)
        else:
            cwd = os.getcwd()
            now = datetime.datetime.now()
            ts = now.strftime('%Y-%m-%d_%H%M%S')
            log_fn = '%s/%s.%s.log' % (cwd, os.path.split(sys.argv[0])[1], ts)
        logger = quick_start_log(log_fn=log_fn)

        logger.debug('pipeline_run: %d jobs' % NUM_JOBS)

        # user said to force running of stage
        if options.print_graph:
            pipeline_printout_graph(
                'pipeline.jpg', 'jpg', [start_stage],
                forcedtorun_tasks=([start_stage] if options.force_run else []))
        else:
            pipeline_run(
                [start_stage],
                forcedtorun_tasks=([start_stage] if options.force_run else []),
                multiprocess=NUM_JOBS,
                logger=logger)
def main(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    options: object
        Container for command line arguments.
    args : list
        List of command line arguments.
    pipeline: object
        Pipeline to run. If not given, all ruffus pipelines are run.

    """
    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    logger = logging.getLogger("daisy.pipeline")

    logger.info("started in workingdir: {}".format(PARAMS.get("workingdir")))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(PARAMS, options)

    version = get_version()

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted while its contents should
    # be cleaned up.
    if not os.path.exists(PARAMS["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                PARAMS["tmpdir"]))
        try:
            os.makedirs(PARAMS["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(PARAMS["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            logger.info("\t".join(map(str, requirement)))
        logger.info("version check summary: %s" % str(counter))
        E.stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for
                        # pipeline controller.
                        ruffus.task.Pool = ThreadPool
                    else:
                        # use cooperative multitasking instead of
                        # multiprocessing.
                        ruffus.task.Pool = EventPool
                        ruffus.task.queue = gevent.queue

                    # create the session proxy
                    start_session()

                    logger.info("code location: {}".format(
                        PARAMS["scriptsdir"]))
                    logger.info("code version: {}".format(version))
                    logger.info("working directory is: {}".format(
                        PARAMS["workingdir"]))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        multiprocess=options.multiprocess,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                    )

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:
            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobJerror
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(PARAMS)) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        print_config_files()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
if len(sys.argv) < 2:
    print('Usage: python graphslam_pipeline.py print,graph,run (task1,task2)')
    sys.exit(1)

TORUN = []
if len(sys.argv) == 3:
    TORUN = sys.argv[2].split(',')
CMDS = sys.argv[1].split(',')

tasks = {
    'print': lambda: pipeline_printout(sys.stdout, TORUN,
                                       forcedtorun_tasks=[], verbose=5),
    'graph': lambda: pipeline_printout_graph('graph.jpg', 'jpg', TORUN,
                                             forcedtorun_tasks=[],
                                             no_key_legend=False),
    'run': lambda: pipeline_run(TORUN, multiprocess=NUM_CPUS,
                                one_second_per_job=False),
    'force': lambda: pipeline_run([], forcedtorun_tasks=TORUN,
                                  multiprocess=NUM_CPUS,
                                  one_second_per_job=False),
    'printf': lambda: pipeline_printout(sys.stdout, [],
                                        forcedtorun_tasks=TORUN,
                                        verbose=2),
    'clean': clean,
    'clean_pipelines': clean_pipelines,
}
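The table above maps command names to thunks, but the excerpt stops before the dispatch itself. A minimal sketch of the loop that would execute the comma-separated commands (this loop is assumed, not shown in the original):

for cmd in CMDS:
    if cmd in tasks:
        tasks[cmd]()
    else:
        print('Unknown command: %s' % cmd)
        sys.exit(1)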
def main():
    args = get_cmdline_args()

    # We want to look for modules in the directory local to the pipeline,
    # just as if the pipeline script had been called directly.
    # This includes the script itself and the config files imported by
    # getOptions
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # options must be set before pipeline is imported
    options = getOptions(args)
    setOptions(options)

    # import the pipeline so its stages are defined
    # the name of the pipeline is given on the command line
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()
    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']

    if pipelineOptions['rebuild'] == 'fromstart':
        rebuildMode = True
    elif pipelineOptions['rebuild'] == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True

    if style in ['run', 'touchfiles']:
        touchfiles_flag = (style == 'touchfiles')
        # Perform the pipeline steps (run the pipeline).
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Force the pipeline to start from here, regardless of whether
            # the stage is up-to-date or not.
            forcedtorun_tasks=forcedTasks,
            # If the style was touchfiles, we will set a flag to bring
            # files up to date without running anything
            touch_files_only=touchfiles_flag,
            # Choose the mode in which ruffus decides how much work needs
            # to be done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph('flowchart.svg', 'svg', endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Print a textual description of what the pipeline would do,
        # but don't actually run it.
        pipeline_printout(sys.stdout, endTasks, verbose=5, wrap_width=100000,
                          forcedtorun_tasks=forcedTasks,
                          gnu_make_maximal_rebuild_mode=rebuildMode)
def main():

    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # test function for checking input/output passed to job_script and
    # parsing by io_parser
    test_job_function = functions.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test')

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # bamfiles
    raw_files = [x.path for x in os.scandir('data/bam')
                 if x.name.endswith('.bam') and x.is_file()]

    # subset the files while the pipeline is in development. Make this
    # equal to raw_files to run the whole pipeline.
    # active_raw_files = [x for x in raw_files if
    #                     'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x]
    active_raw_files = raw_files

    # species short names for vcf splitting
    species_short_names = list(set(
        [os.path.basename(x)[0] for x in active_raw_files]))

    # check that the files exist
    mapped_raw = main_pipeline.originate(
        name='mapped_raw',
        task_func=os.path.isfile,
        output=active_raw_files)

    # genome fasta
    ref_fa = main_pipeline.originate(
        name='ref_fa',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='ref_fa',
            job_type='download'),
        output='data/genome/Osativa_323_v7.0.fa',
        extras=[jgi_logon, jgi_password])

    # indexes
    fa_idx = main_pipeline.transform(
        name='fa_idx',
        task_func=functions.generate_job_function(
            job_script='src/sh/fa_idx',
            job_name='fa_idx',
            job_type='transform',
            cpus_per_task=6),
        input=ref_fa,
        filter=ruffus.suffix(".fa"),
        output=['.dict', '.fa.fai'])

    # annotation
    annot = main_pipeline.originate(
        name='annot',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='annot',
            job_type='download'),
        output=('data/genome/'
                'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'),
        extras=[jgi_logon, jgi_password])

    # convert annotation to .bed
    annot_bed = main_pipeline.transform(
        name='annot_bed',
        task_func=functions.generate_job_function(
            job_script='src/sh/annot_bed',
            job_name='annot_bed',
            job_type='transform',
            cpus_per_task=7),
        input=annot,
        filter=ruffus.suffix('.gtf'),
        output='.bed')

    # mark duplicates with picard
    deduped = main_pipeline.transform(
        name='dedupe',
        task_func=functions.generate_job_function(
            job_script='src/sh/mark_duplicates_and_sort',
            job_name='dedupe',
            job_type='transform',
            cpus_per_task=2),
        input=mapped_raw,
        filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"),
        output=r"output/mark_duplicates_and_sort/\1.deduped.bam")

    # Split'N'Trim and reassign mapping qualities
    split_and_trimmed = main_pipeline.transform(
        name='split_trim',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_trim',
            job_name='split_trim',
            job_type='transform',
            cpus_per_task=2),
        input=deduped,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.formatter(
            "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"),
        output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\
        .follows(fa_idx)

    # we're going to recycle call_variants, merge_variants, filter_variants
    # and analyze_covar, so we'll get the functions in advance
    call_variants = functions.generate_queue_job_function(
        job_script='src/sh/call_variants',
        job_name='call_variants')
    merge_variants = functions.generate_job_function(
        job_script='src/sh/merge_variants',
        job_name='merge_variants',
        job_type='transform',
        cpus_per_task=8)
    filter_variants = functions.generate_job_function(
        job_script='src/sh/filter_variants',
        job_name='filter_variants',
        job_type='transform',
        cpus_per_task=1)
    analyze_covar = functions.generate_queue_job_function(
        job_script='src/sh/analyze_covar',
        job_name='analyze_covar')

    # call variants without recalibration tables
    uncalibrated_variants = main_pipeline.transform(
        name='uncalibrated_variants',
        task_func=call_variants,
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, annot_bed]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    uncalibrated_variants_merged = main_pipeline.merge(
        name='uncalibrated_variants_merged',
        task_func=merge_variants,
        input=[uncalibrated_variants, ref_fa],
        output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz')

    # filter variants on un-corrected bamfiles
    uncalibrated_variants_filtered = main_pipeline.transform(
        name='uncalibrated_variants_filtered',
        task_func=filter_variants,
        input=uncalibrated_variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated.vcf.gz'),
        output='_uncalibrated_filtered.vcf.gz')

    # select variants (only recalibrate using passed SNPs)
    uncalibrated_variants_selected = main_pipeline.transform(
        name='uncalibrated_variants_selected',
        task_func=functions.generate_job_function(
            job_script='src/sh/select_variants',
            job_name='select_variants',
            job_type='transform'),
        input=uncalibrated_variants_filtered,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'),
        output='_uncalibrated_selected.vcf.gz')

    # create recalibration report with filtered variants
    covar_report = main_pipeline.merge(
        name='covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_selected],
        output="output/covar_analysis/recal_data.table")

    # second pass to analyze covariation remaining after recalibration
    second_pass_covar_report = main_pipeline.merge(
        name='second_pass_covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_filtered, covar_report],
        output="output/covar_analysis/post_recal_data.table")

    # plot effect of base recalibration
    recal_plot = main_pipeline.transform(
        name='recal_plot',
        task_func=functions.generate_job_function(
            job_script='src/R/recal_plot.R',
            job_name='recal_plot',
            job_type='transform',
            cpus_per_task=1),
        input=second_pass_covar_report,
        filter=ruffus.suffix('post_recal_data.table'),
        add_inputs=ruffus.add_inputs(covar_report),
        output='recalibration_plots.pdf')

    # recalibrate bases using recalibration report
    recalibrated = main_pipeline.transform(
        name='recalibrate',
        task_func=functions.generate_job_function(
            job_script='src/sh/recalibrate',
            job_name='recalibrate',
            job_type='transform',
            cpus_per_task=2),
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, covar_report]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam')

    # final variant calling
    variants = main_pipeline.transform(
        name='variants',
        task_func=call_variants,
        input=recalibrated,
        add_inputs=ruffus.add_inputs(ref_fa, annot_bed),
        filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'),
        output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    variants_merged = main_pipeline.merge(
        name='variants_merged',
        task_func=merge_variants,
        input=[variants, ref_fa],
        output='output/variants/variants.vcf.gz')

    # variant filtering
    variants_filtered = main_pipeline.transform(
        name='variants_filtered',
        task_func=filter_variants,
        input=variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('.vcf.gz'),
        output='_filtered.vcf.gz')

    # variants by species
    split_variants = main_pipeline.subdivide(
        name='split_variants',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_variants',
            job_name='split_variants',
            job_type='transform',
            cpus_per_task=1,
            ntasks=len(species_short_names)),
        input=variants_filtered,
        filter=ruffus.formatter(),
        add_inputs=ruffus.add_inputs(ref_fa),
        output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz')
                for x in species_short_names])

    # count variants per gene per species
    cds_variants = main_pipeline.transform(
        name='cds_variants',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_variants.R',
            job_name='cds_variants',
            job_type='transform'),
        input=split_variants,
        add_inputs=ruffus.add_inputs([ref_fa, annot]),
        filter=ruffus.formatter(
            'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'),
        output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds')

    # merge counted variants
    variants_per_gene = main_pipeline.merge(
        name='cds_merge',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_merge.R',
            job_name='cds_merge',
            job_type='transform'),
        input=cds_variants,
        output='output/cds_variants/cds_variants.Rds')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="5 accessions variant calling pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
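The formatter() output-path rewriting used throughout the pipeline above is dense in context. A minimal sketch under stated assumptions (hypothetical paths; my reading is that {subdir[0][1]} picks the second directory component of the first input, counted from the innermost, so 'output/mapped' yields 'output'):

import os
import ruffus


def touch_one(output_file):
    # create the parent directory; ruffus does not do this for you
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    open(output_file, 'w').close()


def touch_out(input_file, output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    open(output_file, 'w').close()


demo = ruffus.Pipeline('formatter_demo')
bams = demo.originate(
    name='bams',
    task_func=touch_one,
    output=['output/mapped/sampleA.bam'])
indexed = demo.transform(
    name='indexed',
    task_func=touch_out,
    input=bams,
    filter=ruffus.formatter(r'output/mapped/(?P<LIB>.+).bam'),
    # {subdir[0][1]} -> 'output', {LIB[0]} -> 'sampleA', so the output
    # becomes 'output/indexed/sampleA.bam.bai'
    output='{subdir[0][1]}/indexed/{LIB[0]}.bam.bai')
# demo.run() would then create output/indexed/sampleA.bam.bai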
def main():

    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # test originate job
    test_originate_files = ['ruffus/foo.txt', 'ruffus/bar.txt']
    test_originate = main_pipeline.originate(
        name='test_originate',
        task_func=functions.generate_job_function(
            job_script='src/test_originate',
            job_name='test_originate',
            job_type='originate'),
        output=test_originate_files)

    # test download job
    if not (jgi_logon and jgi_password):
        raise ValueError('Supply jgi_logon and jgi_password')
    test_download = main_pipeline.originate(
        name='test_download',
        task_func=functions.generate_job_function(
            job_script='src/test_download',
            job_name='test_download',
            job_type='download'),
        output='ruffus/download.txt',
        extras=[jgi_logon, jgi_password])

    # test transform with multiple outputs (e.g. bamfile, FASTA etc.)
    test_transform = main_pipeline.transform(
        name="test_transform",
        task_func=functions.generate_job_function(
            job_script='src/test_transform',
            job_name='test_transform',
            job_type='transform'),
        input=test_originate,
        filter=ruffus.suffix(".txt"),
        output=["_transformed.txt", "_transformed.bam"])

    # Transform ONLY the bam files produced by test_transform.
    # The filtering here is subtle: `input` has to be an object, not
    # ruffus.output_from(); `replace_inputs` should use ruffus.inputs()
    # to match the files, but `filter` has to match the first file
    # produced by the previous step, NOT necessarily the file that will
    # be transformed!
    test_selective_transform = main_pipeline.transform(
        name="test_selective_transform",
        task_func=functions.generate_job_function(
            job_script='src/test_selective_transform',
            job_name='test_selective_transform',
            job_type='transform'),
        input=test_transform,
        replace_inputs=ruffus.inputs(r"\1.bam"),
        filter=ruffus.suffix(".txt"),
        output=".bof")

    test_merge = main_pipeline.merge(
        name='test_merge',
        task_func=functions.generate_job_function(
            job_script='src/test_merge',
            job_name='test_merge',
            job_type='merge'),
        input=test_transform,
        output='ruffus/foobar_merge.txt')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="Ruffus proforma pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
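The suffix()/inputs() interaction that the comment above warns about, reduced to an isolated sketch with hypothetical file names: filter matches the first output of the upstream task, while inputs() substitutes the file each job actually receives.

import ruffus


def make_pair(output_files):
    for output_file in output_files:
        open(output_file, 'w').close()


def use_bam(input_file, output_file):
    open(output_file, 'w').close()


p = ruffus.Pipeline('inputs_demo')
# one job producing two files: a .txt and a .bam
pair = p.originate(name='pair',
                   task_func=make_pair,
                   output=[['sample.txt', 'sample.bam']])
# suffix('.txt') matches against the first upstream output (the .txt);
# inputs(r'\1.bam') then makes the .bam the job's real input, and the
# output becomes sample.bof
bam_only = p.transform(name='bam_only',
                       task_func=use_bam,
                       input=pair,
                       replace_inputs=ruffus.inputs(r'\1.bam'),
                       filter=ruffus.suffix('.txt'),
                       output='.bof')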
    traceback.print_exc()


@ruffus.follows(*[kronos_component_docker_TASK_A_function, ])
def __last_task___function():
    pass

#================================================================================
# main body
#--------------------------------------------------------------------------------
try:
    if not args.print_only:
        ruffus.pipeline_run(__last_task___function,
                            multithread=args.num_jobs,
                            verbose=0)
    else:
        cwd = os.getcwd()
        os.chdir(rm.pipeline_dir)
        ruffus.pipeline_printout_graph(
            args.pipeline_name + '.' + args.extension,
            args.extension,
            [__last_task___function],
            draw_vertically=args.draw_vertically,
            no_key_legend=args.no_key_legend,
            user_colour_scheme={
                'colour_scheme_index': 1,
                'Pipeline': {'fontcolor': '"#FF3232"'},
                'Task to run': {'linecolor': '"#0044A0"'},
                'Key': {'fontcolor': 'Red',
                        'fillcolor': '"#F6F4F4"'},
                'Final target': {'fontcolor': 'black',
                                 'fillcolor': '"#EFA03B"',
                                 'dashed': 0}})
        os.chdir(cwd)

    lrc = flushqueue(job_rcs)
    if all(rc == 0 for rc in lrc):
        EXIT_CODE = 0
    else:
        EXIT_CODE = 98

except:
    exc_type, exc_obj, exc_tb = sys.exc_info()
    # the exception object is of type ruffus.ruffus_exceptions.RethrownJobError;
    # exc_obj.args[0][3] gives the message of the original exception.
    if exc_obj.args[0][3] == '(breakpoint)':
        print('breakpoint happened in %s pipeline' % args.pipeline_name)
        ljm.kill_all()
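A sketch of unpacking ruffus's RethrownJobError, which the except block above relies on. The assumption, consistent with the code above, is that each entry of exc.args describes one failed job, with the original exception's message at index 3:

import ruffus
from ruffus.ruffus_exceptions import RethrownJobError


@ruffus.originate(['will_fail.txt'])
def failing_task(output_file):
    # deliberately fail so pipeline_run wraps the error
    raise ValueError('(breakpoint)')


try:
    ruffus.pipeline_run([failing_task], verbose=0)
except RethrownJobError as exc:
    for job_info in exc.args:
        # index 3 carries the original exception's message
        print('failed job message: %s' % job_info[3])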
def main():
    # parse CLI
    parser = ruffus.cmdline.get_argparse(
        description='Vv mtDNA assembly pipeline')
    parser.add_argument('--email', '-e',
                        help='Email address, reported to NCBI',
                        type=str,
                        dest='email')
    options = parser.parse_args()

    # store the email variable for logon
    if options.email:
        os.environ['NCBI_EMAIL'] = options.email

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # TEST FUNCTION
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test',
        verbose=True)

    # download COI seed file
    download_coi_fasta = main_pipeline.originate(
        name='download_coi_fasta.py',
        task_func=tompltools.generate_job_function(
            job_type='originate',
            job_script='src/py/download_coi_fasta.py',
            job_name='download_coi_fasta.py'),
        output='data/GU207861.1.fasta')

    # define files
    sample_list = 'data/samples.txt'
    with open(sample_list, 'r') as f:
        csvreader = csv.reader(f)
        next(csvreader)
        file_list = {x[0]: [x[1], x[2]] for x in csvreader}

    pe_filenames = file_list['pe']
    mp_filenames = file_list['mp']

    pe_files = find_all(pe_filenames, 'data')
    # filter out hidden directories
    pe_files_filtered = [x for x in pe_files if '/.' not in x]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(
        name='raw_fq_files',
        task_func=os.path.isfile,
        output=pe_files_filtered)

    # trim adaptors
    trim_bbduk = main_pipeline.merge(
        name='trim_bbduk',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/trim_bbduk',
            job_name='trim_bbduk',
            cpus_per_task=8),
        input=raw_fq_files,
        output='output/trim_bbduk/pe_trimmed.fastq.gz')

    # subsample into numbered repeats
    number_of_repeats = 5
    subsample_reads = main_pipeline.subdivide(
        name='subsample_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/subsample_reads',
            job_name='subsample_reads',
            ntasks=number_of_repeats),
        input=trim_bbduk,
        filter=ruffus.formatter(),
        output=(['output/subsample_reads/pe_trimmed_subsampled_' + str(i) +
                 '.fastq.gz'
                 for i in range(1, number_of_repeats + 1)]))

    # run mitobim
    mitobim_quick = main_pipeline.transform(
        name='run_mitobim',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/run_mitobim',
            job_name='run_mitobim'),
        input=subsample_reads,
        add_inputs=ruffus.add_inputs(download_coi_fasta),
        filter=ruffus.formatter(
            r'output/subsample_reads/pe_trimmed_subsampled_'
            r'(?P<RN>\d).fastq.gz'),
        output='output/mitobim_quick_{RN[0]}/mitobim.log.txt')

    # re-fish with the longest assembly
    find_longest_assembly = main_pipeline.originate(
        name='find_longest_assembly',
        task_func=tompltools.generate_job_function(
            job_type='originate',
            job_script='src/py/find_longest_assembly.py',
            job_name='find_longest_assembly'),
        output='output/longest_quick_scaffold.fasta')\
        .follows(mitobim_quick)

    mitobim_full = main_pipeline.transform(
        name='mitobim_full',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/run_mitobim',
            job_name='run_mitobim'),
        input=trim_bbduk,
        add_inputs=ruffus.add_inputs(find_longest_assembly),
        filter=ruffus.formatter(),
        output='output/mitobim_full/mitobim.log.txt')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="Vv mtDNA assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=32)
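The subdivide() pattern used by subsample_reads above, reduced to a minimal sketch with hypothetical names: one input job fans out to a fixed list of outputs, and the task function receives the whole output list.

import ruffus


def touch(output_file):
    open(output_file, 'w').close()


def split_reads(input_file, output_files):
    # one job, several outputs
    for output_file in output_files:
        open(output_file, 'w').close()


p = ruffus.Pipeline('subdivide_demo')
merged = p.originate(name='merged',
                     task_func=touch,
                     output=['reads.fastq'])
chunks = p.subdivide(
    name='chunks',
    task_func=split_reads,
    input=merged,
    filter=ruffus.formatter(),
    output=['chunk_{}.fastq'.format(i) for i in range(1, 4)])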
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using the -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but exits with an
    # error when killing the process; thus, ignore the return value
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped; it is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
#                 tumour_pileup_gz=inputs[1],
#                 output_prefix=output_prefix))
#     execute(cmd1, flag=flag1)
#     cmd2 = 'bgzip -f {0} && tabix -p vcf {1}'.format(
#         snp_vcf.replace('.gz', ''), snp_vcf)
#     execute(cmd2, flag=flag2)
#     cmd3 = 'bgzip -f {0} && tabix -p vcf {1}'.format(
#         indel_vcf.replace('.gz', ''), indel_vcf)
#     execute(cmd3, flag=flag3)


if __name__ == "__main__":
    for task_name in R.pipeline_get_task_names():
        print(task_name)
    print(os.environ['PWD'])
    print('=' * 79)

    # parser = R.cmdline.get_argparse(
    #     description="krvarscan",
    #     usage='require python-2.7.x',
    #     version='0.1')
    # options = parser.parse_args()

    logger, logger_mutex = R.cmdline.setup_logging(
        __name__, OPTIONS.log_file, OPTIONS.verbose)
    logger.info(OPTIONS.verbose)

    # with logger_mutex:
    #     logger.info("Look Ma. No hands")

    R.pipeline_printout_graph('lele.svg', draw_vertically=True)
    R.cmdline.run(OPTIONS)
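For reference, the logger/mutex pair returned by ruffus.cmdline.setup_logging (used above, and hinted at in the commented-out lines) is designed to be shared across multiprocess jobs. A minimal sketch with hypothetical names:

import ruffus.cmdline

logger, logger_mutex = ruffus.cmdline.setup_logging(
    'demo_pipeline', 'demo_pipeline.log', 1)


def report_done(job_name):
    # hold the mutex so concurrent jobs don't interleave log lines
    with logger_mutex:
        logger.info('%s finished' % job_name)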
            # Running the task
            job.execute(curr_options, out_dir=out_dir)

            # Setting the attribute for the new function so that it can
            # be pickled
            setattr(__main__, func_name, curr_step)

            # Updating in_job and the last suffix only if the tool
            # produces usable data
            if job.produce_usable_data():
                in_job = curr_step
                last_suffix += ".{}".format(job.get_suffix())

            # Adding the current job to the pipeline
            job_order.append(curr_step)

        # Printing the pipeline
        print("Running the pipeline...")
        pipeline_printout_graph("flowchart.{}".format(args.flowchart_format),
                                args.flowchart_format, job_order)
        pipeline_run(job_order, verbose=0, multiprocess=args.nb_process,
                     checksum_level=1)

    except KeyboardInterrupt:
        print("Cancelled by user", file=sys.stderr)
        sys.exit(0)

    except ProgramError as e:
        parser.error(e.message)
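pipeline_run above passes checksum_level=1. A short sketch of the levels as I read the ruffus documentation (0 = file timestamps only; 1 = timestamps plus the job-history database, the default; 2 = also re-run when a task function's body changes; 3 = also when its parameters change):

from ruffus import originate, pipeline_run


@originate(['checksum_demo.txt'])
def make_demo(output_file):
    open(output_file, 'w').close()


# re-run the task if its code changes, not just if files are out of date
pipeline_run([make_demo], checksum_level=2)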
def main():
    args = get_cmdline_args()
    # We want to look for modules in the directory local to the pipeline,
    # just as if the pipeline script had been called directly. This
    # includes the script itself and the config files imported by
    # getOptions.
    sys.path.insert(0, os.path.dirname(args.pipeline))
    # options must be set before the pipeline is imported
    options = getOptions(args)
    setOptions(options)
    # import the pipeline so its stages are defined;
    # the name of the pipeline is given on the command line
    __import__(drop_py_suffix(args.pipeline))
    logDir = options.pipeline['logDir']
    startLogger()
    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']
    if pipelineOptions['rebuild'] == 'fromstart':
        rebuildMode = True
    elif pipelineOptions['rebuild'] == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True
    if style in ['run', 'touchfiles']:
        touchfiles_flag = (style == 'touchfiles')
        # Perform the pipeline steps (run the pipeline).
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run concurrently.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Force the pipeline to start from here, regardless of
            # whether the stage is up to date or not.
            forcedtorun_tasks=forcedTasks,
            # If the style was touchfiles, set a flag to bring files up
            # to date without running anything.
            touch_files_only=touchfiles_flag,
            # Choose the mode in which ruffus decides how much work
            # needs to be done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph('flowchart.svg', 'svg', endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Print a textual description of what the pipeline would do,
        # but don't actually run it.
        pipeline_printout(
            sys.stdout, endTasks, verbose=5, wrap_width=100000,
            forcedtorun_tasks=forcedTasks,
            gnu_make_maximal_rebuild_mode=rebuildMode)
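The 'fromstart'/'fromend' rebuild option above maps onto ruffus's gnu_make_maximal_rebuild_mode flag. A minimal sketch with hypothetical stages: True, the default, re-runs any out-of-date upstream stage; False only brings the listed target tasks up to date, trusting upstream outputs as given.

from ruffus import originate, transform, suffix, pipeline_run


@originate(['raw.txt'])
def stage_one(output_file):
    open(output_file, 'w').close()


@transform(stage_one, suffix('.txt'), '.final')
def stage_two(input_file, output_file):
    open(output_file, 'w').close()


# only make stage_two up to date; do not rebuild stage_one
pipeline_run([stage_two], gnu_make_maximal_rebuild_mode=False)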