def ruffus_main(options, args):
    'Main entry point for ruffus pipelines'
    if options.just_print:
        pipeline_printout(
            sys.stdout,
            options.target_tasks,
            options.forced_tasks,
            verbose=options.verbose)
    elif options.flowchart:
        pipeline_printout_graph(
            open(options.flowchart, "w"),
            # use flowchart file name extension to decide flowchart format
            #   e.g. svg, jpg etc.
            os.path.splitext(options.flowchart)[1][1:],
            options.target_tasks,
            options.forced_tasks,
            no_key_legend=not options.key_legend_in_graph)
    else:
        pipeline_run(
            options.target_tasks,
            options.forced_tasks,
            multiprocess=options.jobs,
            logger=main_logger,
            verbose=options.verbose,
            touch_files_only=options.touch_only)
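
For context, ruffus_main() only consumes an options namespace, so any argument parser that supplies the attributes it reads will do. Below is a minimal sketch of such a setup, assuming ruffus_main() above and its ruffus imports are in scope; every flag name and the main_logger object are assumptions inferred from those attributes, not the original project's CLI.

import logging
from optparse import OptionParser

# stand-in for the module-level logger that ruffus_main() expects
main_logger = logging.getLogger("ruffus_pipeline")

def make_option_parser():
    parser = OptionParser()
    parser.add_option("--just-print", dest="just_print",
                      action="store_true", default=False,
                      help="print the pipeline tasks without running them")
    parser.add_option("--flowchart", dest="flowchart", metavar="FILE",
                      default=None,
                      help="write a flowchart to FILE (format from extension)")
    parser.add_option("--key-legend-in-graph", dest="key_legend_in_graph",
                      action="store_true", default=False,
                      help="include the key legend in the flowchart")
    parser.add_option("--target-task", dest="target_tasks",
                      action="append", default=[],
                      help="task(s) to bring up to date")
    parser.add_option("--forced-task", dest="forced_tasks",
                      action="append", default=[],
                      help="task(s) to force-rerun")
    parser.add_option("--jobs", dest="jobs", type="int", default=1,
                      help="number of parallel jobs")
    parser.add_option("--verbose", dest="verbose", type="int", default=1)
    parser.add_option("--touch-only", dest="touch_only",
                      action="store_true", default=False)
    return parser

if __name__ == "__main__":
    options, args = make_option_parser().parse_args()
    ruffus_main(options, args)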
Example #2
    def test_graphviz_dot(self):
        """Make sure annotations from graphviz appear in dot
        """

        if sys.hexversion >= 0x03000000:
            # the dot graph is written as bytes under python3, so capture it with BytesIO
            s = BytesIO()
        else:
            s = StringIO()

        pipeline_printout_graph(
            s,
            # output format is given explicitly here rather than derived
            # from a file name extension
            "dot",
            [Final_target, Up_to_date_final_target], pipeline="main")
        self.assertTrue('[URL="http://cnn.com", color="#FF0000", fillcolor="#FFCCCC", fontcolor="#4B6000", height=1.5, label=<What is this?<BR/> What <FONT COLOR="red">is</FONT>this???>, pencolor="#FF0000", peripheries=5, shape=component, style=dashed]' in s.getvalue().decode())
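
The node attributes asserted on above (URL, fillcolor, label, shape and so on) come from decorating a task with ruffus's @graphviz decorator, which passes its keyword arguments through as graphviz attributes. A sketch of such a decoration follows; the task body, file name and decorator ordering are illustrative assumptions rather than the original test code.

from ruffus import graphviz, originate

@graphviz(URL='"http://cnn.com"',
          fillcolor='"#FFCCCC"',
          color='"#FF0000"',
          pencolor='"#FF0000"',
          fontcolor='"#4B6000"',
          label_prefix="What is this?<BR/> ",
          label='<What <FONT COLOR="red">is</FONT>this>',
          label_suffix="???",
          shape="component",
          height=1.5,
          peripheries=5,
          style="dashed")
@originate("final_target.txt")
def Final_target(output_file):
    # placeholder body; only the graph annotations matter for the test
    open(output_file, "w").close()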
Example #4
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser,
                              add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                elif not options.without_cluster and not HAS_DRMAA:
                    E.critical("DRMAA API not found so cannot talk to a cluster.")
                    E.critical("Please use --local to run the pipeline"
                               " on this host: {}".format(os.uname()[1]))
                    sys.exit(-1)

                #
                #   make sure we are not logging at the same time in
                #   different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError(
                    "pipeline failed with %i errors" % len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
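
MultiLineFormatter, used for the extra file logger in the branch above, is defined elsewhere in the CGAT code base. A minimal sketch of the idea, an assumption rather than the original class: pad continuation lines so that multi-line messages stay aligned under their log header.

import logging

class MultiLineFormatter(logging.Formatter):
    """Indent continuation lines of multi-line log messages."""

    def format(self, record):
        text = logging.Formatter.format(self, record)
        if record.message and "\n" in text:
            # everything before the message, i.e. timestamp/level/location
            header = text.split(record.message)[0]
            text = text.replace("\n", "\n" + " " * len(header))
        return text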
Example #5
def main():
    # prepare the ruffus pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # read the JGI logon and password from the command line
    parser = ruffus.cmdline.get_argparse(description='UV-B analysis pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    # build dictionaries mapping each species to its genome fasta URL and to
    # its annotation (gff) URL; these are supplied in a tab-separated text file
    fasta_urls = {}
    annotation_urls = {}
    with open('data/genomeUrls.txt') as tsv:
        genome_urls = csv.reader(tsv, delimiter='\t')
        next(genome_urls, None)
        for row in genome_urls:
            fasta_urls[row[0]] = row[1]
            annotation_urls[row[0]] = row[2]

    # iterate over fasta_urls keys to run jobs
    for species in fasta_urls.keys():
        # call download script
        main_pipeline.originate(
            name=species + "_genome",
            task_func=download_genome,
            output="data/genome/" + species + "/METADATA.csv",
            extras=[species, fasta_urls[species], annotation_urls[species],
                    jgi_logon, jgi_password])
        # generate a star genome for each species
        main_pipeline.transform(
            name=species + "_index",
            task_func=generate_index,
            input=ruffus.output_from(species + "_genome"),
            filter=ruffus.regex(r"data/genome/(.*)/METADATA.csv"),
            output=r"output/\1/star-index/METADATA.csv",
            extras=[r"\1"])
        # define the reads
        main_pipeline.originate(name=species + "_reads",
                                task_func=define_reads,
                                output="ruffus/" + species + "_reads",
                                extras=[species])
        # first mapping step
        main_pipeline.collate(
            name=species + "_mapped_reads",
            task_func=star,
            input=[[ruffus.output_from(species + "_reads"),
                    ruffus.output_from(species + "_index")]],
            filter=ruffus.formatter(),
            output=["output/{subdir[1][1]}/star/METADATA.csv"],
            extras=["{subdir[1][1]}"])
    # FOR LOOP ENDS

    # parse the mapping stats
    mapping_stats = main_pipeline.merge(
        task_func=parse_star_stats_R,
        input=ruffus.output_from(
            list(species + "_mapped_reads" for species in fasta_urls.keys())),
        output="output/mapping_stats/SessionInfo.txt")

    # generate plots for mapping
    mapping_plots = main_pipeline.transform(
        task_func=plot_reads_in_genes_R,
        input=mapping_stats,
        filter=ruffus.formatter(),
        output="{subpath[0][0]}/Figure S1.pdf")

    # use a generator in the input field to collate the previous results
    deseq_results = main_pipeline.transform(
                task_func=deseq2_R,
                input=ruffus.output_from(
                        list(species + "_mapped_reads"
                             for species in fasta_urls.keys())),
                filter=ruffus.formatter(),
                output=[r"output/{subdir[0][1]}/deseq2/SessionInfo.txt"],
                extras=[r"{subdir[0][1]}"])

    # combine the deseq results
    de_lists = main_pipeline.merge(
        task_func=list_de_genes_R,
        input=deseq_results,
        output="output/merged/deseq2/SessionInfo.de_genes.txt")

    # run clustering
    mfuzz_results = main_pipeline.transform(
        task_func=mfuzz_R,
        input=deseq_results,
        filter=ruffus.formatter(),
        output='output/{subdir[0][1]}/mfuzz/SessionInfo.mfuzz.txt',
        extras=['{subdir[0][1]}'])

    # combine mfuzz_results
    mfuzz_plot = main_pipeline.merge(
        task_func=combine_mfuzz_results_R,
        input=mfuzz_results,
        output='output/merged/mfuzz/SessionInfo.mfuzz.txt')

    # compare flavonoid synthesis genes
    flavonoid_genes = main_pipeline.transform(
        task_func=compare_saito_genes_R,
        input=de_lists,
        filter=ruffus.formatter(),
        output='{path[0]}/SessionInfo.flavonoid_synthesis.txt')

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)

    # print the flowchart
    ruffus.pipeline_printout_graph("ruffus/flowchart.pdf", "pdf",
                                   pipeline_name="UV-B analysis pipeline")
Example #6
def run_workflow(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    pipeline: object
        pipeline to run. If not given, all ruffus pipelines are run.

    """
    logger = logging.getLogger("cgatcore.pipeline")

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = ruffus.pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted, but its contents should be
    # cleaned up.
    if not os.path.exists(get_params()["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                get_params()["tmpdir"]))
        try:
            os.makedirs(get_params()["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(get_params()["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(get_params(), sys.argv[0])

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    if (not options.without_cluster and not HAS_DRMAA
                            and not get_params()['testing']):
                        E.critical(
                            "DRMAA API not found so cannot talk to a cluster.")
                        E.critical("Please use --local to run the pipeline"
                                   " on this host: {}".format(os.uname()[1]))
                        sys.exit(-1)

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for pipeline
                        # controller.
                        opts = {"multithread": options.multiprocess}
                    else:
                        # use cooperative multitasking instead of multiprocessing.
                        opts = {
                            "multiprocess": options.multiprocess,
                            "pool_manager": "gevent"
                        }
                        # create the session proxy
                        start_session()

                    logger.info("current directory is {}".format(os.getcwd()))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                        **opts)

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(get_params())) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        p = get_params()
        for k in sorted(get_params()):
            print(k, "=", p[k])
        print_config_files()

    elif options.pipeline_action == "config":
        # Level needs to be 2:
        # 0th level -> cgatflow.py
        # 1st level -> Control.py
        # 2nd level -> pipeline_xyz.py
        f = sys._getframe(2)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
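
LoggingFilterProgress, attached to the logger in the "make" branch above, is not shown in this snippet. A minimal sketch of the idea, with approximated message patterns (an assumption, not the cgatcore implementation): count completed tasks reported by ruffus against the task list captured from pipeline_printout() and annotate the log records.

import logging
import re

class LoggingFilterProgress(logging.Filter):
    """Append a '[n/total tasks done]' note to ruffus completion messages."""

    def __init__(self, ruffus_text):
        logging.Filter.__init__(self)
        # pipeline_printout() lists each pending task on a "Task = ..." line;
        # the exact wording is an approximation here.
        self.ntasks = len(re.findall(r"Task = ", ruffus_text)) or 1
        self.ncompleted = 0

    def filter(self, record):
        message = record.getMessage()
        if "Completed Task = " in message:
            self.ncompleted += 1
            record.msg = "{} [{}/{} tasks done]".format(
                message, self.ncompleted, self.ntasks)
            record.args = ()
        return True   # never drop a record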
    def test_ruffus(self):

        print("     Run pipeline normally...")
        if self.graph_viz_present:
            pipeline_printout_graph(tempdir + "flowchart.dot", pipeline="main")
            pipeline_printout_graph(tempdir + "flowchart.jpg",
                                    target_tasks=[subdivide_start],
                                    forcedtorun_tasks=[split_start],
                                    no_key_legend=True)
            pipeline_printout_graph(tempdir + "flowchart.svg",
                                    no_key_legend=False,
                                    pipeline="main")
            # Unknown format
            try:
                pipeline_printout_graph(tempdir + "flowchart.unknown",
                                        no_key_legend=False,
                                        pipeline="main")
                raise Exception(
                    "Failed to throw exception for pipeline_printout_graph unknown extension "
                )
            except CalledProcessError as err:
                pass
            pipeline_printout_graph(tempdir + "flowchart.unknown",
                                    "svg",
                                    no_key_legend=False,
                                    pipeline="main")

        else:
            pipeline_printout_graph(tempdir + "flowchart.dot",
                                    target_tasks=[subdivide_start],
                                    forcedtorun_tasks=[split_start],
                                    no_key_legend=True,
                                    pipeline="main")
Example #8
def main():

    #########
    # SETUP #
    #########

    # test function for checking input/output passed to job_script and parsing
    # by src/sh/io_parser
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test',
        verbose=True)

    # parse email etc. here?
    parser = ruffus.cmdline.get_argparse(
        description='ASW genome assembly pipeline.')
    parser.add_argument('--blast-db-folder',
                        help='Path to BLAST db folder',
                        type=str,
                        dest='blast_db_folder')

    # parser.add_argument('--email', '-e',
    #                     help='Logon email address for JGI',
    #                     type=str,
    #                     dest='jgi_logon')
    # parser.add_argument('--password', '-p',
    #                     help='JGI password',
    #                     type=str,
    #                     dest='jgi_password')
    options = parser.parse_args()
    # jgi_logon = options.jgi_logon
    # jgi_password = options.jgi_password
    if options.blast_db_folder:
        os.environ['BLASTDB'] = options.blast_db_folder

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # find fastq.gz files
    dir_listing = [x[0] for x in os.walk(top='data', followlinks=True)]
    fastq_file_list = []
    for directory in dir_listing:
        file_list = os.scandir(directory)
        fastq_file_list.append([x.path for x in file_list
                               if (x.name.endswith('fastq.gz')
                                   or x.name.endswith('.fastq'))
                               and x.is_file()])

    fastq_files = list(tompytools.flatten_list(fastq_file_list))

    # extract only MH gDNA fastq data, i.e.
    # 2125-06-11-1 = MH PE
    # 2125-06-06-1 = MH MP
    active_fq_files = [x for x in fastq_files
                       if ('2125-06-11-1' in x
                           or '2125-06-06-1' in x)]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(
        name='raw_fq_files',
        task_func=os.path.isfile,
        output=active_fq_files)

    # merge libraries
    merged_fq_files = main_pipeline.collate(
        name='merged_fq_files',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/merge_fq',
            job_name='merge_fq'),
        input=raw_fq_files,
        filter=ruffus.formatter(
            r'data/NZGL02125/.*/'
            r'[^-]+-(?P<LIB>[^_]+).+_R(?P<RN>\d)_.*.fastq.gz'),
        output=[r'output/fq_merged/{LIB[0]}_R{RN[0]}_merged.fastq.gz'])


    # make pairs and send to cutadapt for trimming external adaptors
    trim_cutadapt = main_pipeline.collate(
        name='trim_cutadapt',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/cutadapt_pe',
            job_name='cutadapt'),
        input=merged_fq_files,
        filter=ruffus.formatter(
            r'.+/(?P<LIB>[^_]+)_R(?P<RN>\d)_merged.fastq.gz'),
        output=[['output/cutadapt/{LIB[0]}_R1_trimmed.fastq.gz',
                'output/cutadapt/{LIB[0]}_R2_trimmed.fastq.gz']])

    # send trimmed reads to splitnextera
    mp_splitnextera = main_pipeline.subdivide(
        name='splitnextera',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/splitnextera',
            job_name='splitnextera'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-06-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=['output/splitnextera/2125-06-06-1.pe.fastq.gz',
                'output/splitnextera/2125-06-06-1.se.fastq.gz',
                'output/splitnextera/2125-06-06-1.mp.fastq.gz',
                'output/splitnextera/2125-06-06-1.unknown.fastq.gz'])

    # decontaminate PhiX (other?) sequences
    decon_mp = main_pipeline.transform(
        name='decon_mp',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_mp'),
        input=mp_splitnextera,
        filter=ruffus.formatter(
            r'.+/2125-06-06-1\.(?P<VL>[^.]+)\.fastq.gz'),
        output=['output/decon/2125-06-06-1_{VL[0]}.fastq.gz'])

    decon_pe = main_pipeline.transform(
        name='decon_pe',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/decon',
            job_name='decon_pe'),
        input=trim_cutadapt,
        filter=ruffus.regex(
            r'.+?/2125-06-11-1_R(?P<RN>\d)_trimmed.fastq.gz'),
        output=[r'output/decon/2125-06-11-1.fastq.gz'])

    decon = [decon_mp, decon_pe]

    # digital normalisation and error correction w/ bbnorm
    bbnorm = main_pipeline.subdivide(
        name='bbnorm',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/bbnorm',
            job_name='bbnorm',
            mem_per_cpu=7000,
            cpus_per_task=8),
        input=decon,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/bbnorm/{LN[0]}{VL[0]}.fastq.gz'])

    # download NCBI databases for taxonomy data
    download_taxonomy_databases = main_pipeline.originate(
        name='download_taxonomy_databases',
        task_func=tompltools.generate_job_function(
            job_script='src/r/download_taxonomy_databases.R',
            job_name='download_taxonomy_databases',
            job_type='originate'),
        output=[['data/ncbi/nucl_gb.accession2taxid.Rds',
                'data/ncbi/nodes.dmp.Rds',
                'data/ncbi/names.dmp.Rds']])

    # subsample reads, blast with biopython and parse results
    fq_subsample = main_pipeline.subdivide(
        name='fq_subsample',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fq_subsample',
            job_name='fq_subsample'),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/blastqc/{LN[0]}{VL[0]}_R1.fastq.gz',
                r'output/blastqc/{LN[0]}{VL[0]}_R2.fastq.gz'])
    blast_reads = main_pipeline.transform(
        name='blast_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/py/blast_reads.py',
            job_name='blast_reads',
            cpus_per_task=4),
        input=fq_subsample,
        filter=ruffus.suffix('.fastq.gz'),
        output=['.xml'])
    parse_blast_results = main_pipeline.transform(
        name='parse_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/py/parse_blast_results.py',
            job_name='parse_blast_results'),
        input=blast_reads,
        filter=ruffus.suffix('.xml'),
        output=['.table'])
    main_pipeline.merge(
        name='plot_blast_results',
        task_func=tompltools.generate_job_function(
            job_script='src/r/extract_blast_hits_per_taxid.R',
            job_name='plot_blast_results'),
        input=[parse_blast_results, download_taxonomy_databases],
        output=['output/blastqc/plots.pdf'])
    
    # trim reads to 100 bp for edena?
    clip_to_100b = main_pipeline.subdivide(
        name='clip_to_100b',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/clip_to_100b',
            job_name='clip_to_100b'),
        input=bbnorm,
#        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        filter=ruffus.regex(r'.+/2125-06-11-1.fastq.gz'),
        output=[r'output/trunc_100/2125-06-11-1_R1.fastq.gz',
                r'output/trunc_100/2125-06-11-1_R2.fastq.gz'])

    # print raw and normalised kmer distribution plots
    main_pipeline.merge(
        name='kmer_distribution_plots',
        task_func=tompltools.generate_job_function(
            job_script='src/r/kmer_distribution_plots.R',
            job_name='kmer_distribution_plots'),
        input=bbnorm,
        output=['output/bbnorm/plots.pdf', 'output/bbnorm/plot_data.Rds'])

    # run fastqc on decontaminated and normalised libraries
    main_pipeline.transform(
        name='fastqc',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/fastqc',
            job_name='fastqc',
            cpus_per_task=1),
        input=bbnorm,
        filter=ruffus.formatter(r'.+/(?P<LN>[^(_|.)]+)(?P<VL>_?\w*).fastq.gz'),
        output=[r'output/fastqc/{LN[0]}{VL[0]}_fastqc.html'])

    # overlap step with edena
    # edena_overlaps = main_pipeline.collate(
    #     name='edena_overlaps',
    #     task_func=tompltools.generate_job_function(
    #         job_script='src/sh/edena_overlaps',
    #         job_name='edena_overlaps'),
    #     input=clip_to_100b,
    #     filter=ruffus.formatter(r'.+/(?P<LN>[^_]+)_R\d.fastq.gz'),
    #     output=[r'output/edena/{LN[0]}.ovc'])

    # prepare files with velveth
    # set threads for velvet to 1 !!!
    min_kmer = 71
    max_kmer = 87
    step = 8
    kmer_lengths = [x for x in range(min_kmer, max_kmer + 1, step)]
    velveth_output = list(
        tompytools.flatten_list(
            [('output/velveth_' + str(x) + '/Sequences')
             for x in kmer_lengths]))
    # velveth = main_pipeline.merge(
    #     name='hash_files',
    #     task_func=test_job_function,
    #     # task_func=tompltools.generate_job_function(
    #     #     job_script='src/sh/hash_files',
    #     #     job_name='hash_files'),
    #     input=bbnorm,
    #     output=velveth_output)

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="ASW genome assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
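
Every task above is built with tompltools.generate_job_function, which is external to this snippet. A rough sketch of the kind of wrapper it appears to return, purely an assumption based on how it is called here (the real helper presumably submits the script to a scheduler with the requested cpus and memory):

import subprocess

def _flatten(items):
    """Yield leaf strings from arbitrarily nested ruffus file lists."""
    for item in items:
        if isinstance(item, (list, tuple)):
            for sub in _flatten(item):
                yield sub
        elif item is not None:
            yield str(item)

def generate_job_function(job_script, job_name, verbose=False,
                          job_type="transform", cpus_per_task=1,
                          mem_per_cpu=4000):
    """Return a ruffus task function that shells out to job_script."""
    def job_function(*args):
        # ruffus passes (input, output, *extras) to transform/collate/subdivide
        # tasks and (output, *extras) to originate tasks; this sketch simply
        # forwards everything as positional arguments.
        cmd = [job_script] + list(_flatten(args))
        if verbose:
            print("[{}] {}".format(job_name, " ".join(cmd)))
        subprocess.check_call(cmd)
    job_function.__name__ = job_name
    return job_function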
Example #9
def run_pipeline():
    #==============================================================================================
    # Config command line parser
    #==============================================================================================
    parser = OptionParser('nextgen_pipeline [option]... pipeline...')
    parser.add_option('-s', '--stage', dest='stage', help='stage of pipeline to run')
    parser.add_option('-t', '--threads', action='store', type='int', dest='threads',
                      default=None, help='number of threads to use')
    parser.add_option('-j', '--jobs', action='store', type='int', dest='jobs',
                      default=None, help='number of jobs to use')
    parser.add_option('-f', '--force', action='store_true', dest='force_run',
                      default=False, help='Force pipeline to run stage')
    parser.add_option('-c', '--config', dest='config', default='pipeline.cfg',
                      help='config file, defaults to pipeline.cfg')
    parser.add_option('-l', '--log', dest='log', default=None, help='path to log file')
    parser.add_option('--graph', dest='print_graph', default=False, action='store_true',
                      help='Print a graph of the pipeline rather than running it')

    (options, args) = parser.parse_args()
    if len(args) < 1:
        args = pipelines

    # Configuration parsing
    if not os.path.isfile(options.config):
        parser.error('Could not find config file: %s' % options.config)
    config = SafeConfigParser()
    config.read(options.config)

    for section in config.sections():
        for option in config.options(section):
            CMD_DICT[option] = config.get(section, option)

    # get number of cpus on machine
    ncpus = multiprocessing.cpu_count()

    # load the pipeline(s) request by the user
    pipeline_stages = {}
    for arg in args:
        try:
            pipeline = __import__(PIPELINE_PATH + arg, globals(), locals(), ['*'])
        except (ImportError, TypeError):
            # either no pipeline was requested or a missing/non-existent
            # pipeline was chosen
            print(parser.usage + '\n')
            show_pipeline_help(pipelines)
        pipeline_stages.update({arg: pipeline.stages_dict})

        # did the user specify a stage
        if options.stage:
            if options.stage not in pipeline_stages[arg].keys():
                # missing or non-existent stage chosen
                show_pipeline_stage_help()
            else:
                start_stage = pipeline_stages[arg][options.stage]
        else:
            # user did not specify a stage, use default
            start_stage = pipeline_stages[arg]['default']

        # user specified job count, capped at the number of cpus
        if options.jobs:
            NUM_JOBS = options.jobs if options.jobs <= ncpus else ncpus
        else:
            NUM_JOBS = config.getint('Threading', 'jobs') \
                    if config.has_option('Threading', 'jobs') else ncpus / 2

        # user specified log file
        if options.log:
            (head, _) = os.path.split(options.log)
            if os.path.exists(head):
                log_fn = options.log
            else:
                print "Unable to write to that log file."
                sys.exit(1)
        else:
            cwd = os.getcwd()
            now = datetime.datetime.now()
            ts = now.strftime('%Y-%m-%d_%H%M%S')
            log_fn = '%s/%s.%s.log' % (cwd, os.path.split(sys.argv[0])[1], ts)

        logger = quick_start_log(log_fn=log_fn)

        logger.debug('pipeline_run: %d jobs' % NUM_JOBS)

        # user said to force running of stage
        if options.print_graph:
            pipeline_printout_graph('pipeline.jpg', 'jpg', [start_stage],
                    forcedtorun_tasks=([start_stage] if options.force_run else []))
        else:
            pipeline_run([start_stage], forcedtorun_tasks=([start_stage] if options.force_run else
                []), multiprocess=NUM_JOBS, logger=logger)
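
The configuration loop above flattens every option of every section of pipeline.cfg into CMD_DICT, keyed by option name alone. A hypothetical config and a quick check of how it is read (all section and option names below are invented for illustration):

# pipeline.cfg (hypothetical):
#
#   [Threading]
#   jobs = 4
#
#   [Programs]
#   samtools = /usr/local/bin/samtools
#
from ConfigParser import SafeConfigParser   # Python 2, matching the snippet

config = SafeConfigParser()
config.read('pipeline.cfg')
CMD_DICT = dict((option, config.get(section, option))
                for section in config.sections()
                for option in config.options(section))
# CMD_DICT == {'jobs': '4', 'samtools': '/usr/local/bin/samtools'}; note that
# values are strings and options from different sections share one namespace.
print(CMD_DICT)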
Example #10
def main(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    options: object
        Container for command line arguments.
    args : list
        List of command line arguments.
    pipeline: object
        Pipeline to run. If not given, all ruffus pipelines are run.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    logger = logging.getLogger("daisy.pipeline")

    logger.info("started in workingdir: {}".format(PARAMS.get("workingdir")))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(PARAMS, options)

    version = get_version()

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted, but its contents should be
    # cleaned up.
    if not os.path.exists(PARAMS["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                PARAMS["tmpdir"]))
        try:
            os.makedirs(PARAMS["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(PARAMS["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            logger.info("\t".join(map(str, requirement)))
        logger.info("version check summary: %s" % str(counter))
        E.stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for pipeline
                        # controller.
                        ruffus.task.Pool = ThreadPool
                    else:
                        # use cooperative multitasking instead of multiprocessing.
                        ruffus.task.Pool = EventPool
                        ruffus.task.queue = gevent.queue

                        # create the session proxy
                        start_session()

                    logger.info("code location: {}".format(
                        PARAMS["scriptsdir"]))
                    logger.info("code version: {}".format(version))
                    logger.info("working directory is: {}".format(
                        PARAMS["workingdir"]))
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        multiprocess=options.multiprocess,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                    )

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(PARAMS)) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        print_config_files()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
Exemple #11
0
    if len(sys.argv) < 2:
        print('Usage: python graphslam_pipeline.py print,graph,run (task1,task2)')
        sys.exit(1)

    TORUN = [
    ]

    if len(sys.argv) == 3:
        TORUN = sys.argv[2].split(',')
    CMDS = sys.argv[1].split(',')

    tasks = {
        'print': lambda: pipeline_printout(sys.stdout, TORUN,
                                           forcedtorun_tasks=[], verbose=5),
        'graph': lambda: pipeline_printout_graph('graph.jpg', 'jpg', TORUN,
                                                 forcedtorun_tasks=[],
                                                 no_key_legend=False),
        'run': lambda: pipeline_run(TORUN,
                                    multiprocess=NUM_CPUS,
                                    one_second_per_job=False),
        'force': lambda: pipeline_run([],
                                      forcedtorun_tasks=TORUN,
                                      multiprocess=NUM_CPUS,
                                      one_second_per_job=False),
        'printf': lambda: pipeline_printout(sys.stdout,
                                            [],
                                            forcedtorun_tasks=TORUN,
                                            verbose=2),
        'clean': clean,
        'clean_pipelines': clean_pipelines
    }
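
    # Hedged sketch (the original snippet is truncated here): one way the
    # requested CMDS could be dispatched against the tasks table above.
    for cmd in CMDS:
        if cmd not in tasks:
            print('Unknown command: %s' % cmd)
            sys.exit(1)
        tasks[cmd]()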
Exemple #12
0
def main():

    args = get_cmdline_args()

    # We want to look for modules in the directory local to the pipeline,
    # just as if the pipeline script had been called directly.
    # This includes the script itself and the config files imported by getOptions
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # options must be set before pipeline is imported
    options = getOptions(args)
    setOptions(options)

    # import the pipeline so its stages are defined
    # the name of the pipeline is given on the command line
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()
    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']
    if pipelineOptions['rebuild'] == 'fromstart':
        rebuildMode = True
    elif pipelineOptions['rebuild'] == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True
    if style in ['run', 'touchfiles']:
        touchfiles_flag = (style == 'touchfiles')
        # Perform the pipeline steps (run the pipeline).
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Force the pipeline to start from here, regardless of whether the
            # stage is up-to-date or not.
            forcedtorun_tasks=forcedTasks,
            # If the style was touchfiles, we will set a flag to bring
            # files up to date without running anything
            touch_files_only=touchfiles_flag,
            # Choose the mode in which ruffus decides how much work needs to be
            # done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph('flowchart.svg',
                                'svg',
                                endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Print a textual description of what the pipelines would do,
        # but don't actually run it.
        pipeline_printout(sys.stdout,
                          endTasks,
                          verbose=5,
                          wrap_width=100000,
                          forcedtorun_tasks=forcedTasks,
                          gnu_make_maximal_rebuild_mode=rebuildMode)
Exemple #13
0
    def test_ruffus(self):

        print("     Run pipeline normally...")
        if self.graph_viz_present:
            pipeline_printout_graph(tempdir + "flowchart.dot", pipeline= "main")
            pipeline_printout_graph(tempdir + "flowchart.jpg",
                                        target_tasks =[subdivide_start],
                                        forcedtorun_tasks = [split_start],
                                        no_key_legend = True)
            pipeline_printout_graph(tempdir + "flowchart.svg", no_key_legend = False, pipeline= "main")
            # Unknown format
            try:
                pipeline_printout_graph(tempdir + "flowchart.unknown", no_key_legend = False, pipeline= "main")
                raise Exception("Failed to throw exception for pipeline_printout_graph unknown extension ")
            except CalledProcessError as err:
                pass
            pipeline_printout_graph(tempdir + "flowchart.unknown", "svg", no_key_legend = False, pipeline= "main")

        else:
            pipeline_printout_graph(tempdir + "flowchart.dot",
                                        target_tasks =[subdivide_start],
                                        forcedtorun_tasks = [split_start],
                                        no_key_legend = True,
                                        pipeline= "main")
Exemple #14
0
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--pipeline-action",
                      dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format",
                      dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p",
                      "--multiprocess",
                      dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e",
                      "--exceptions",
                      dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i",
                      "--terminate",
                      dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d",
                      "--debug",
                      dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s",
                      "--set",
                      dest="variables_to_set",
                      type="string",
                      action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c",
                      "--checksums",
                      dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t",
                      "--is-test",
                      dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange",
                      dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host",
                      dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation",
                      dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(pipeline_action=None,
                        pipeline_format="svg",
                        pipeline_targets=[],
                        multiprocess=40,
                        logfile="pipeline.log",
                        dry_run=False,
                        force=False,
                        log_exceptions=False,
                        exceptions_terminate_immediately=False,
                        debug=False,
                        variables_to_set=[],
                        is_test=False,
                        ruffus_checksums_level=0,
                        rabbitmq_host="saruman",
                        rabbitmq_exchange="ruffus_pipelines",
                        input_validation=False)

    (options, args) = E.Start(parser, add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS[
            "cli_cluster_parallel_environment"] = options.cluster_parallel_environment
        PARAMS[
            "cluster_parallel_environment"] = options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot", "touch",
                                     "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile, mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'
            ))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                #
                #   make sure we are not logging at the same time in
                #   different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=(
                        options.exceptions_terminate_immediately),
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=True,
                             verbose=options.loglevel,
                             checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(options.pipeline_targets,
                             touch_files_only=options.ruffus_checksums_level,
                             verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # these appear to be errors originating within ruffus,
                        # such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub("\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError("pipeline failed with %i errors" %
                                 len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
Exemple #15
0
def run_pipeline():
    #==============================================================================================
    # Config command line parser
    #==============================================================================================
    parser = OptionParser('nextgen_pipeline [option]... pipeline...')
    parser.add_option('-s',
                      '--stage',
                      dest='stage',
                      help='stage of pipeline to run')
    parser.add_option('-t',
                      '--threads',
                      action='store',
                      type='int',
                      dest='threads',
                      default=None,
                      help='number of threads to use')
    parser.add_option('-j',
                      '--jobs',
                      action='store',
                      type='int',
                      dest='jobs',
                      default=None,
                      help='number of jobs to use')
    parser.add_option('-f',
                      '--force',
                      action='store_true',
                      dest='force_run',
                      default=False,
                      help='Force pipeline to run stage')
    parser.add_option('-c',
                      '--config',
                      dest='config',
                      default='pipeline.cfg',
                      help='config file, defaults to pipeline.cfg')
    parser.add_option('-l',
                      '--log',
                      dest='log',
                      default=None,
                      help='path to log file')
    parser.add_option(
        '--graph',
        dest='print_graph',
        default=False,
        action='store_true',
        help='Print a graph of the pipeline rather than running it')

    (options, args) = parser.parse_args()
    if len(args) < 1:
        args = pipelines

    # Configuration parsing
    if not os.path.isfile(options.config):
        parser.error('Could not find config file: %s' % options.config)
    config = SafeConfigParser()
    config.read(options.config)

    for section in config.sections():
        for option in config.options(section):
            CMD_DICT[option] = config.get(section, option)

    # get number of cpus on machine
    ncpus = multiprocessing.cpu_count()

    # load the pipeline(s) request by the user
    pipeline_stages = {}
    for arg in args:
        try:
            pipeline = __import__(PIPELINE_PATH + arg, globals(), locals(),
                                  ['*'])
        except (ImportError, TypeError):
            # either no pipeline was requested or a missing/non-existent
            # pipeline was chosen
            print(parser.usage + '\n')
            show_pipeline_help(pipelines)
        pipeline_stages.update({arg: pipeline.stages_dict})

        # did the user specify a stage
        if options.stage:
            if options.stage not in pipeline_stages[arg].keys():
                # missing or non-existent stage chosen
                show_pipeline_stage_help()
            else:
                start_stage = pipeline_stages[arg][options.stage]
        else:
            # user did not specify a stage, use default
            start_stage = pipeline_stages[arg]['default']

        # user specified job count, capped at the number of cpus
        if options.jobs:
            NUM_JOBS = options.jobs if options.jobs <= ncpus else ncpus
        else:
            NUM_JOBS = config.getint('Threading', 'jobs') \
                    if config.has_option('Threading', 'jobs') else ncpus // 2

        # user specified log file
        if options.log:
            (head, _) = os.path.split(options.log)
            if os.path.exists(head):
                log_fn = options.log
            else:
                print "Unable to write to that log file."
                sys.exit(1)
        else:
            cwd = os.getcwd()
            now = datetime.datetime.now()
            ts = now.strftime('%Y-%m-%d_%H%M%S')
            log_fn = '%s/%s.%s.log' % (cwd, os.path.split(sys.argv[0])[1], ts)

        logger = quick_start_log(log_fn=log_fn)

        logger.debug('pipeline_run: %d jobs' % NUM_JOBS)

        # print the pipeline graph if requested, otherwise run the pipeline
        # (forcing the chosen stage to re-run if --force was given)
        if options.print_graph:
            pipeline_printout_graph(
                'pipeline.jpg',
                'jpg', [start_stage],
                forcedtorun_tasks=([start_stage] if options.force_run else []))
        else:
            pipeline_run(
                [start_stage],
                forcedtorun_tasks=([start_stage] if options.force_run else []),
                multiprocess=NUM_JOBS,
                logger=logger)
Exemple #16
0
def main():

    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # test function for checking input/output passed to job_script and parsing
    # by io_parser
    test_job_function = functions.generate_job_function(
        job_script='src/sh/io_parser',
        job_name='test')

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # bamfiles
    raw_files = [x.path for x in os.scandir('data/bam') if
                 x.name.endswith('.bam') and x.is_file()]

    # subset the files while the pipeline is in development. Make this equal
    # to the raw_files to run the whole pipeline.
    # active_raw_files = [x for x in raw_files if
    #                     'G1' in x or 'G4' in x or 'J1' in x or 'J4' in x]
    active_raw_files = raw_files

    # species short names for vcf splitting
    species_short_names = list(set(
        [os.path.basename(x)[0] for x in active_raw_files]))

    # check that the files exist
    mapped_raw = main_pipeline.originate(
        name='mapped_raw',
        task_func=os.path.isfile,
        output=active_raw_files)

    # genome fasta
    ref_fa = main_pipeline.originate(
        name='ref_fa',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='ref_fa',
            job_type='download'),
        output='data/genome/Osativa_323_v7.0.fa',
        extras=[jgi_logon, jgi_password])

    # indexes
    fa_idx = main_pipeline.transform(
        name='fa_idx',
        task_func=functions.generate_job_function(
            job_script='src/sh/fa_idx',
            job_name='fa_idx',
            job_type='transform',
            cpus_per_task=6),
        input=ref_fa,
        filter=ruffus.suffix(".fa"),
        output=['.dict', '.fa.fai'])

    # annotation
    annot = main_pipeline.originate(
        name='annot',
        task_func=functions.generate_job_function(
            job_script='src/sh/download_genome',
            job_name='annot',
            job_type='download'),
        output=('data/genome/'
                'Osativa_323_v7.0.gene_exons.gffread.rRNAremoved.gtf'),
        extras=[jgi_logon, jgi_password])

    # convert annotation to .bed
    annot_bed = main_pipeline.transform(
        name='annot_bed',
        task_func=functions.generate_job_function(
            job_script='src/sh/annot_bed',
            job_name='annot_bed',
            job_type='transform',
            cpus_per_task=7),
        input=annot,
        filter=ruffus.suffix('.gtf'),
        output='.bed')

    # mark duplicates with picard
    deduped = main_pipeline.transform(
        name='dedupe',
        task_func=functions.generate_job_function(
            job_script='src/sh/mark_duplicates_and_sort',
            job_name='dedupe',
            job_type='transform',
            cpus_per_task=2),
        input=mapped_raw,
        filter=ruffus.regex(r"data/bam/(.*).Aligned.out.bam"),
        output=(r"output/mark_duplicates_and_sort/\1.deduped.bam"))

    # Split'N'Trim and reassign mapping qualities
    split_and_trimmed = main_pipeline.transform(
        name='split_trim',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_trim',
            job_name='split_trim',
            job_type='transform',
            cpus_per_task=2),
        input=deduped,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.formatter(
            "output/mark_duplicates_and_sort/(?P<LIB>.+).deduped.bam"),
        output=["{subdir[0][1]}/split_trim/{LIB[0]}.split.bam"])\
        .follows(fa_idx)

    # we're going to recycle call_variants, merge_variants, filter_variants
    # and analyze_covar so we'll get the functions in advance
    call_variants = functions.generate_queue_job_function(
        job_script='src/sh/call_variants',
        job_name='call_variants')
    merge_variants = functions.generate_job_function(
        job_script='src/sh/merge_variants',
        job_name='merge_variants',
        job_type='transform',
        cpus_per_task=8)
    filter_variants = functions.generate_job_function(
        job_script='src/sh/filter_variants',
        job_name='filter_variants',
        job_type='transform',
        cpus_per_task=1)
    analyze_covar = functions.generate_queue_job_function(
        job_script='src/sh/analyze_covar',
        job_name='analyze_covar')

    # call variants without recalibration tables
    uncalibrated_variants = main_pipeline.transform(
        name='uncalibrated_variants',
        task_func=call_variants,
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, annot_bed]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/variants_uncalibrated/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    uncalibrated_variants_merged = main_pipeline.merge(
        name='uncalibrated_variants_merged',
        task_func=merge_variants,
        input=[uncalibrated_variants, ref_fa],
        output='output/variants_uncalibrated/variants_uncalibrated.vcf.gz')

    # filter variants on un-corrected bamfiles
    uncalibrated_variants_filtered = main_pipeline.transform(
        name='uncalibrated_variants_filtered',
        task_func=filter_variants,
        input=uncalibrated_variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated.vcf.gz'),
        output='_uncalibrated_filtered.vcf.gz')

    # select variant (only recalibrate using passed SNPs)
    uncalibrated_variants_selected = main_pipeline.transform(
        name='uncalibrated_variants_selected',
        task_func=functions.generate_job_function(
            job_script='src/sh/select_variants',
            job_name='select_variants',
            job_type='transform'),
        input=uncalibrated_variants_filtered,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('_uncalibrated_filtered.vcf.gz'),
        output='_uncalibrated_selected.vcf.gz')

    # create recalibration report with filtered variants
    covar_report = main_pipeline.merge(
        name='covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_selected],
        output="output/covar_analysis/recal_data.table")

    # second pass to analyze covariation remaining after recalibration
    second_pass_covar_report = main_pipeline.merge(
        name='second_pass_covar_report',
        task_func=analyze_covar,
        input=[split_and_trimmed, ref_fa, annot_bed,
               uncalibrated_variants_filtered, covar_report],
        output="output/covar_analysis/post_recal_data.table")

    # plot effect of base recalibration
    recal_plot = main_pipeline.transform(
        name='recal_plot',
        task_func=functions.generate_job_function(
            job_script='src/R/recal_plot.R',
            job_name='recal_plot',
            job_type='transform',
            cpus_per_task=1),
        input=second_pass_covar_report,
        filter=ruffus.suffix('post_recal_data.table'),
        add_inputs=ruffus.add_inputs(covar_report),
        output='recalibration_plots.pdf')

    # recalibrate bases using recalibration report
    recalibrated = main_pipeline.transform(
        name='recalibrate',
        task_func=functions.generate_job_function(
            job_script='src/sh/recalibrate',
            job_name='recalibrate',
            job_type='transform',
            cpus_per_task=2),
        input=split_and_trimmed,
        add_inputs=ruffus.add_inputs([ref_fa, covar_report]),
        filter=ruffus.formatter('output/split_trim/(?P<LIB>.+).split.bam'),
        output='{subdir[0][1]}/recal/{LIB[0]}.recal.bam')

    # final variant calling
    variants = main_pipeline.transform(
        name='variants',
        task_func=call_variants,
        input=recalibrated,
        add_inputs=ruffus.add_inputs(ref_fa, annot_bed),
        filter=ruffus.formatter('output/recal/(?P<LIB>.+).recal.bam'),
        output='{subdir[0][1]}/variants/{LIB[0]}.g.vcf.gz')

    # merge gVCF variants
    variants_merged = main_pipeline.merge(
        name='variants_merged',
        task_func=merge_variants,
        input=[variants, ref_fa],
        output='output/variants/variants.vcf.gz')

    # variant filtering
    variants_filtered = main_pipeline.transform(
        name='variants_filtered',
        task_func=filter_variants,
        input=variants_merged,
        add_inputs=ruffus.add_inputs(ref_fa),
        filter=ruffus.suffix('.vcf.gz'),
        output='_filtered.vcf.gz')

    # variants by species
    split_variants = main_pipeline.subdivide(
        name='split_variants',
        task_func=functions.generate_job_function(
            job_script='src/sh/split_variants',
            job_name='split_variants',
            job_type='transform',
            cpus_per_task=1,
            ntasks=len(species_short_names)),
        input=variants_filtered,
        filter=ruffus.formatter(),
        add_inputs=ruffus.add_inputs(ref_fa),
        output=[('output/split_variants/' + x + '.variants_filtered.vcf.gz')
                for x in species_short_names])

    # count variants per gene per species
    cds_variants = main_pipeline.transform(
        name='cds_variants',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_variants.R',
            job_name='cds_variants',
            job_type='transform'),
        input=split_variants,
        add_inputs=ruffus.add_inputs([ref_fa, annot]),
        filter=ruffus.formatter(
            'output/split_variants/(?P<LIB>.+).variants_filtered.vcf.gz'),
        output='{subdir[0][1]}/cds_variants/{LIB[0]}.cds_variants.Rds')

    # merge counted variants
    variants_per_gene = main_pipeline.merge(
        name='cds_merge',
        task_func=functions.generate_job_function(
            job_script='src/R/cds_merge.R',
            job_name='cds_merge',
            job_type='transform'),
        input=cds_variants,
        output='output/cds_variants/cds_variants.Rds')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="5 accessions variant calling pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
Exemple #17
0
def main():

    #########
    # SETUP #
    #########

    # catch jgi logon and password from cli
    parser = ruffus.cmdline.get_argparse(
        description='5 accessions variant calling pipeline.')
    parser.add_argument('--email', '-e',
                        help='Logon email address for JGI',
                        type=str,
                        dest='jgi_logon')
    parser.add_argument('--password', '-p',
                        help='JGI password',
                        type=str,
                        dest='jgi_password')
    options = parser.parse_args()
    jgi_logon = options.jgi_logon
    jgi_password = options.jgi_password

    ##################
    # PIPELINE STEPS #
    ##################

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines["main"]

    # test originate job
    test_originate_files = ['ruffus/foo.txt', 'ruffus/bar.txt']
    test_originate = main_pipeline.originate(
        name='test_originate',
        task_func=functions.generate_job_function(
            job_script='src/test_originate',
            job_name='test_originate',
            job_type='originate'),
        output=test_originate_files)

    # test download job
    if not (jgi_logon and jgi_password):
        raise ValueError('Supply jgi_logon and jgi_password')
    test_download = main_pipeline.originate(
        name='test_download',
        task_func=functions.generate_job_function(
            job_script='src/test_download',
            job_name='test_download',
            job_type='download'),
        output='ruffus/download.txt',
        extras=[jgi_logon, jgi_password])

    # test transform with multiple outputs (e.g. bamfile, FASTA etc)
    test_transform = main_pipeline.transform(
        name="test_transform",
        task_func=functions.generate_job_function(
            job_script='src/test_transform',
            job_name='test_transform',
            job_type='transform'),
        input=test_originate,
        filter=ruffus.suffix(".txt"),
        output=["_transformed.txt", "_transformed.bam"])

    # Transform ONLY the bam files produced by test_transform

    # The filtering here is a bit crazy. `input` has to be an object, not
    # ruffus.output_from(). `replace_inputs` should use `ruffus.inputs()` to
    # match the files, but `filter` has to match the first file produced by
    # the previous step, NOT necessarily the file that will be transformed!
    test_selective_transform = main_pipeline.transform(
        name="test_selective_transform",
        task_func=functions.generate_job_function(
            job_script='src/test_selective_transform',
            job_name='test_selective_transform',
            job_type='transform'),
        input=test_transform,
        replace_inputs=ruffus.inputs(r"\1.bam"),
        filter=ruffus.suffix(".txt"),
        output=".bof")

    test_merge = main_pipeline.merge(
        name='test_merge',
        task_func=functions.generate_job_function(
            job_script='src/test_merge',
            job_name='test_merge',
            job_type='merge'),
        input=test_transform,
        output='ruffus/foobar_merge.txt'
        )

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph(
        "ruffus/flowchart.pdf", "pdf",
        pipeline_name="Ruffus proforma pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=8)
Exemple #18
0
        traceback.print_exc()

@ruffus.follows(*[kronos_component_docker_TASK_A_function, ])
def __last_task___function():
    pass

#================================================================================
#main body
#--------------------------------------------------------------------------------
try:
    if not args.print_only:
        ruffus.pipeline_run(__last_task___function, multithread=args.num_jobs, verbose=0)
    else:
        cwd = os.getcwd()
        os.chdir(rm.pipeline_dir)
        ruffus.pipeline_printout_graph(
            args.pipeline_name + '.' + args.extension,
            args.extension,
            [__last_task___function],
            draw_vertically=args.draw_vertically,
            no_key_legend=args.no_key_legend,
            user_colour_scheme={
                'colour_scheme_index': 1,
                'Pipeline': {'fontcolor': '"#FF3232"'},
                'Task to run': {'linecolor': '"#0044A0"'},
                'Key': {'fontcolor': 'Red', 'fillcolor': '"#F6F4F4"'},
                'Final target': {'fontcolor': 'black',
                                 'fillcolor': '"#EFA03B"',
                                 'dashed': 0}})
        os.chdir(cwd)

    lrc = flushqueue(job_rcs)
    if all(rc == 0 for rc in lrc):
        EXIT_CODE = 0
    else:
        EXIT_CODE = 98

except:
    exc_type, exc_obj, exc_tb = sys.exc_info()
    ##exception object is of type <class 'ruffus.ruffus_exceptions.RethrownJobError'>.
    ##exc_obj.args[0][3] gives the message in the original exception.
    if exc_obj.args[0][3] == '(breakpoint)':
        print('breakpoint happened in %s pipeline' % args.pipeline_name)
        ljm.kill_all()
Exemple #19
0
def main():

    # parse CLI
    parser = ruffus.cmdline.get_argparse(
        description='Vv mtDNA assembly pipeline')
    parser.add_argument('--email',
                        '-e',
                        help='Email address, reported to NCBI',
                        type=str,
                        dest='email')

    options = parser.parse_args()

    # store the email variable for logon
    if options.email:
        os.environ['NCBI_EMAIL'] = options.email

    # initialise pipeline
    main_pipeline = ruffus.Pipeline.pipelines['main']

    # TEST FUNCTION
    test_job_function = tompltools.generate_job_function(
        job_script='src/sh/io_parser', job_name='test', verbose=True)

    # download COI seed file
    download_coi_fasta = main_pipeline.originate(
        name='download_coi_fasta.py',
        task_func=tompltools.generate_job_function(
            job_type='originate',
            job_script='src/py/download_coi_fasta.py',
            job_name='download_coi_fasta.py'),
        output='data/GU207861.1.fasta')

    # define files
    sample_list = 'data/samples.txt'

    with open(sample_list, 'r') as f:
        csvreader = csv.reader(f)
        next(csvreader)
        file_list = {x[0]: [x[1], x[2]] for x in csvreader}

    pe_filenames = file_list['pe']
    mp_filenames = file_list['mp']

    pe_files = find_all(pe_filenames, 'data')

    # filter out weird hidden directories, what are these anyway?
    pe_files_filtered = [x for x in pe_files if '/.' not in x]

    # load files into ruffus
    raw_fq_files = main_pipeline.originate(name='raw_fq_files',
                                           task_func=os.path.isfile,
                                           output=pe_files_filtered)

    # trim adaptors
    trim_bbduk = main_pipeline.merge(
        name='trim_bbduk',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/trim_bbduk',
            job_name='trim_bbduk',
            cpus_per_task=8),
        input=raw_fq_files,
        output='output/trim_bbduk/pe_trimmed.fastq.gz')

    # subsample
    # something like ['bof' + str(i) for i in range(1,4)]
    number_of_repeats = 5
    subsample_reads = main_pipeline.subdivide(
        name='subsample_reads',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/subsample_reads',
            job_name='subsample_reads',
            ntasks=number_of_repeats),
        input=trim_bbduk,
        filter=ruffus.formatter(),
        output=([
            'output/subsample_reads/pe_trimmed_subsampled_' + str(i) +
            '.fastq.gz' for i in range(1, number_of_repeats + 1)
        ]))

    # run mitobim
    mitobim_quick = main_pipeline.transform(
        name='run_mitobim',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/run_mitobim', job_name='run_mitobim'),
        input=subsample_reads,
        add_inputs=ruffus.add_inputs(download_coi_fasta),
        filter=ruffus.formatter(
            r'output/subsample_reads/pe_trimmed_subsampled_'
            r'(?P<RN>\d).fastq.gz'),
        output='output/mitobim_quick_{RN[0]}/mitobim.log.txt')

    # re-fish with longest assembly
    find_longest_assembly = main_pipeline.originate(
        name='find_longest_assembly',
        task_func=tompltools.generate_job_function(
            job_type='originate',
            job_script='src/py/find_longest_assembly.py',
            job_name='find_longest_assembly'),
        output='output/longest_quick_scaffold.fasta')\
        .follows(mitobim_quick)

    mitobim_full = main_pipeline.transform(
        name='mitobim_full',
        task_func=tompltools.generate_job_function(
            job_script='src/sh/run_mitobim', job_name='run_mitobim'),
        input=trim_bbduk,
        add_inputs=ruffus.add_inputs(find_longest_assembly),
        filter=ruffus.formatter(),
        output='output/mitobim_full/mitobim.log.txt')

    ###################
    # RUFFUS COMMANDS #
    ###################

    # print the flowchart
    ruffus.pipeline_printout_graph("ruffus/flowchart.pdf",
                                   "pdf",
                                   pipeline_name="Vv mtDNA assembly pipeline")

    # run the pipeline
    ruffus.cmdline.run(options, multithread=32)
Exemple #20
0
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format, ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitely as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info(
        'the report is available at %s' %
        os.path.abspath(os.path.join(params['report_html'], "contents.html")))
Exemple #21
0
#                                   tumour_pileup_gz=inputs[1],
#                                   output_prefix=output_prefix))
#     execute(cmd1, flag=flag1)
#     cmd2 = 'bgzip -f {0} && tabix -p vcf {1}'.format(
#         snp_vcf.replace('.gz', ''), snp_vcf)
#     execute(cmd2, flag=flag2)
#     cmd3 = 'bgzip -f {0} && tabix -p vcf {1}'.format(
#         indel_vcf.replace('.gz', ''), indel_vcf)
#     execute(cmd3, flag=flag3)


if __name__ == "__main__":
    for _ in R.pipeline_get_task_names():
        print(_)
    print(os.environ['PWD'])
    print('=' * 79)

    # parser = R.cmdline.get_argparse(
    #     description="krvarscan",
    #     usage='require python-2.7.x',
    #     version='0.1')
    # options = parser.parse_args()
    logger, logger_mutex = R.cmdline.setup_logging(
        __name__, OPTIONS.log_file, OPTIONS.verbose)

    logger.info(OPTIONS.verbose)
    # with logger_mutex:
    #     logger.info("Look Ma. No hands")
    R.pipeline_printout_graph('lele.svg', draw_vertically=True)
    R.cmdline.run(OPTIONS)
Exemple #22
0
                # Running the task
                job.execute(curr_options, out_dir=out_dir)

            # Setting the attribute for the new function so that it can be
            # pickled
            setattr(__main__, func_name, curr_step)

            # Updating the in_job and the last suffix only if the tool produces
            # usable data
            if job.produce_usable_data():
                in_job = curr_step
                last_suffix += ".{}".format(job.get_suffix())

            # Adding the current job to the pipeline
            job_order.append(curr_step)

        # Printing the pipeline
        print("Running the pipeline...")
        pipeline_printout_graph("flowchart.{}".format(args.flowchart_format),
                                args.flowchart_format, job_order)
        pipeline_run(job_order,
                     verbose=0,
                     multiprocess=args.nb_process,
                     checksum_level=1)

    except KeyboardInterrupt:
        print("Cancelled by user", sys.stderr)
        sys.exit(0)
    except ProgramError as e:
        parser.error(e.message)
Exemple #23
0
def main():

    args = get_cmdline_args()

    # We want to look for modules in the directory local to the pipeline,
    # just as if the pipeline script had been called directly.
    # This includes the script itself and the config files imported by getOptions
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # options must be set before pipeline is imported
    options = getOptions(args)
    setOptions(options)

    # import the pipeline so its stages are defined
    # the name of the pipeline is given on the command line
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()
    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']
    if pipelineOptions['rebuild'] == 'fromstart':
        rebuildMode = True
    elif pipelineOptions['rebuild'] == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True
    if style in ['run', 'touchfiles']:
        touchfiles_flag = (style == 'touchfiles')
        # Perform the pipeline steps (run the pipeline).
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Force the pipeline to start from here, regardless of whether the
            # stage is up-to-date or not.
            forcedtorun_tasks=forcedTasks,
            # If the style was touchfiles, we will set a flag to bring 
            # files up to date without running anything
            touch_files_only=touchfiles_flag,
            # Choose the mode in which ruffus decides how much work needs to be
            # done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph(
            'flowchart.svg',
            'svg',
            endTasks,
            no_key_legend=False)
    elif style == 'print':
        # Print a textual description of what the pipelines would do,
        # but don't actually run it.
        pipeline_printout(
            sys.stdout,
            endTasks,
            verbose=5,
            wrap_width=100000,
            forcedtorun_tasks=forcedTasks,
            gnu_make_maximal_rebuild_mode=rebuildMode)