def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
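
Every `main` entry point in these examples follows the same pattern: default `log_dir` to a `log` directory under the working directory, then call `setup_logging(config)` before handing off to `run_main`. As a rough illustration only, a minimal stand-in for such a helper could be built on the standard library `logging` module (the real bcbio implementation is logbook-based, so treat this sketch as an assumption):

import logging
import os


def setup_logging_sketch(config):
    """Hypothetical stand-in for setup_logging: log to the console and to a
    file under config['log_dir']."""
    log_dir = config.get("log_dir") or os.getcwd()
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    fmt = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
    for handler in (logging.StreamHandler(),
                    logging.FileHandler(os.path.join(log_dir, "pipeline.log"))):
        handler.setFormatter(fmt)
        root.addHandler(handler)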
Example #2
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)

    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)

    #for item in lane_items:
        #utils.prettyprint_dict(item)

    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> Realign samples")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> Variant calling")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> Postprocess variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> Combine multiple callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> Detect structural variants")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> Combine calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> Process samples")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
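
The body above is driven almost entirely by `run_parallel(name, items)` calls, where each task takes one packed argument and returns a list of results. As a mental model only (an assumption, not the actual `parallel_runner`), a serial stand-in would look like this:

def make_serial_runner(task_module):
    """Hypothetical serial stand-in for the callable returned by
    parallel_runner: look up the task by name and apply it to each item."""
    def run_parallel(fn_name, items, extra=None):
        fn = getattr(task_module, fn_name)
        out = []
        for args in items:
            if args:
                data = fn(args)  # tasks here take one packed argument
                if data:
                    out.extend(data)
        return out
    return run_parallel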
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
Example #4
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"],
                                           dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
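
The pairing step presumably groups the processed lane items by the pipeline object that should handle them; a minimal sketch under that assumption (the grouping key and `get_pipeline` lookup are hypothetical):

from collections import defaultdict


def pair_lanes_with_pipelines_sketch(lane_items, get_pipeline):
    """Hypothetical grouping of lane items by pipeline, so each pipeline's
    run() receives only the items it is responsible for."""
    pipelines = defaultdict(list)
    for item in lane_items:
        pipelines[get_pipeline(item)].append(item)
    return dict(pipelines)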
def main(config_file, fc_dir, run_info_yaml=None, num_cores=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    if num_cores:
        config["algorithm"]["num_cores"] = int(num_cores)
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
Example #6
def main(config_file, fc_dir, run_info_yaml=None, num_cores=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    if num_cores:
        config["algorithm"]["num_cores"] = int(num_cores)
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
Example #7
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    if config.get("qcdb", None) is None:
        sys.exit()
    else:
        qcdb_config = config.get("qcdb", {})
    analysis = config.get("analysis", {})
    setup_logging(config)
    qcdb_store_dir = qcdb_config.get("qcdb_store_dir", None)
    run_main(fc_dir, qcdb_store_dir)
Example #8
 def setUp(self):
     self.data_dir = os.path.join(os.path.dirname(__file__), "data", "automated")
     config_file = os.path.join(self.data_dir, "post_process-statusdb.yaml")
     config = load_config(config_file)
     setup_logging(config)
     fc_date = "110106"
     fc_name = "FC70BUKAAXX"
     run_info_yaml = os.path.join(self.data_dir, "run_info.yaml")
     workdir = os.path.join(os.path.dirname(__file__), "110106_FC70BUKAAXX")
     fc_dir = os.path.join(self.data_dir, os.pardir, "110106_FC70BUKAAXX")
Example #9
 def setUp(self):
     self.data_dir = os.path.join(os.path.dirname(__file__), "data",
                                  "automated")
     config_file = os.path.join(self.data_dir, "post_process-statusdb.yaml")
     config = load_config(config_file)
     setup_logging(config)
     fc_date = "110106"
     fc_name = "FC70BUKAAXX"
     run_info_yaml = os.path.join(self.data_dir, "run_info.yaml")
     workdir = os.path.join(os.path.dirname(__file__), "110106_FC70BUKAAXX")
     fc_dir = os.path.join(self.data_dir, os.pardir, "110106_FC70BUKAAXX")
Example #10
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(
        os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % fn_name)
    fn = getattr(
        __import__("{base}.ipythontasks".format(base=parallel["module"]),
                   fromlist=["ipythontasks"]), fn_name)
    queue_type = _get_queue_type(fn)
    if queue_type:
        parallel = dictadd(parallel, "queue_type", queue_type)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a multicore queue with available cores on the same machine
    elif queue_type == "multicore":
        logger.info("ipython: %s -- multicore" % fn_name)
        with cluster_view(parallel) as view:
            for args in items:
                if args:
                    data = view.apply_sync(fn, args)
                    if data:
                        out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s -- parallel" % fn_name)
        with cluster_view(parallel) as view:
            xs = [x for x in items if x is not None]
            if len(xs) > 0:
                for data in view.map_sync(fn, xs):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
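
A hypothetical invocation of `runner`; the keys in the `parallel` dictionary below are guesses based on what the function body reads (`module` for task lookup, plus scheduler details consumed by `cluster_view`), not a documented API:

import os

# Hypothetical example values; the real pipeline builds these from its
# post_process.yaml configuration.
config = {"log_dir": os.path.join(os.getcwd(), "log")}
parallel = {"module": "bcbio.distributed",  # package providing ipythontasks
            "scheduler": "lsf",             # assumed key read by cluster_view
            "queue": "batch",
            "cores": 8}
items = [[{"config": config}], [{"config": config}]]  # one packed argument per call
results = runner(parallel, "process_alignment", items, os.getcwd(), config)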
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")

    def insert_command(record):
        record.extra["command"] = sys.argv
        record.extra["version"] = version.get_pipeline_version()

    setup_logging(config)
    handler = create_log_handler(config)
    with handler, logbook.Processor(insert_command):

        run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
Example #12
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #13
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    queue_type = _get_queue_type(fn)
    if queue_type:
        parallel = dictadd(parallel, "queue_type", queue_type)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a multicore queue with available cores on the same machine
    elif queue_type == "multicore":
        logger.info("ipython: %s -- multicore" % fn_name)
        with cluster_view(parallel, config) as view:
            for args in items:
                if args:
                    data = view.apply_sync(fn, args)
                    if data:
                        out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s -- parallel" % fn_name)
        with cluster_view(parallel, config) as view:
            xs = [x for x in items if x is not None]
            if len(xs) > 0:
                for data in view.map_sync(fn, xs):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
Example #14
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")

    def insert_command(record):
        record.extra["command"] = sys.argv
        record.extra["version"] = version.get_pipeline_version()

    setup_logging(config)
    handler = create_log_handler(config)
    with handler, \
         logbook.Processor(insert_command):

        run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
Example #15
def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
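
`create_celeryconfig` is used as a context manager around worker start-up; under the assumption that it writes a temporary `celeryconfig.py` into the work directory and cleans it up afterwards, a minimal sketch could be:

import contextlib
import os


@contextlib.contextmanager
def create_celeryconfig_sketch(task_module, dirs, config, config_file):
    """Hypothetical sketch: write a throwaway celeryconfig.py pointing the
    worker at the task module, then remove it when the worker exits."""
    out_file = os.path.join(dirs["work"], "celeryconfig.py")
    with open(out_file, "w") as out_handle:
        out_handle.write('CELERY_IMPORTS = ("%s",)\n' % task_module)
        out_handle.write('BCBIO_CONFIG_FILE = "%s"\n' % config_file)
    try:
        yield out_file
    finally:
        if os.path.exists(out_file):
            os.remove(out_file)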
Example #16
def _setup_logging(args):
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            if isinstance(config, dict) and "config" in config:
                config = config["config"]
                break
            elif isinstance(config, dict) and "algorithm" in config:
                break
            else:
                config = None
        setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
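
As written, `_setup_logging` is a generator: the caller is expected to wrap it with `contextlib.contextmanager` so task code runs inside the `try`/`except` and unexpected errors are logged before being re-raised. A self-contained illustration of that pattern (the stub body stands in for the config detection above, and the wiring is an assumption):

import contextlib
import logging

logger = logging.getLogger(__name__)


@contextlib.contextmanager
def _setup_logging_sketch(args):
    # detect the config in args and call setup_logging(config) here
    try:
        yield None
    except Exception:
        logger.exception("Unexpected error")
        raise


def some_task(args):
    return [args]


with _setup_logging_sketch([{"config": {}}]):
    result = some_task({"config": {}})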
Example #17
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(
        os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(
        __import__("{base}.ipythontasks".format(base=parallel["module"]),
                   fromlist=["ipythontasks"]), fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(
                    parallel["scheduler"].lower(),
                    parallel["queue"],
                    parallel["num_jobs"],
                    parallel["cores_per_job"],
                    profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
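
`dictadd` presumably returns a copy of the parallel settings with one extra key rather than mutating the shared dictionary, which is why its return value is reassigned each time; a sketch under that assumption:

def dictadd_sketch(orig, key, value):
    """Hypothetical dictadd: shallow-copy the dictionary and add one key,
    leaving the original parallel settings untouched."""
    new = dict(orig)
    new[key] = value
    return new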
Example #18
def _setup_logging(args):
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            if isinstance(config, dict) and "config" in config:
                config = config["config"]
                break
            elif isinstance(config, dict) and "algorithm" in config:
                break
            else:
                config = None
        setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
Example #19
def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
Example #20
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up clusters
    for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                              parallel["num_jobs"], parallel["cores_per_job"],
                                              profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
Example #21
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if ipython.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif ipython.is_std_config_arg(arg):
            config = arg
            break
    if config is not None:
        setup_logging(config)
    else:
        raise NotImplementedError("No config in %s:" % args[0])
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
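
The two helper predicates from `bcbio.distributed.ipython` presumably distinguish a bare pipeline config dictionary from an argument that nests one under a "config" key; a sketch under that assumption (the exact checks may differ):

def is_std_config_arg_sketch(x):
    """Hypothetical check for a bare config dictionary."""
    return isinstance(x, dict) and "algorithm" in x and "resources" in x


def is_nested_config_arg_sketch(x):
    """Hypothetical check for an argument carrying a config under 'config'."""
    return isinstance(x, dict) and is_std_config_arg_sketch(x.get("config"))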
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None, project_desc=None, lanes=None, barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)

    if project_desc is None and lanes is None:
        logger.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()

    if options.customer_delivery and fc_alias != "":
        logger.info("INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    with open(run_info_yaml) as fp:
        run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name, fc_date, run_info_structure, fc_dir=fc_dir, fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc, exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if project_desc is not None:
            logger.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % ("\n\t".join(original_fc.get_project_names()))
            sys.exit()
        if lanes is not None:
            logger.error("No lanes found with numbers %s: please check your flowcell run information" % " ".join(lanes))
            sys.exit()
    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(), pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/data", rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(), pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    analysis_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/intermediate", rawdata_fc.get_fc_id())))

    # If customer delivery setup some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, project_desc, rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc
    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
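
`_make_delivery_directory` is not shown; assuming it simply creates the flowcell's delivery directory if needed, a minimal sketch:

import os


def _make_delivery_directory_sketch(fc):
    """Hypothetical sketch: ensure the delivery directory for a
    PostProcessedFlowcell exists."""
    outdir = fc.get_fc_dir()
    if not os.path.exists(outdir):
        os.makedirs(outdir)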
Example #23
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs,
                               pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
Example #24
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #25
 def _worker(**kwds):
     setup_logging(config)
def main(config_file, delivery_dir, run_info_yaml, analysis_dir=None):
    if analysis_dir is None:
        analysis_dir = os.path.abspath(os.path.curdir)
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(analysis_dir, "log")
    setup_logging(config)

    if not os.path.exists(analysis_dir):
        logger.error("No analysis directory found!")
        sys.exit()

    with open(run_info_yaml) as fp:
        run_info_structure = yaml.load(fp)

    lane2sample = dict()
    infiles = dict()
    for info in run_info_structure["details"]:
        lane2sample[info["lane"]] = info["description"]
        infiles[info["lane"]] = dict(vcf=[], bam=[], bigwig=[], metrics=[])
        # Vcf files, tsv and tranches
        vcftypes = ("*.vcf", "*.idx", "*.tranches", "*.eval", "*.tsv")
        for vcftype in vcftypes:
            glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + vcftype)
            infiles[info["lane"]]["vcf"].extend(glob.glob(glob_str))
        # Bam files
        glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + options.bam_glob)
        bamfiles = glob.glob(glob_str)
        infiles[info["lane"]]["bam"] = bamfiles
        # Bigwig files
        glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + "*.bigwig")
        bigwigfiles = glob.glob(glob_str)
        infiles[info["lane"]]["bigwig"] = bigwigfiles
        # metrics files
        glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + "*metrics")
        metricsfiles = glob.glob(glob_str)
        infiles[info["lane"]]["metrics"] = metricsfiles

    # snpEff files
    glob_str = os.path.join(analysis_dir, "snpEff*")
    snpeff_files = glob.glob(glob_str)

    # Loop through the list and deliver if appropriate
    _make_dir(delivery_dir)
    _deliver_file(os.path.join(analysis_dir, "project-summary.csv"), os.path.join(delivery_dir, "project-summary.csv"))
    _deliver_file(os.path.join(analysis_dir, "run_summary.yaml"), os.path.join(delivery_dir, "run_summary.yaml"))
    _deliver_file(run_info_yaml, os.path.join(delivery_dir, os.path.basename(run_info_yaml)))
    if not options.no_vcf:
        for sf in snpeff_files:
            _deliver_file(sf, os.path.join(delivery_dir, os.path.basename(sf)))
    for lane_num in infiles.keys():
        lane = infiles[lane_num]
        if not options.no_vcf:
            for vcf in lane["vcf"]:
                (src, tgt) = _rename_sample_file(vcf, lane_num, lane2sample[lane_num], delivery_dir)
                _deliver_file(src, tgt)
        if not options.no_bigwig:
            for bigwig in lane["bigwig"]:
                (src, tgt) = _rename_sample_file(bigwig, lane_num, lane2sample[lane_num], delivery_dir)
                _deliver_file(src, tgt)
        if not options.no_metrics:
            for metrics in lane["metrics"]:
                (src, tgt) = _rename_sample_file(metrics, lane_num, lane2sample[lane_num], delivery_dir)
                _deliver_file(src, tgt)
        if options.bam:
            for bamfile in lane["bam"]:
                (src, tgt) = _rename_sample_file(bamfile, lane_num, lane2sample[lane_num], delivery_dir)
                _deliver_file(src, tgt)
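
The delivery helpers themselves are not shown. Under the assumption that `_rename_sample_file` maps the lane-number prefix to the sample name inside the delivery directory and `_deliver_file` copies the file into place, minimal sketches could be:

import os
import shutil


def _rename_sample_file_sketch(fname, lane_num, sample_name, delivery_dir):
    """Hypothetical sketch: turn '<lane>_rest.ext' into '<sample>_rest.ext'
    inside delivery_dir."""
    base = os.path.basename(fname)
    prefix = "%s_" % lane_num
    if base.startswith(prefix):
        base = "%s_%s" % (sample_name, base[len(prefix):])
    return fname, os.path.join(delivery_dir, base)


def _deliver_file_sketch(src, tgt):
    """Hypothetical sketch: copy the source into the delivery area unless it
    is already there."""
    if os.path.exists(src) and not os.path.exists(tgt):
        shutil.copyfile(src, tgt)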
def main(config_file,
         fc_dir,
         project_dir,
         run_info_yaml=None,
         fc_alias=None,
         project_desc=None,
         lanes=None,
         barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)

    if project_desc is None and lanes is None:
        logger.error(
            "No project description or lanes provided: cannot deliver files without this information"
        )
        sys.exit()

    if options.customer_delivery and fc_alias != "":
        logger.info(
            "INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    with open(run_info_yaml) as fp:
        run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name,
                                        fc_date,
                                        run_info_structure,
                                        fc_dir=fc_dir,
                                        fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc,
                                             exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if project_desc is not None:
            logger.error(
                "No lanes found with matching description %s: please check your flowcell run information"
                % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % ("\n\t".join(
                original_fc.get_project_names()))
            sys.exit()
        if lanes is not None:
            logger.error(
                "No lanes found with numbers %s: please check your flowcell run information"
                % " ".join(lanes))
            sys.exit()
    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                       pruned_fc.get_fc_date(),
                                       pruned_fc.to_structure()['details'],
                                       fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(
        os.path.abspath(
            os.path.join(project_dir, "nobackup/data",
                         rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                        pruned_fc.get_fc_date(),
                                        pruned_fc.to_structure()['details'],
                                        fc_alias=fc_alias)
    analysis_fc.set_fc_dir(
        os.path.abspath(
            os.path.join(project_dir, "nobackup/intermediate",
                         rawdata_fc.get_fc_id())))

    # If customer delivery setup some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(
            os.path.abspath(
                os.path.join(project_dir, project_desc,
                             rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc
    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
def main(config_file, delivery_dir, run_info_yaml, analysis_dir=None):
    if analysis_dir is None:
        analysis_dir = os.path.abspath(os.path.curdir)
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(analysis_dir, "log")
    setup_logging(config)

    if not os.path.exists(analysis_dir):
        logger.error("No analysis directory found!")
        sys.exit()

    with open(run_info_yaml) as fp:
        run_info_structure = yaml.load(fp)

    lane2sample = dict()
    infiles = dict()
    for info in run_info_structure['details']:
        lane2sample[info['lane']] = info['description']
        infiles[info['lane']] = dict(vcf=[], bam=[], bigwig=[], metrics=[])
        # Vcf files, tsv and tranches
        vcftypes = ('*.vcf', '*.idx', '*.tranches', '*.eval', '*.tsv')
        for vcftype in vcftypes:
            glob_str = os.path.join(analysis_dir,
                                    str(info['lane']) + "_" + vcftype)
            infiles[info['lane']]['vcf'].extend(glob.glob(glob_str))
        # Bam files
        glob_str = os.path.join(analysis_dir,
                                str(info['lane']) + "_" + options.bam_glob)
        bamfiles = glob.glob(glob_str)
        infiles[info['lane']]['bam'] = bamfiles
        # Bigwig files
        glob_str = os.path.join(analysis_dir,
                                str(info['lane']) + "_" + "*.bigwig")
        bigwigfiles = glob.glob(glob_str)
        infiles[info['lane']]['bigwig'] = bigwigfiles
        # metrics files
        glob_str = os.path.join(analysis_dir,
                                str(info['lane']) + "_" + "*metrics")
        metricsfiles = glob.glob(glob_str)
        infiles[info['lane']]['metrics'] = metricsfiles

    # snpEff files
    glob_str = os.path.join(analysis_dir, "snpEff*")
    snpeff_files = glob.glob(glob_str)

    # Loop through the list and deliver if appropriate
    _make_dir(delivery_dir)
    _deliver_file(os.path.join(analysis_dir, "project-summary.csv"),
                  os.path.join(delivery_dir, "project-summary.csv"))
    _deliver_file(os.path.join(analysis_dir, "run_summary.yaml"),
                  os.path.join(delivery_dir, "run_summary.yaml"))
    _deliver_file(run_info_yaml,
                  os.path.join(delivery_dir, os.path.basename(run_info_yaml)))
    if not options.no_vcf:
        for sf in snpeff_files:
            _deliver_file(sf, os.path.join(delivery_dir, os.path.basename(sf)))
    for lane_num in infiles.keys():
        lane = infiles[lane_num]
        if not options.no_vcf:
            for vcf in lane['vcf']:
                (src, tgt) = _rename_sample_file(vcf, lane_num,
                                                 lane2sample[lane_num],
                                                 delivery_dir)
                _deliver_file(src, tgt)
        if not options.no_bigwig:
            for bigwig in lane['bigwig']:
                (src, tgt) = _rename_sample_file(bigwig, lane_num,
                                                 lane2sample[lane_num],
                                                 delivery_dir)
                _deliver_file(src, tgt)
        if not options.no_metrics:
            for metrics in lane['metrics']:
                (src, tgt) = _rename_sample_file(metrics, lane_num,
                                                 lane2sample[lane_num],
                                                 delivery_dir)
                _deliver_file(src, tgt)
        if options.bam:
            for bamfile in lane['bam']:
                (src, tgt) = _rename_sample_file(bamfile, lane_num,
                                                 lane2sample[lane_num],
                                                 delivery_dir)
                _deliver_file(src, tgt)
Example #29
 def _worker(**kwds):
     setup_logging(config)