Example #1
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    config = load_config(config_file)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'],
                                  config['galaxy_api_key'])
                  if config.has_key("galaxy_api_key") else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)

    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name,
         fname_out) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(
            select_upload_files(local_name, bc_id, fc_dir, analysis_dir,
                                config, fname_out))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(
                    library_id, base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name,
                                        upload_files, cur_galaxy_files, config,
                                        config_file, fname_out)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey,
                                                  access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)
Example #2
File: main.py Project: kevyin/bcbb
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)

    logger.info (">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    
    #for item in lane_items:
        #utils.prettyprint_dict(item)

    logger.info (">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info (">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info (">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info (">>> realign sample")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info (">>> variantcall")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info (">>> postprocess_variatns")
    samples = run_parallel("postprocess_variants", samples)
    logger.info (">>> combine_multiple_calles")
    samples = combine_multiple_callers(samples)
    logger.info (">>> detect_sv")
    samples = run_parallel("detect_sv", samples)
    logger.info (">>> combine_calls")
    samples = run_parallel("combine_calls", samples)
    logger.info (">>> process_sample")
    run_parallel("process_sample", samples)
    logger.info (">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info (">>> Writing project summary")
    write_project_summary(samples)
    logger.info (">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info (">>> Done")
Example #3
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": align_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir,
    }
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #4
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
                  if config.has_key("galaxy_api_key") else None)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)

    base_folder_name = "%s_%s" % (fc_date, fc_name)
    run_details = lims_run_details(run_info, base_folder_name)
    for (library_name, access_role, dbkey, lane, bc_id, name, desc,
            local_name) in run_details:
        library_id = (get_galaxy_library(library_name, galaxy_api)
                      if library_name else None)
        upload_files = list(select_upload_files(local_name, bc_id, fc_dir,
                                                analysis_dir, config))
        if len(upload_files) > 0:
            print lane, bc_id, name, desc, library_name
            print "Creating storage directory"
            if library_id:
                folder, cur_galaxy_files = get_galaxy_folder(library_id,
                               base_folder_name, name, desc, galaxy_api)
            else:
                cur_galaxy_files = []
            store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files,
                                        cur_galaxy_files, config, config_file)
            if store_dir and library_id:
                print "Uploading directory of files to Galaxy"
                print galaxy_api.upload_directory(library_id, folder['id'],
                                                  store_dir, dbkey, access_role)
    if galaxy_api and not run_info_yaml:
        add_run_summary_metrics(analysis_dir, galaxy_api)
Example #5
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": align_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #6
def run_main(config, config_file, fc_dir, run_info_yaml):
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")

    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    # process each flowcell lane
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = _run_parallel("process_lane", lanes, dirs, config, config_file)

    # upload the demultiplex counts to Google Docs
    create_bc_report_on_gdocs(fc_date, fc_name, work_dir, run_info, config)

    align_items = _run_parallel("process_alignment", lane_items, dirs, config,
                                config_file)

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = \
            organize_samples(dirs, fc_name, fc_date, run_items, align_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config, config_file)
               for n, bam_files in sample_files)
    sample_items = _run_parallel("process_sample", samples, dirs, config, config_file)

    write_metrics(run_info, fc_name, fc_date, dirs)
Example #7
def run_main(config, config_file, work_dir, parallel,
         fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"],
                                           dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #8
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None, project_desc=None, lanes=None):
    if project_desc is None and lanes is None:
        log.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()

    config = load_config(config_file)
    ## Set log file in project output directory
    config.update(log_dir=os.path.join(project_dir, "log"))
    log_handler = create_log_handler(config, log.name)

    fc_dir = os.path.normpath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    with log_handler.applicationbound():
        run_info = prune_run_info_by_description(run_info['details'], project_desc, lanes)
    if len(run_info) == 0:
        log.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
        sys.exit()

    dirs = dict(fc_dir=fc_dir, project_dir=project_dir)
    fc_name, fc_date = get_flowcell_id(run_info, dirs['fc_dir'])
    config.update(fc_name = fc_name, fc_date = fc_date)
    config.update(fc_alias = "%s_%s" % (fc_date, fc_name) if not fc_alias else fc_alias)
    dirs.update(fc_delivery_dir = os.path.join(dirs['project_dir'], options.data_prefix, config['fc_alias'] ))
    dirs.update(data_delivery_dir = os.path.join(dirs['project_dir'], options.data_prefix, "%s_%s" %(fc_date, fc_name) ))
    with log_handler.applicationbound():
        config = _make_delivery_directory(dirs, config)
        _save_run_info(run_info, dirs['fc_delivery_dir'], run_exit=options.only_run_info)
        run_main(run_info, config, dirs)
Example #9
def _get_cores_and_type(config, fc_dir, run_info_yaml,
                        numcores=None, paralleltype=None):
    """Return core and parallelization approach from combo of config and commandline.

    Prefers passed commandline parameters over pre-configured, defaulting
    to a local run on a single core.

    The preferred approach is to pass in values explicitly on the commandline,
    and this helps maintain backwards compatibility.
    """
    config_cores = config["algorithm"].get("num_cores", None)
    if config_cores:
        try:
            config_cores = int(config_cores)
            if numcores is None:
                numcores = config_cores
        except ValueError:
            if paralleltype is None:
                paralleltype = config_cores
    if paralleltype is None:
        paralleltype = "local"

    if numcores is None:
        if config.get("distributed", {}).get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp), fromlist=[cp])
            numcores = cluster.available_nodes(config["distributed"]["platform_args"]) - 1
        if numcores is None:
            if paralleltype == "local":
                numcores = 1
            else:
                numcores = _needed_workers(get_run_info(fc_dir, config, run_info_yaml)[-1])
    return paralleltype, int(numcores)
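The precedence spelled out in the docstring (explicit commandline values beat config values, with a local single-core fallback) can be checked directly; a small sketch, assuming the function above is importable and using plain dicts in place of a loaded YAML config:

# Config supplies the core count when the commandline does not.
assert _get_cores_and_type({"algorithm": {"num_cores": 4}}, None, None) == ("local", 4)

# Explicit commandline values win over the config.
assert _get_cores_and_type({"algorithm": {"num_cores": 4}}, None, None,
                           numcores=8, paralleltype="messaging") == ("messaging", 8)

# With nothing configured at all, the default is a local run on one core.
assert _get_cores_and_type({"algorithm": {}}, None, None) == ("local", 1)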
Example #10
def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    assert config["algorithm"]["num_cores"] == "messaging", \
           "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        num_workers = _needed_workers(get_run_info(fc_dir, config, run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)
Example #11
def test_run_info_combine(self):
    """Combine multiple lanes in a test run into a single combined lane.
    """
    run_info_yaml = os.path.join(self.data_dir, "run_info-alternatives.yaml")
    _, _, run_info = get_run_info("", {}, run_info_yaml)
    assert len(run_info["details"]) == 2
    assert len(run_info["details"][0]) == 3
    x1, x2, x3 = run_info["details"][0]
    assert x1["description"] == "1: BC1"
    assert x2["description"] == "1: BC2"
    assert x3["genome_build"] == "mm9"
    x1 = run_info["details"][1][0]
    assert x1["barcode_id"] is None
Example #12
def test_run_info_combine(self):
    """Combine multiple lanes in a test run into a single combined lane.
    """
    run_info_yaml = os.path.join(self.data_dir, "run_info-alternatives.yaml")
    _, _, run_info = get_run_info("", {}, run_info_yaml)
    assert len(run_info["details"]) == 2
    assert len(run_info["details"][0]) == 3
    x1, x2, x3 = run_info["details"][0]
    assert x1["description"] == "1: BC1"
    assert x2["description"] == "1: BC2"
    assert x3["genome_build"] == "mm9"
    x1 = run_info["details"][1][0]
    assert x1["barcode_id"] is None
Example #13
def main(config_file, fc_dir, run_info_yaml=None):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    assert config["algorithm"]["num_cores"] == "messaging", \
           "This script is used with configured 'messaging' parallelization"
    cluster = globals()[config["distributed"]["cluster_platform"]]
    workers_needed = _needed_workers(get_run_info(fc_dir, config, run_info_yaml)[-1])
    print "Starting cluster workers"
    jobids = start_workers(cluster, workers_needed, config, config_file)
    try:
        print "Running analysis"
        run_analysis(config_file, fc_dir, run_info_yaml, cluster, config)
    finally:
        print "Cleaning up cluster workers"
        stop_workers(cluster, jobids)
Example #14
File: main.py Project: jme9/wabio
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #15
def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
           "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        if config["distributed"].get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp), fromlist=[cp])
            num_workers = cluster.available_nodes(config["distributed"]["platform_args"]) - 1
        if num_workers is None:
            num_workers = _needed_workers(get_run_info(fc_dir, config, run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)
Example #16
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
           "This script is used with configured 'messaging' parallelization"
    cluster = globals()[config["distributed"]["cluster_platform"]]
    workers_needed = _needed_workers(get_run_info(fc_dir, config, run_info_yaml)[-1])
    jobids = []
    try:
        print "Starting manager"
        manager_id = start_analysis_manager(config_file, fc_dir, run_info_yaml,
                                            cluster, config)
        print "Starting cluster workers"
        jobids.extend(start_workers(cluster, workers_needed, config, config_file))
        jobids.append(manager_id)
        print "Running analysis"
        monitor_analysis(cluster, manager_id)
    finally:
        print "Cleaning up cluster workers"
        stop_workers(cluster, jobids)
Example #17
def main(config_file, fc_dir, run_info_yaml=None, num_workers=None):
    config = load_config(config_file)
    assert config["algorithm"]["num_cores"] == "messaging", \
           "Use this script only with configured 'messaging' parallelization"
    if num_workers is None:
        if config["distributed"].get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp),
                                 fromlist=[cp])
            num_workers = cluster.available_nodes(
                config["distributed"]["platform_args"]) - 1
        if num_workers is None:
            num_workers = _needed_workers(
                get_run_info(fc_dir, config, run_info_yaml)[-1])
    task_module = "bcbio.distributed.tasks"
    args = [config_file, fc_dir]
    if run_info_yaml:
        args.append(run_info_yaml)
    run_and_monitor(config, config_file, args, num_workers, task_module)
Example #18
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None, project_desc=None, lanes=None, barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)

    if project_desc is None and lanes is None:
        logger.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()

    if options.customer_delivery and not fc_alias == "":
        logger.info("INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fp = open(run_info_yaml)
    run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name, fc_date, run_info_structure, fc_dir=fc_dir, fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc, exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if not project_desc is None:
            logger.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" %  ("\n\t".join(original_fc.get_project_names()))
            sys.exit()
        if not lanes  is None:
            logger.error("No lanes found with numbers %s: please check your flowcell run information" % " ".join(lanes))
            sys.exit()
    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(), pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/data", rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(), pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    analysis_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/intermediate", rawdata_fc.get_fc_id())))

    # If customer delivery setup some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, project_desc, rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc
    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
Example #19
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """

    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs,
                               pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
Example #20
def _get_cores_and_type(config,
                        fc_dir,
                        run_info_yaml,
                        numcores=None,
                        paralleltype=None):
    """Return core and parallelization approach from combo of config and commandline.

    Prefers passed commandline parameters over pre-configured, defaulting
    to a local run on a single core.

    The preferred approach is to pass in values explicitly on the commandline,
    and this helps maintain backwards compatibility.
    """
    config_cores = config["algorithm"].get("num_cores", None)
    if config_cores:
        try:
            config_cores = int(config_cores)
            if numcores is None:
                numcores = config_cores
        except ValueError:
            if paralleltype is None:
                paralleltype = config_cores
    if paralleltype is None:
        paralleltype = "local"

    if numcores is None:
        if config.get("distributed", {}).get("num_workers", "") == "all":
            cp = config["distributed"]["cluster_platform"]
            cluster = __import__("bcbio.distributed.{0}".format(cp),
                                 fromlist=[cp])
            numcores = cluster.available_nodes(
                config["distributed"]["platform_args"]) - 1
        if numcores is None:
            if paralleltype == "local":
                numcores = 1
            else:
                numcores = _needed_workers(
                    get_run_info(fc_dir, config, run_info_yaml)[-1])
    return paralleltype, int(numcores)
Example #21
File: main.py Project: dogstuff/bcbb
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #22
def run_main(config, config_file, fc_dir, run_info_yaml):
    # Working directory has to be identical to where (demultiplexed) fastq files are located
    fc_dir = os.path.normpath(fc_dir)
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")

    #(_, fastq_dir_label) = os.path.split(work_dir)
    #fastq_dir = os.path.join(project_dir, fastq_dir_label)

    #fc_name, fc_date = get_flowcell_info(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    #run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    #fastq_dir, galaxy_dir, config_dir = _get_full_paths(fastq_dir, config, config_file)
    galaxy_dir, config_dir = _get_full_paths(config, config_file)

    config_file = os.path.join(config_dir, os.path.basename(config_file))
#    fastq = fastq_dir,
    dirs = dict(galaxy= galaxy_dir, 
                align = align_dir, work = work_dir,
                config = config_dir, flowcell = fc_dir,
                fc_dir = fc_dir
                )
    # Since demultiplexing is already done, just extract run_items
    run_items = run_info['details']
    lane_items = []
    for info in run_items:
        print info
        lane_items.extend(make_lane_items(info, fc_date, fc_name, dirs, config))

    _run_parallel("process_alignment", lane_items, dirs, config)
    
    # Process samples
    sample_files, sample_fastq, sample_info = \
                  organize_samples(dirs, fc_name, fc_date, run_items)
    samples = ((n, sample_fastq[n], sample_info[n], bam_files, dirs, config, config_file)
               for n, bam_files in sample_files)
    _run_parallel("process_sample", samples, dirs, config)
Example #23
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)

    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}

    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("process_lane")

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    _add_to_compress(to_compress, align_items, 'align_items')
    prog.dummy()
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.dummy()
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.dummy()
    prog.progress("write_metrics")

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        before = sum([s[0] for s in sizes])
        after = sum([s[1] for s in sizes])
        logger.info("Space used by the files before compressing (in bytes): " \
                     + str(before))
        logger.info("Space used by the files after compressing (in bytes): " \
                     + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
Example #24
def main(config_file,
         fc_dir,
         project_dir,
         run_info_yaml=None,
         fc_alias=None,
         project_desc=None,
         lanes=None,
         barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)

    if project_desc is None and lanes is None:
        logger.error(
            "No project description or lanes provided: cannot deliver files without this information"
        )
        sys.exit()

    if options.customer_delivery and not fc_alias == "":
        logger.info(
            "INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fp = open(run_info_yaml)
    run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name,
                                        fc_date,
                                        run_info_structure,
                                        fc_dir=fc_dir,
                                        fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc,
                                             exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if not project_desc is None:
            logger.error(
                "No lanes found with matching description %s: please check your flowcell run information"
                % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % ("\n\t".join(
                original_fc.get_project_names()))
            sys.exit()
        if not lanes is None:
            logger.error(
                "No lanes found with numbers %s: please check your flowcell run information"
                % " ".join(lanes))
            sys.exit()
    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                       pruned_fc.get_fc_date(),
                                       pruned_fc.to_structure()['details'],
                                       fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(
        os.path.abspath(
            os.path.join(project_dir, "nobackup/data",
                         rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(),
                                        pruned_fc.get_fc_date(),
                                        pruned_fc.to_structure()['details'],
                                        fc_alias=fc_alias)
    analysis_fc.set_fc_dir(
        os.path.abspath(
            os.path.join(project_dir, "nobackup/intermediate",
                         rawdata_fc.get_fc_id())))

    # If customer delivery setup some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(
            os.path.abspath(
                os.path.join(project_dir, project_desc,
                             rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc
    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
Example #25
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):

    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)

    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}

    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    [to_compress.add(f) for f in lane_items[0][0:2]]
    prog.progress("process_lane")

    # upload the sequencing report to Google Docs
    # will skip this for now and rely on external mechanism for uploading this data
    #gdocs_indicator = os.path.join(work_dir, "gdocs_report_complete.txt")
    #if not os.path.exists(gdocs_indicator) \
    #and queue_report(fc_date, fc_name, os.path.abspath(run_info_yaml), dirs, config, config_file):
    #    utils.touch_file(gdocs_indicator)

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    [to_compress.add(f) for f in lane_items[0][0:2]]
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    [to_compress.add(f) for f in align_items[0]['fastq']]
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")
    # Write statusdb metrics
    # will skip this for now and rely on external mechanism for uploading this data
    #report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config)

    #Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        (before, after) = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): " \
                     + str(before))
        logger.info("Space used by the files after compressing (in bytes): " \
                     + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))