Example #1
def do_analysis(args, dockerconf):
    """Run a full analysis on a local machine, utilizing multiple cores.
    """
    work_dir = os.getcwd()
    with open(args.sample_config) as in_handle:
        sample_config, dmounts = mounts.update_config(yaml.load(in_handle),
                                                      args.fcdir)
    dmounts += mounts.prepare_system(args.datadir, dockerconf["biodata_dir"])
    dmounts.append("%s:%s" % (work_dir, dockerconf["work_dir"]))
    system_config, system_mounts = _read_system_config(dockerconf,
                                                       args.systemconfig,
                                                       args.datadir)
    system_cfile = os.path.join(work_dir, "bcbio_system-forvm.yaml")
    sample_cfile = os.path.join(work_dir, "bcbio_sample-forvm.yaml")
    with open(system_cfile, "w") as out_handle:
        yaml.dump(system_config,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    with open(sample_cfile, "w") as out_handle:
        yaml.dump(sample_config,
                  out_handle,
                  default_flow_style=False,
                  allow_unicode=False)
    in_files = [
        os.path.join(dockerconf["work_dir"], os.path.basename(x))
        for x in [system_cfile, sample_cfile]
    ]
    log.setup_local_logging({"include_time": False})
    manage.run_bcbio_cmd(
        args.image, dmounts + system_mounts, in_files + [
            "--numcores",
            str(args.numcores),
            "--workdir=%s" % dockerconf["work_dir"]
        ])
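Note: the strings appended to dmounts above follow Docker's "host_dir:container_dir" bind-mount convention. A small, self-contained illustration of that string format (the paths below are invented):

work_dir = "/home/user/analysis"
container_work_dir = "/mnt/work"
# Docker-style bind mount: host path on the left, container path on the right
print("%s:%s" % (work_dir, container_work_dir))  # -> /home/user/analysis:/mnt/work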
Example #2
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, multisample = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
    if argfile:
        with open(argfile, "w") as out_handle:
            if argfile.endswith(".json"):
                if multisample:
                    json.dump([_remove_work_dir(xs[0], work_dir + "/") for xs in out], out_handle)
                else:
                    json.dump(_remove_work_dir(out[0][0], work_dir + "/"), out_handle)
            else:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
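Note: the default output file name above is built with "%s-out%s" % os.path.splitext(args.argfile). A small, self-contained illustration (the file name is invented):

import os

# splitext("run_args.yaml") -> ("run_args", ".yaml"), so the default output file
# sits next to the input with an "-out" suffix inserted before the extension.
print("%s-out%s" % os.path.splitext("run_args.yaml"))  # -> run_args-out.yaml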
Example #3
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #4
def run_main(workdir, config_file=None, fc_dir=None, run_info_yaml=None,
             parallel=None, workflow=None):
    """Run variant analysis, handling command line options.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    workdir = utils.safe_makedir(os.path.abspath(workdir))
    os.chdir(workdir)
    config, config_file = config_utils.load_system_config(config_file, workdir)
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info(f"System YAML configuration: {os.path.abspath(config_file)}.")
    logger.info(f"Locale set to {locale_to_use}.")
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(workdir, DEFAULT_LOG_DIR)
    if parallel["type"] in ["local", "clusterk"]:
        _setup_resources()
        _run_toplevel(config, config_file, workdir, parallel,
                      fc_dir, run_info_yaml)
    elif parallel["type"] == "ipython":
        assert parallel["scheduler"] is not None, "IPython parallel requires a specified scheduler (-s)"
        if parallel["scheduler"] != "sge":
            assert parallel["queue"] is not None, "IPython parallel requires a specified queue (-q)"
        elif not parallel["queue"]:
            parallel["queue"] = ""
        _run_toplevel(config, config_file, workdir, parallel,
                      fc_dir, run_info_yaml)
    else:
        raise ValueError("Unexpected type of parallel run: %s" % parallel["type"])
Example #5
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #6
def _run_toplevel(config,
                  config_file,
                  work_dir,
                  parallel,
                  fc_dir=None,
                  run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info("System YAML configuration: %s" % os.path.abspath(config_file))
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config if parallel.get("type") ==
                   "local" else None) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline(config, run_info_yaml, parallel, dirs, samples):
                pass
Example #7
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"],
                                           dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, parallel, dirs, pipeline_items):
            if len(xs) == 1:
                upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
Example #8
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
    if argfile:
        with open(argfile, "w") as out_handle:
            if argfile.endswith(".json"):
                json.dump(_remove_work_dir(out[0][0], work_dir + "/"), out_handle)
            else:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
Example #9
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None, samples=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    samples -- Pre-processed samples, useful if run inside of docker containers.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    if samples:
        dockerized = True
    else:
        dockerized = False
        samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            if not dockerized:
                versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #10
def _run_toplevel(config,
                  config_file,
                  work_dir,
                  parallel,
                  fc_dir=None,
                  run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel,
                                             config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs,
                                   pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #11
 def __call__(self, in_files):
     setup_local_logging(self.config, self.config["parallel"])
     self._start_message(in_files)
     # first is human, second is mouse
     out_files = self._disambiguate(in_files[0], in_files[1])
     dis_files = self._disambiguate_out(in_files)
     if all(map(file_exists, dis_files)):
         [shutil.move(x[0], x[1]) for x in zip(dis_files, out_files)]
     self._end_message(in_files)
     return out_files
Example #12
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging()
    for dname in _find_unprocessed(config):
        runinfo = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname)
        lane_details = nglims.flatten_lane_detail(runinfo)
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
Example #13
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging(config)
    for dname in _find_unprocessed(config):
        lane_details = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname,
                                          utils.get_in(config, ("process", "storedir")))
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        _update_reported(config["msg_db"], dname)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
        bcbio_config, ready_fastq_dir = nglims.prep_samples_and_config(dname, lane_details, fastq_dir, config)
        transfer.copy_flowcell(dname, ready_fastq_dir, bcbio_config, config)
        _start_processing(dname, bcbio_config, config)
Example #14
def check_and_postprocess(args):
    """Check for newly dumped sequencer output, post-processing and transferring.
    """
    with open(args.process_config) as in_handle:
        config = yaml.safe_load(in_handle)
    setup_local_logging(config)
    for dname in _find_unprocessed(config):
        lane_details = nglims.get_runinfo(config["galaxy_url"], config["galaxy_apikey"], dname,
                                          utils.get_in(config, ("process", "storedir")))
        fcid_ss = samplesheet.from_flowcell(dname, lane_details)
        _update_reported(config["msg_db"], dname)
        fastq_dir = demultiplex.run_bcl2fastq(dname, fcid_ss, config)
        bcbio_config, ready_fastq_dir = nglims.prep_samples_and_config(dname, lane_details, fastq_dir, config)
        transfer.copy_to_remote(dname, ready_fastq_dir, bcbio_config, config)
        _start_processing(dname, bcbio_config, config)
Example #15
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError(
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'"
            % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(
            args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, []
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
    if argfile:
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, out_keys,
                               work_dir)
        except:
            logger.exception()
            raise
        if argfile.endswith(".json"):
            _write_wdl_outputs(argfile, out_keys)
Example #16
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg,
                        (list, tuple)) and config_utils.is_nested_config_arg(
                            arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
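Note: _setup_logging yields the discovered config inside a try/finally, so in the source it is presumably wrapped with contextlib.contextmanager and consumed as a with-block around worker calls. A minimal sketch of that pattern with invented names, using a plain file handle in place of the real logging handler:

import contextlib

@contextlib.contextmanager
def _logging_context(config):
    handler = open("run.log", "a")  # stands in for setup_local_logging(config, ...)
    try:
        yield config
    finally:
        handler.close()  # mirrors the finally block in the example above

def run_task(config):
    with _logging_context(config) as cfg:
        print("running with", cfg)

run_task({"parallel": {"cores": 1}})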
Example #17
 def wrapper(*args, **kwargs):
     config = None
     for arg in args:
         if config_utils.is_std_config_arg(arg):
             config = arg
             break
         elif config_utils.is_nested_config_arg(arg):
             config = arg["config"]
         elif isinstance(
                 arg,
             (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
             config = arg[0]["config"]
             break
     assert config, "Could not find config dictionary in function arguments."
     if config.get("parallel", {}).get("log_queue") and not config.get(
             "parallel", {}).get("wrapper"):
         handler = setup_local_logging(config, config["parallel"])
     else:
         handler = None
     try:
         out = f(*args, **kwargs)
     finally:
         if handler and hasattr(handler, "close"):
             handler.close()
     return out
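Note: wrapper closes over an outer function f, so it is presumably the inner function of a logging decorator. A hedged sketch of that decorator shape; the name with_local_logging and the functools.wraps usage are assumptions, and the real handler setup is elided:

import functools

def with_local_logging(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        handler = None  # in the example: setup_local_logging(config, config["parallel"])
        try:
            out = f(*args, **kwargs)
        finally:
            if handler and hasattr(handler, "close"):
                handler.close()
        return out
    return wrapper

@with_local_logging
def add(a, b):
    return a + b

print(add(1, 2))  # -> 3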
Example #18
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg,
                        (list, tuple)) and config_utils.is_nested_config_arg(
                            arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
Example #19
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
Example #20
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
Example #21
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    with open(args.argfile) as in_handle:
        fnargs = yaml.safe_load(in_handle)
    fnargs = config_utils.merge_resources(fnargs)
    work_dir = os.path.dirname(args.argfile)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
    out_file = "%s-out%s" % os.path.splitext(args.argfile)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
Example #22
def _run_toplevel(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline.run(config, run_info_yaml, parallel, dirs, samples):
                pass
Example #23
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError(
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'"
            % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(
            args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(
            args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(
                log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(
                        wf_input_dir):
                    shutil.rmtree(wf_input_dir)
    if argfile:
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, out_keys,
                               input_files, work_dir)
        except:
            logger.exception()
            raise
Example #24
def do_analysis(args, dockerconf):
    """Run a full analysis on a local machine, utilizing multiple cores.
    """
    work_dir = os.getcwd()
    with open(args.sample_config) as in_handle:
        sample_config, dmounts = mounts.update_config(yaml.load(in_handle), dockerconf["input_dir"], args.fcdir)
    dmounts += mounts.prepare_system(args.datadir, dockerconf["biodata_dir"])
    dmounts.append("%s:%s" % (work_dir, dockerconf["work_dir"]))
    system_config, system_mounts = _read_system_config(dockerconf, args.systemconfig, args.datadir)
    system_cfile = os.path.join(work_dir, "bcbio_system-forvm.yaml")
    sample_cfile = os.path.join(work_dir, "bcbio_sample-forvm.yaml")
    with open(system_cfile, "w") as out_handle:
        yaml.dump(system_config, out_handle, default_flow_style=False, allow_unicode=False)
    with open(sample_cfile, "w") as out_handle:
        yaml.dump(sample_config, out_handle, default_flow_style=False, allow_unicode=False)
    in_files = [os.path.join(dockerconf["work_dir"], os.path.basename(x)) for x in [system_cfile, sample_cfile]]
    log.setup_local_logging({"include_time": False})
    manage.run_bcbio_cmd(dockerconf["image"], dmounts + system_mounts,
                         in_files + ["--workdir=%s" % dockerconf["work_dir"]])
Example #25
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel = None
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
    if argfile:
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, work_dir)
        except:
            logger.exception()
            raise
Example #26
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, parallel,
                               dirs, pipeline_items):
            if len(xs) == 1:
                upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
Example #27
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        argfile = os.path.join(work_dir, "cwl-%s-world.json" % args.name)
    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        out = fn(fnargs)
    if argfile:
        with open(argfile, "w") as out_handle:
            if argfile.endswith(".json"):
                if parallel in ["single-split", "multi-combined", "batch-split"]:
                    json.dump([utils.to_single_data(xs) for xs in out],
                              out_handle, sort_keys=True, separators=(',', ':'))
                elif parallel in ["multi-batch"]:
                    json.dump([_collapse_to_cwl_record(xs, work_dir) for xs in out], out_handle,
                              sort_keys=True, separators=(',', ':'))
                else:
                    json.dump(utils.to_single_data(utils.to_single_data(out)), out_handle,
                              sort_keys=True, separators=(',', ':'))
            else:
                yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
Example #28
def process(args):
    """Run the function in args.name given arguments in args.argfile.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                    shutil.rmtree(wf_input_dir)
    if argfile:
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir)
        except:
            logger.exception()
            raise
Example #29
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"],
                                      data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(
        data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            broad_runner.run_fn("picard_index", data["work_bam"])
            params = [
                "-T", "CallableLoci", "-R", data["sam_ref"], "-I",
                data["work_bam"], "--out", tx_out_file, "--summary",
                out_summary
            ]
            ready_region = shared.subset_variant_regions(
                variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region
                 and os.path.isfile(ready_region)) or not variant_regions
                    or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"],
                                                   data["config"]):
                        if tregion.chrom == region:
                            out_handle.write(
                                "%s\t%s\t%s\tNO_COVERAGE\n" %
                                (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{
        "callable_bed": out_file,
        "config": data["config"],
        "work_bam": data["work_bam"]
    }]
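Note: the fallback branch above writes BED-style, tab-separated lines labeled NO_COVERAGE. A small illustration of one such line (the coordinates are made up):

# One line of the fallback callable BED output, with invented coordinates:
print("%s\t%s\t%s\tNO_COVERAGE" % ("chr1", 0, 1000))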
Example #30
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"], data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    # set a maximum depth to avoid calling in repetitive regions with excessive coverage
    max_depth = int(1e6 if data["config"]["algorithm"].get("coverage_depth", "").lower() == "super-high"
                    else 2.5e4)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bam.index(data["work_bam"], data["config"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--minDepth", "0",
                      "--downsample_to_coverage", str(max_depth + 1000),
                      "--minMappingQuality", "0",
                      "--maxFractionOfReadsWithLowMAPQ", "1.1",
                      "--maxDepth", str(max_depth),
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                 or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
Example #31
 def wrapper(*args, **kwargs):
     config = None
     for arg in args:
         if ipython.is_std_config_arg(arg):
             config = arg
             break
         elif ipython.is_nested_config_arg(arg):
             config = arg["config"]
             break
     assert config, "Could not find config dictionary in function arguments."
     if config.get("parallel", {}).get("log_queue"):
         handler = setup_local_logging(config, config["parallel"])
     else:
         handler = None
     try:
         out = f(*args, **kwargs)
     finally:
         if handler and hasattr(handler, "close"):
             handler.close()
     return out
Example #32
def calc_callable_loci(data, region=None, out_file=None):
    """Determine callable bases for input BAM using Broad's CallableLoci walker.

    http://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_sting_gatk_walkers_coverage_CallableLoci.html
    """
    if data["config"].get("parallel", {}).get("log_queue"):
        handler = setup_local_logging(data["config"], data["config"]["parallel"])
    else:
        handler = None
    broad_runner = broad.runner_from_config(data["config"])
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(data["work_bam"])[0]
    out_summary = "%s-callable-summary.txt" % os.path.splitext(data["work_bam"])[0]
    variant_regions = data["config"]["algorithm"].get("variant_regions", None)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            bam.index(data["work_bam"], data["config"])
            params = ["-T", "CallableLoci",
                      "-R", data["sam_ref"],
                      "-I", data["work_bam"],
                      "--out", tx_out_file,
                      "--summary", out_summary]
            ready_region = shared.subset_variant_regions(variant_regions, region, tx_out_file)
            if ready_region:
                params += ["-L", ready_region]
            if ((variant_regions and ready_region and os.path.isfile(ready_region))
                 or not variant_regions or not region):
                broad_runner.run_gatk(params)
            else:
                with open(out_file, "w") as out_handle:
                    for tregion in get_ref_bedtool(data["sam_ref"], data["config"]):
                        if tregion.chrom == region:
                            out_handle.write("%s\t%s\t%s\tNO_COVERAGE\n" %
                                             (tregion.chrom, tregion.start, tregion.stop))
    if handler and hasattr(handler, "close"):
        handler.close()
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
Example #33
def setup_log(config, parallel):
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
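Note: this helper bundles the two-step pattern (create_base_logger, then setup_local_logging) seen throughout these examples. Below is a self-contained sketch of the same idea using only the standard library, not bcbio; all names and config keys in it are assumptions:

import logging
import os

def setup_log_sketch(config, parallel):
    # derive a log location from the config, then install a local file handler
    log_dir = config.get("log_dir", "log")
    os.makedirs(log_dir, exist_ok=True)
    handler = logging.FileHandler(os.path.join(log_dir, "run.log"))
    logging.getLogger("example").addHandler(handler)
    return handler

setup_log_sketch({"log_dir": "log"}, {"type": "local", "cores": 1})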
Example #34
    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

if __name__ == "__main__":
    # read in the config file and perform initial setup
    main_config_file = sys.argv[1]
    with open(main_config_file) as config_in_handle:
        startup_config = yaml.load(config_in_handle)
    parallel = create_base_logger(startup_config, {"type": "ipython"})
    setup_local_logging(startup_config, parallel)
    startup_config["parallel"] = parallel
    # setup_logging(startup_config)

    cluster_config = startup_config["cluster"]
    cores_per_job = cluster_config.get("cores_per_job", 1)
    if startup_config["cluster"].get("local", False):
        main(startup_config, DummyView())
    else:
        with cluster_view(cluster_config["scheduler"],
                          cluster_config["queue"],
                          cluster_config["cores"],
                          cores_per_job) as view:
            main(startup_config, view)

Example #35
    parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep")
    parser.add_argument("-t", "--paralleltype",
                        choices=["local", "ipython"],
                        default="local", help="Run with iptyhon")

    args = parser.parse_args()
    out_dir = os.path.abspath(args.out)
    utils.safe_makedir(out_dir)
    system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.load(in_handle)
        res = {'cores': args.cores_per_job}
        config["algorithm"] = {"num_cores": args.cores_per_job}
        config["resources"].update({'sambamba': res,
                                    'samtools': res})
    parallel = clargs.to_parallel(args)
    parallel.update({'progs': ['samtools', 'sambamba']})
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = {'work': os.path.abspath(os.getcwd())}
    system.write_info(dirs, parallel, config)
    sysinfo = system.machine_info()[0]
    samples = _get_samples_to_process(args.csv, out_dir, config)
    parallel = resources.calculate(parallel, [samples], sysinfo, config)

    with prun.start(parallel, samples, config, dirs) as run_parallel:
        with profile.report("prepare bcbio samples", dirs):
            samples = run_parallel("prepare_bcbio_samples", samples)

    create_new_csv(samples, args)
                                     "bcbio_system.yaml")
    except ValueError as err:
        print(err)
        print(
            "WARNING: Attempting to read bcbio_system.yaml in the current directory."
        )
        system_config = "bcbio_system.yaml"

    with open(system_config) as in_handle:
        config = yaml.load(in_handle)
        res = {'cores': args.cores_per_job}
        config["algorithm"] = {"num_cores": args.cores_per_job}
        config["resources"].update({'sambamba': res, 'samtools': res})
        config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log")
    parallel = clargs.to_parallel(args)
    parallel.update({'progs': ['samtools', 'sambamba']})
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = {'work': os.path.abspath(os.getcwd())}
    system.write_info(dirs, parallel, config)
    sysinfo = system.machine_info()[0]
    samples = _get_samples_to_process(args.csv, out_dir, config,
                                      args.force_single, args.separators)
    parallel = resources.calculate(parallel, [samples], sysinfo, config)

    with prun.start(parallel, samples, config, dirs) as run_parallel:
        with profile.report("prepare bcbio samples", dirs):
            samples = run_parallel("prepare_bcbio_samples", samples)

    create_new_csv(samples, args)