コード例 #1
0
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on the work BAMs, optionally restricted to one chromosome.

    The output BED name is derived from the first work BAM, the current batch
    (when there is one) and ``chrom`` ("all" when ``chrom`` is empty). An
    existing output file is reused without re-running R.

    Returns a single-element list holding the output BED path.
    """
    local_sitelib = utils.R_sitelib()
    batch = sshared.get_cur_batch(items)
    suffix = ("-%s-cnv" % batch) if batch else "-cnv"
    bam_stem = os.path.splitext(os.path.basename(work_bams[0]))[0]
    out_file = os.path.join(
        work_dir, "%s%s-%s.bed" % (bam_stem, suffix, chrom or "all"))
    if utils.file_exists(out_file):
        return [out_file]

    with file_transaction(items[0], out_file) as tx_out_file:
        # The generated R script lives next to the final output file.
        rcode = os.path.splitext(out_file)[0] + "-run.R"
        with open(rcode, "w") as script_handle:
            script_handle.write(_script.format(
                prep_str=_prep_load_script(work_bams, names, chrom, items),
                out_file=tx_out_file,
                local_sitelib=local_sitelib))
        rscript = utils.Rscript_cmd()
        try:
            do.run([rscript, "--vanilla", rcode], "cn.mops CNV detection",
                   items[0], log_error=False)
        except subprocess.CalledProcessError as err:
            if not _allowed_cnmops_errorstates(str(err)):
                logger.exception()
                raise
            # cn.mops errors out if no CNVs found. Just write an empty file.
            with open(tx_out_file, "w") as empty_handle:
                empty_handle.write(
                    'track name=empty description="No CNVs found"\n')
    return [out_file]
コード例 #2
0
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg,
                        (list, tuple)) and config_utils.is_nested_config_arg(
                            arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
コード例 #3
0
def _run_bubbletree(vcf_csv, cnv_csv, data, has_normal=True):
    """Create the BubbleTree R script and run it on the input data.

    Output file names are all derived from ``vcf_csv`` with its extension
    removed. The R script is rewritten on every call; Rscript only runs when
    the prevalence output does not exist yet. When Rscript fails with an
    allowed error state, the failure text is written to the prevalence file
    instead of raising; any other failure is logged and re-raised.

    NOTE: the R template is filled with ``_script.format(**locals())``, so
    the local variable names in this function are part of the template's
    contract -- do not rename them without checking ``_script``.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    # BubbleTree has some internal hardcoded parameters that assume a smaller
    # distribution of log2 scores. This is not true for tumor-only calls and
    # normal contamination, so we scale the calculations to actually get calls.
    # Need a better long term solution with flexible parameters.
    lrr_scale = 1.0 if has_normal else 10.0
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # ``except ... as`` is valid on Python 2.6+ and Python 3; the older
        # comma form is a SyntaxError on Python 3.
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
コード例 #4
0
def _get_machine_info(parallel, sys_config, dirs, config):
    """Get machine resource information from the job scheduler via either the command line or the queue.

    When ``parallel`` names both a queue and a scheduler with a known query
    function, that function is called with the queue name and its result is
    returned. If the query fails, or the scheduler has no query function,
    a "machine_info" job is run through ``prun`` instead and its result is
    returned.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
            "torque": _torque_info,
            "sge": _sge_info
        }
        scheduler = parallel["scheduler"].lower()
        if scheduler in sched_info_dict:
            try:
                return sched_info_dict[scheduler](parallel.get("queue", ""))
            # Only ordinary errors trigger the fallback. A bare ``except``
            # would also swallow KeyboardInterrupt/SystemExit and go on to
            # submit a job the user just tried to abort.
            except Exception:
                # If something goes wrong, just hit the queue
                logger.exception(
                    "Couldn't get machine information from resource query function for queue "
                    "'{0}' on scheduler \"{1}\"; "
                    "submitting job to queue".format(parallel.get("queue", ""),
                                                     parallel["scheduler"]))
        else:
            logger.info(
                "Resource query function not implemented for scheduler \"{0}\"; "
                "submitting job to queue".format(parallel["scheduler"]))
    # Imported at call time, as in the original code.
    from bcbio.distributed import prun
    with prun.start(parallel, [[sys_config]], config, dirs) as run_parallel:
        return run_parallel("machine_info", [[sys_config]])
コード例 #5
0
def _setup_logging(args):
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
コード例 #6
0
def run_memory_retry(cmd, descr, data=None, check=None, region=None):
    """Run command, retrying when detecting fail due to memory errors.

    This is useful for high throughput Java jobs which fail
    intermittently due to an inability to get system resources.

    The command is retried up to ``max_runs`` times after the first attempt,
    sleeping 30 seconds between attempts, but only when the failure text
    contains one of the known memory/resource messages. Any other failure,
    or a failure after the retries are used up, is logged and re-raised.
    """
    max_runs = 5
    num_runs = 0
    # Substrings of the failure message that mark a transient resource problem.
    retry_markers = ("insufficient memory",
                     "did not provide enough memory",
                     "A fatal error has been detected",
                     "java.lang.OutOfMemoryError",
                     "Resource temporarily unavailable")
    while True:
        try:
            run(cmd, descr, data, check, region=region, log_error=False)
            break
        # ``except ... as`` works on Python 2.6+ and Python 3; the comma form
        # is a SyntaxError on Python 3.
        except subprocess.CalledProcessError as msg:
            err_text = str(msg)
            if num_runs < max_runs and any(marker in err_text for marker in retry_markers):
                logger.info(
                    "Retrying job. Memory or resource issue with run: %s" %
                    _descr_str(descr, data, region))
                time.sleep(30)
                num_runs += 1
            else:
                logger.exception()
                raise
コード例 #7
0
ファイル: bubbletree.py プロジェクト: hliang/bcbio-nextgen
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Create R script and run on input data

    BubbleTree has some internal hardcoded parameters that assume a smaller
    distribution of log2 scores. This is not true for tumor-only calls, so if
    we specify wide_lrr we scale the calculations to actually get calls. Need a
    better long term solution with flexible parameters.

    The R script is rewritten on every call; Rscript only runs when the
    prevalence output is missing. With ``handle_failures`` set, an allowed
    BubbleTree error state is recorded in the prevalence file instead of
    raising; every other failure is logged and re-raised.

    NOTE: the template is filled via ``_script.format(**locals())``, so the
    local names here (including the rebinding of ``do_plots`` to "yes"/"no")
    are part of the template's contract.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file], "Assess heterogeneity with BubbleTree")
        # Portable across Python 2.6+ and 3 (the comma form is Python 2 only).
        except subprocess.CalledProcessError as msg:
            if handle_failures and _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
コード例 #8
0
def make_scrnaseq_object(samples):
    """Prepare the R script intended to build the initial ``se.rda`` object.

    Writes ``se-run.R`` next to the combined counts, filled from the module
    template. The Rscript invocation itself is currently commented out, so
    this function does not run R. Nothing is done when ``se.rda`` exists.

    NOTE: the template is filled via ``_script.format(**locals())``; the
    local variable names below must stay in sync with ``_script``.
    """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(
        dd.get_in_samples(samples, dd.get_combined_counts))
    # Prefer the transcriptome GTF and fall back to the standard GTF file.
    gtf_file = (dd.get_in_samples(samples, dd.get_transcriptome_gtf)
                or dd.get_in_samples(samples, dd.get_gtf_file))
    rda_file = os.path.join(counts_dir, "se.rda")
    if file_exists(rda_file):
        return
    with file_transaction(rda_file) as tx_out_file:
        rda_base = os.path.splitext(rda_file)[0]
        rcode = rda_base + "-run.R"
        rrna_file = _find_rRNA_genes(gtf_file, rda_base + "-rrna.txt")
        with open(rcode, "w") as out_handle:
            out_handle.write(_script.format(**locals()))
        rscript = Rscript_cmd()
        try:
            # Rscript execution is disabled for now:
            # do.run([rscript, "--vanilla", rcode],
            #        "SingleCellExperiment",
            #        log_error=False)
            rda_file = rcode
        except subprocess.CalledProcessError:
            logger.exception()
コード例 #9
0
def _run_bubbletree(vcf_csv, cnv_csv, data, wide_lrr=False, do_plots=True,
                    handle_failures=True):
    """Write the BubbleTree R script, run it if needed and describe its outputs.

    BubbleTree hardcodes parameters that assume a narrow distribution of log2
    scores, which does not hold for tumor-only calls; ``wide_lrr`` scales the
    calculations so calls are still produced.

    Returns a dictionary with the caller name, the prevalence report path and
    the bubble/track plot paths.

    NOTE: the template is filled via ``_script.format(**locals())``, so local
    names (and the "yes"/"no" rebinding of ``do_plots``) must match ``_script``.
    """
    lrr_scale = 10.0 if wide_lrr else 1.0
    local_sitelib = utils.R_sitelib()
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = base + "-run.R"
    bubbleplot_out = base + "-bubbleplot.pdf"
    trackplot_out = base + "-trackplot.pdf"
    calls_out = base + "-calls.rds"
    freqs_out = base + "-bubbletree_prevalence.txt"
    sample = dd.get_sample_name(data)
    do_plots = "yes" if do_plots else "no"
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))

    result = {"caller": "bubbletree",
              "report": freqs_out,
              "plot": {"bubble": bubbleplot_out, "track": trackplot_out}}
    if utils.file_exists(freqs_out):
        return result

    cmd = "%s && %s --no-environ %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file)
    try:
        do.run(cmd, "Assess heterogeneity with BubbleTree")
    except subprocess.CalledProcessError as err:
        tolerated = handle_failures and _allowed_bubbletree_errorstates(str(err))
        if not tolerated:
            logger.exception()
            raise
        # Record the tolerated failure in place of the prevalence report.
        with open(freqs_out, "w") as fail_handle:
            fail_handle.write('bubbletree failed:\n %s"\n' % (str(err)))
    return result
コード例 #10
0
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.

    The output BED is named from the first work BAM and ``chrom`` ("all" when
    ``chrom`` is empty) and is written inside ``work_dir``. Nothing is run when
    that file already exists. When cn.mops fails with an allowed error state,
    an empty track file is written instead of raising; any other failure is
    logged and re-raised. The visible code does not return a value.
    """
    local_sitelib = os.path.join(
        install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
        "site-library")
    out_file = os.path.join(
        work_dir, "%s-%s-cnv.bed" % (os.path.splitext(
            os.path.basename(work_bams[0]))[0], chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            # The generated R script is stored alongside the final output.
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(
                    _script.format(prep_str=_prep_load_script(
                        work_bams, names, chrom, items),
                                   out_file=tx_out_file,
                                   local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode],
                       "cn.mops CNV detection",
                       items[0],
                       log_error=False)
            # ``except ... as`` is valid on Python 2.6+ and Python 3; the
            # comma form is a SyntaxError on Python 3.
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write(
                            'track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
コード例 #11
0
ファイル: bubbletree.py プロジェクト: Kange2014/bcbio-nextgen
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data

    Output names are derived from ``vcf_csv`` without its extension. The R
    script is rewritten on each call; Rscript only runs when the prevalence
    output is missing. An allowed BubbleTree error state is written to the
    prevalence file instead of raising; other failures are logged and
    re-raised.

    NOTE: the template is filled via ``_script.format(**locals())``, so the
    local variable names are part of the template's contract.
    """
    local_sitelib = os.path.join(
        install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
        "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbleplot_out = "%s-bubbleplot.pdf" % base
    trackplot_out = "%s-trackplot.pdf" % base
    calls_out = "%s-calls.rds" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    sample = dd.get_sample_name(data)
    with open(r_file, "w") as out_handle:
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run([utils.Rscript_cmd(), r_file],
                   "Assess heterogeneity with BubbleTree")
        # Portable across Python 2.6+ and 3 (the comma form is Python 2 only).
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
コード例 #12
0
ファイル: do.py プロジェクト: zhangyupisa/bcbio-nextgen
def run(cmd,
        descr=None,
        data=None,
        checks=None,
        region=None,
        log_error=True,
        log_stdout=False,
        env=None):
    """Run the provided command, logging details and checking for errors.

    ``cmd`` may be a single command string or a sequence of arguments; it is
    logged to the command-line logger and then executed through ``_do_run``.
    The command is registered with ``diagnostics`` before it runs and marked
    as ended exactly once afterwards: as a failure when anything is raised,
    otherwise with the default status. Failures are logged when ``log_error``
    is set and are always re-raised.
    """
    if descr:
        descr = _descr_str(descr, data, region)
        logger.debug(descr)
    cmd_id = diagnostics.start_cmd(cmd, descr or "", data)
    try:
        # NOTE: ``basestring`` exists only on Python 2.
        logger_cl.debug(" ".join(
            str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout, env=env)
    except:
        # Deliberately broad: any exit from the command, interrupts included,
        # is recorded as a failure before being re-raised.
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    else:
        # Success path only. With ``finally`` here the command was ended a
        # second time after the failure had already been recorded.
        diagnostics.end_cmd(cmd_id)
コード例 #13
0
ファイル: do.py プロジェクト: zeneofa/bcbio
def run_memory_retry(cmd, descr, data=None, check=None, region=None):
    """Run command, retrying when detecting fail due to memory errors.

    This is useful for high throughput Java jobs which fail
    intermittently due to an inability to get system resources.

    After the first attempt the command is retried at most ``max_runs``
    times with a 30 second pause, and only when the failure text contains a
    known memory/resource message. Other failures, and failures once the
    retries are exhausted, are logged and re-raised.
    """
    max_runs = 5
    num_runs = 0
    # Failure-message fragments treated as transient resource problems.
    retry_markers = ("insufficient memory",
                     "did not provide enough memory",
                     "A fatal error has been detected",
                     "java.lang.OutOfMemoryError",
                     "Resource temporarily unavailable")
    while True:
        try:
            run(cmd, descr, data, check, region=region, log_error=False)
            break
        # The comma form of ``except`` is Python 2 only; ``as`` works on both.
        except subprocess.CalledProcessError as msg:
            err_text = str(msg)
            if num_runs < max_runs and any(marker in err_text for marker in retry_markers):
                logger.info("Retrying job. Memory or resource issue with run: %s"
                            % _descr_str(descr, data, region))
                time.sleep(30)
                num_runs += 1
            else:
                logger.exception()
                raise
コード例 #14
0
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg,
                        (list, tuple)) and config_utils.is_nested_config_arg(
                            arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
コード例 #15
0
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if config_utils.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif config_utils.is_std_config_arg(arg):
            config = arg
            break
        elif isinstance(arg, (list, tuple)) and config_utils.is_nested_config_arg(arg[0]):
            config = arg[0]["config"]
            break
    if config is None:
        raise NotImplementedError("No config found in arguments: %s" % args[0])
    handler = setup_local_logging(config, config.get("parallel", {}))
    try:
        yield config
    except:
        logger.exception("Unexpected error")
        raise
    finally:
        if hasattr(handler, "close"):
            handler.close()
コード例 #16
0
ファイル: cn_mops.py プロジェクト: Tmacme/bcbio-nextgen
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.

    The output BED is named from the first work BAM, the current batch (when
    there is one) and ``chrom`` ("all" when ``chrom`` is empty). Nothing is
    run when that file already exists. When cn.mops fails with an allowed
    error state an empty track file is written instead of raising; other
    failures are logged and re-raised. The visible code does not return a
    value.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                       ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # The generated R script is stored alongside the final output.
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            # ``except ... as`` is valid on Python 2.6+ and Python 3; the
            # comma form is a SyntaxError on Python 3.
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
コード例 #17
0
def make_scrnaseq_object(samples):
    """Prepare the R script intended to build the initial ``se.rda`` object.

    Writes ``se-run.R`` beside the combined counts, filled from the module
    template. The Rscript call is currently commented out, so R is not run
    here. Nothing is done when ``se.rda`` already exists.

    NOTE: the template is filled via ``_script.format(**locals())``; keep the
    local variable names in sync with ``_script``.
    """
    local_sitelib = R_sitelib()
    combined_counts = dd.get_in_samples(samples, dd.get_combined_counts)
    counts_dir = os.path.dirname(combined_counts)
    # Use the transcriptome GTF when available, otherwise the standard GTF.
    gtf_file = (dd.get_in_samples(samples, dd.get_transcriptome_gtf)
                or dd.get_in_samples(samples, dd.get_gtf_file))
    rda_file = os.path.join(counts_dir, "se.rda")
    if file_exists(rda_file):
        return
    with file_transaction(rda_file) as tx_out_file:
        rda_base = os.path.splitext(rda_file)[0]
        rcode = rda_base + "-run.R"
        rrna_file = _find_rRNA_genes(gtf_file, rda_base + "-rrna.txt")
        with open(rcode, "w") as out_handle:
            out_handle.write(_script.format(**locals()))
        rscript = Rscript_cmd()
        try:
            # Rscript execution is disabled for now:
            # do.run([rscript, "--no-environ", rcode],
            #        "SingleCellExperiment",
            #        log_error=False)
            rda_file = rcode
        except subprocess.CalledProcessError:
            logger.exception()
コード例 #18
0
ファイル: realign.py プロジェクト: gturco/bcbb
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Realign a BAM file around indels with GATK IndelRealigner.

    ``out_file`` defaults to the input BAM name with a ``-realign.bam``
    suffix. An existing output is reused without running GATK. ``region``
    restricts the run with ``-L``; ``deep_coverage`` adds larger read and
    consensus limits.

    Returns the path of the realigned BAM file.
    """
    if out_file is None:
        out_file = os.path.splitext(align_bam)[0] + "-realign.bam"
    if file_exists(out_file):
        return out_file

    bam_name = os.path.basename(align_bam)
    with curdir_tmpdir() as tmp_dir, file_transaction(out_file) as tx_out_file:
        logger.info("GATK IndelRealigner: %s %s" % (bam_name, region))
        params = ["-T", "IndelRealigner",
                  "-I", align_bam,
                  "-R", ref_file,
                  "-targetIntervals", intervals,
                  "-o", tx_out_file,
                  "-l", "INFO"]
        if region:
            params.extend(["-L", region])
        if deep_coverage:
            params.extend(["--maxReadsInMemory", "300000",
                           "--maxReadsForRealignment", "500000",
                           "--maxReadsForConsensuses", "500",
                           "--maxConsensuses", "100"])
        try:
            runner.run_gatk(params, tmp_dir)
        except:
            # Log with context, then let the original error propagate.
            logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                bam_name, region))
            raise
    return out_file
コード例 #19
0
def process(args):
    """Run the function in args.name given arguments in args.argfile.

    ``args`` is read for the attributes ``name``, ``argfile``, ``moreargs``,
    ``raw`` and ``outfile``. The function is looked up by name on
    ``multitasks``. Its arguments come either directly from the command line
    (``moreargs``/``raw``) or from the YAML file ``argfile``. When the first
    argument is the literal "cwl", the arguments are rebuilt through
    ``_world_from_cwl`` and results go to ``cwl.output.json`` in the work
    directory. The function runs inside the work directory with local logging
    active, and results are written out only when an output file was chosen.

    Raises:
        AttributeError: if ``multitasks`` has no attribute named ``args.name``.
    """
    # Point the locale environment variables at the locale chosen by utils.get_locale()
    locale_to_use = utils.get_locale()
    os.environ["LC_ALL"] = locale_to_use
    os.environ["LC"] = locale_to_use
    os.environ["LANG"] = locale_to_use
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError(
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'"
            % args.name)
    if args.moreargs or args.raw:
        # Arguments supplied directly: no work directory and no output file are derived.
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        # Arguments stored as YAML; outputs default to "<argfile stem>-out<ext>".
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(
            args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(
            args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys, input_files = None, {}, []
    with utils.chdir(work_dir):
        with contextlib.closing(
                log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(*fnargs)
            except:
                # Log the failure with its traceback, then re-raise it unchanged.
                logger.exception()
                raise
            finally:
                # Clean up any copied and unpacked workflow inputs, avoiding extra disk usage
                wf_input_dir = os.path.join(work_dir, "wf-inputs")
                if os.path.exists(wf_input_dir) and os.path.isdir(
                        wf_input_dir):
                    shutil.rmtree(wf_input_dir)
    if argfile:
        # Write results only when an output file was determined above.
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, out_keys,
                               input_files, work_dir)
        except:
            logger.exception()
            raise
コード例 #20
0
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.

    Returns ``in_bam`` unchanged when it is already sorted in the requested
    ``order``; otherwise returns the path of the sorted BAM, which is reused
    when it already exists. Sorting uses sambamba when it is available and
    samtools otherwise; if the multi-core attempt fails, the sort is retried
    with single-core samtools.

    NOTE: the command templates are filled via ``.format(**locals())``, so
    the local variable names below are part of the templates' contract.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources(
                        "sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores,
                                                 "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(
                    cmd.format(**locals()),
                    "Sort BAM file (multi core, %s): %s to %s" %
                    (order, os.path.basename(in_bam),
                     os.path.basename(sort_file)))
            # Only ordinary errors trigger the fallback. A bare ``except``
            # would also catch KeyboardInterrupt/SystemExit and start a second
            # sort after the user asked to stop.
            except Exception:
                logger.exception(
                    "Multi-core sorting failed, reverting to single core")
                # Reset the values the samtools template reads, since the
                # sambamba branch may have overwritten them.
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(
                    samtools_cmd.format(**locals()),
                    "Sort BAM file (single core, %s): %s to %s" %
                    (order, os.path.basename(in_bam),
                     os.path.basename(sort_file)))
    return sort_file
コード例 #21
0
def _run_purecn(paired, work_dir):
    """Run the PureCN.R wrapper on pre-segmented CNVkit or GATK4 inputs.

    PureCN is skipped when the RDS output is up to date relative to the
    normalized bins, or when a previous run left a failure log. An allowed
    PureCN error is recorded in that failure log instead of raising; other
    failures are logged and re-raised.

    Returns the PureCN output dictionary when its "rds" file exists on disk,
    otherwise None.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit,
              "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    already_handled = (utils.file_uptodate(out["rds"], cnr_file)
                       or utils.file_exists(failed_file))
    if not already_handled:
        tumor = paired.tumor_data
        segment_fn = segfns[cnvkit.bin_approach(tumor)]
        cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        variants = heterogeneity.get_variants(tumor, include_germline=False)
        vcf_file = germline.filter_to_pass_and_reject(
            variants[0]["vrn_file"], paired, out_dir=work_dir)
        with file_transaction(tumor, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            build = dd.get_genome_build(tumor)
            genome = "hg19" if build in ("GRCh37", "hg19") else build
            purecn_args = ["PureCN.R", "--seed", "42",
                           "--out", tx_out_base,
                           "--rds", "%s.rds" % tx_out_base,
                           "--sampleid", dd.get_sample_name(tumor),
                           "--genome", genome,
                           "--vcf", vcf_file,
                           "--tumor", cnr_file,
                           "--segfile", seg_file,
                           "--funsegmentation", "Hclust",
                           "--maxnonclonal", "0.3"]
            num_cores = dd.get_num_cores(tumor)
            if num_cores > 1:
                purecn_args.extend(["--cores", str(num_cores)])
            try:
                shell_cmd = "export R_LIBS_USER=%s && %s && %s" % (
                    utils.R_sitelib(), utils.get_R_exports(),
                    " ".join(str(x) for x in purecn_args))
                do.run(shell_cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as err:
                if not _allowed_errors(str(err)):
                    logger.exception()
                    raise
                logger.info(
                    "PureCN failed to find solution for %s: skipping" %
                    dd.get_sample_name(tumor))
                with open(failed_file, "w") as fail_handle:
                    fail_handle.write(str(err))
            # Move every produced file from the transaction directory to its final location.
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for fname in all_files:
                staged = os.path.join(tx_dir, fname)
                if os.path.exists(staged):
                    shutil.move(staged, os.path.join(final_dir, fname))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    if out.get("rds") and os.path.exists(out["rds"]):
        return out
    return None
コード例 #22
0
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Streams bamtofastq output through bgzip into files inside the
    ``align_prep`` directory under ``dirs["work"]``.

    Args:
        bam_file: path to the input BAM file.
        dirs: dictionary of directories; only ``dirs["work"]`` is read.
        config: configuration dictionary; ``config["algorithm"]["num_cores"]``
            is read directly and the whole dictionary is handed to the
            program/resource lookup helpers.
        is_retry: True when re-running after a "deflate failed" error. Forces
            regeneration even when the first output file already exists and
            is forwarded to ``tools.get_bgzip_cmd``.
        output_infix: text inserted between the BAM base name and the
            ``-1.fq.gz`` / ``-2.fq.gz`` suffix.

    Returns:
        List of the output fastq paths (read 1, then read 2) that exist.

    Raises:
        subprocess.CalledProcessError: when the conversion fails, unless this
            is the first attempt and the error text contains "deflate failed".
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier (possibly failed) attempts.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            # NOTE: the names below are looked up by cmd.format(**locals()),
            # so they must match the placeholders in the command template.
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                # Read 2 is written straight to its final location; only the
                # read 1 file goes through the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        # Forward output_infix so the retry writes (and returns) the same
        # file names the caller asked for.
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True,
                               output_infix=output_infix)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
コード例 #23
0
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.

    Streams bamtofastq output through bgzip into files inside the
    ``align_prep`` directory under ``dirs["work"]``. When
    ``_seqtk_fastq_prep_cl`` returns a command for a read, it is piped in
    front of the bgzip step for that read.

    Args:
        bam_file: path to the input BAM file.
        dirs: dictionary of directories; only ``dirs["work"]`` is read.
        data: sample dictionary; ``data["config"]`` supplies the configuration.
        is_retry: True when re-running after a "deflate failed" error. Forces
            regeneration even when the first output file already exists and
            is forwarded to ``tools.get_bgzip_cmd``.
        output_infix: text inserted between the BAM base name and the
            ``-1.fq.gz`` / ``-2.fq.gz`` suffix.

    Returns:
        List of the output fastq paths (read 1, then read 2) that exist.

    Raises:
        subprocess.CalledProcessError: when the conversion fails, unless this
            is the first attempt and the error text contains "deflate failed".
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" % (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier (possibly failed) attempts.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            # NOTE: the names below are looked up by cmd.format(**locals()),
            # so they must match the placeholders in the command template.
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                # Read 2 is written straight to its final location; only the
                # read 1 file goes through the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        # Forward output_infix so the retry writes (and returns) the same
        # file names the caller asked for.
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True,
                               output_infix=output_infix)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
コード例 #24
0
ファイル: __init__.py プロジェクト: bennyyu686/bcbio-nextgen
def sort(in_bam, config, order="coordinate"):
    """Sort a BAM file, skipping if already present.

    Uses sambamba when ``_get_sambamba`` finds it and samtools otherwise. If
    the first attempt fails, the sort is repeated with samtools on one core.

    Args:
        in_bam: path to the BAM file to sort.
        config: configuration dictionary; ``config["algorithm"]["num_cores"]``
            and the samtools/sambamba resource settings are read.
        order: "queryname" adds the name-sorting flag; any other value adds
            no ordering flag to the command.

    Returns:
        ``in_bam`` when ``bam_already_sorted`` reports it is already in the
        requested order, otherwise the path of the sorted BAM file.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    if bam_already_sorted(in_bam, config, order):
        return in_bam

    sort_stem = _get_sort_stem(in_bam, order)
    sort_file = sort_stem + ".bam"
    if not utils.file_exists(sort_file):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, sort_file) as tx_sort_file:
            # NOTE: the command templates are filled with format(**locals()),
            # so the local names below must match their placeholders.
            tx_sort_stem = os.path.splitext(tx_sort_file)[0]
            tx_dir = utils.safe_makedir(os.path.dirname(tx_sort_file))
            order_flag = "-n" if order == "queryname" else ""
            resources = config_utils.get_resources("samtools", config)
            mem = resources.get("memory", "2G")
            samtools_cmd = ("{samtools} sort -@ {cores} -m {mem} {order_flag} "
                            "{in_bam} {tx_sort_stem}")
            if sambamba:
                if tz.get_in(["resources", "sambamba"], config):
                    sm_resources = config_utils.get_resources("sambamba", config)
                    mem = sm_resources.get("memory", "2G")
                # sambamba uses total memory, not memory per core
                mem = config_utils.adjust_memory(mem, cores, "increase").upper()
                # Use samtools compatible natural sorting
                # https://github.com/lomereiter/sambamba/issues/132
                order_flag = "--natural-sort" if order == "queryname" else ""
                cmd = ("{sambamba} sort -t {cores} -m {mem} {order_flag} "
                       "-o {tx_sort_file} --tmpdir={tx_dir} {in_bam}")
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()),
                       "Sort BAM file (multi core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
            except Exception:
                # Only real failures trigger the fallback; KeyboardInterrupt
                # and SystemExit propagate instead of starting another sort.
                logger.exception("Multi-core sorting failed, reverting to single core")
                # Reset the values the sambamba branch may have changed
                # before re-rendering the samtools template.
                resources = config_utils.get_resources("samtools", config)
                mem = resources.get("memory", "2G")
                cores = 1
                order_flag = "-n" if order == "queryname" else ""
                do.run(samtools_cmd.format(**locals()),
                       "Sort BAM file (single core, %s): %s to %s" %
                       (order, os.path.basename(in_bam),
                        os.path.basename(sort_file)))
    return sort_file
コード例 #25
0
def process(args):
    """Look up ``args.name`` in multitasks and run it on the supplied arguments.

    Arguments come either directly from the command line (``args.argfile``
    plus ``args.moreargs``) or from the YAML file named by ``args.argfile``.
    In the YAML case the result is written to an output argument file; for
    CWL runs this is ``cwl.output.json`` in the working directory.

    Raises:
        AttributeError: if multitasks exposes no function named ``args.name``.
    """
    # Use the standard C locale so decimals use periods and output is not localized
    for env_var in ("LC_ALL", "LC", "LANG"):
        os.environ[env_var] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError(
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'"
            % args.name)
    work_dir, argfile = None, None
    if args.moreargs or args.raw:
        # Raw mode: the "argfile" is itself the first positional argument.
        fnargs = [args.argfile] + args.moreargs
    else:
        with open(args.argfile) as arg_handle:
            fnargs = yaml.safe_load(arg_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile or "%s-out%s" % os.path.splitext(args.argfile)
    work_dir = work_dir or os.getcwd()
    parallel, out_keys = None, {}
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:],
                                                     work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    with utils.chdir(work_dir), contextlib.closing(
            log.setup_local_logging(parallel={"wrapper": "runfn"})):
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
    if not argfile:
        return
    try:
        _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
    except:
        logger.exception()
        raise
    if argfile.endswith(".json"):
        _write_wdl_outputs(argfile, out_keys)
コード例 #26
0
ファイル: runfn.py プロジェクト: chapmanb/bcbio-nextgen
def process(args):
    """Look up ``args.name`` in multitasks and run it on the supplied arguments.

    Arguments come either directly from the command line (``args.argfile``
    plus ``args.moreargs``) or from the YAML file named by ``args.argfile``.
    In the YAML case the result is written to an output argument file; for
    CWL runs this is ``cwl.output.json`` in the working directory.

    Raises:
        AttributeError: if multitasks exposes no function named ``args.name``.
    """
    # Use the standard C locale so decimals use periods and output is not localized
    for env_var in ("LC_ALL", "LC", "LANG"):
        os.environ[env_var] = "C"
    setpath.prepend_bcbiopath()
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    work_dir, argfile = None, None
    if args.moreargs or args.raw:
        # Raw mode: the "argfile" is itself the first positional argument.
        fnargs = [args.argfile] + args.moreargs
    else:
        with open(args.argfile) as arg_handle:
            fnargs = yaml.safe_load(arg_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile or "%s-out%s" % os.path.splitext(args.argfile)
    work_dir = work_dir or os.getcwd()
    parallel, out_keys, input_files = None, {}, []
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys, input_files = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    with utils.chdir(work_dir), contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
        try:
            out = fn(*fnargs)
        except:
            logger.exception()
            raise
        finally:
            # Remove copied and unpacked workflow inputs whether or not the
            # function succeeded, avoiding extra disk usage
            wf_input_dir = os.path.join(work_dir, "wf-inputs")
            if os.path.exists(wf_input_dir) and os.path.isdir(wf_input_dir):
                shutil.rmtree(wf_input_dir)
    if not argfile:
        return
    try:
        _write_out_argfile(argfile, out, fnargs, parallel, out_keys, input_files, work_dir)
    except:
        logger.exception()
        raise
コード例 #27
0
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Realign a BAM file around the supplied target intervals with GATK.

    Args:
        runner: object providing ``run_gatk(params, tmp_dir)``.
        align_bam: input BAM path.
        ref_file: reference file passed with ``-R``.
        intervals: target intervals passed with ``-targetIntervals``.
        region: optional region passed with ``-L``.
        out_file: output BAM path; defaults to the input name with a
            ``-realign.bam`` suffix.
        deep_coverage: when true, adds explicit read and consensus limits.

    Returns:
        Path to the realigned BAM file. Nothing is run when it already exists.
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if file_exists(out_file):
        return out_file
    with curdir_tmpdir() as tmp_dir:
        with file_transaction(out_file) as tx_out_file:
            bam_name = os.path.basename(align_bam)
            logger.info("GATK IndelRealigner: %s %s" % (bam_name, region))
            params = ["-T", "IndelRealigner",
                      "-I", align_bam,
                      "-R", ref_file,
                      "-targetIntervals", intervals,
                      "-o", tx_out_file,
                      "-l", "INFO"]
            if region:
                params.extend(["-L", region])
            if deep_coverage:
                params.extend(["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"])
            try:
                runner.run_gatk(params, tmp_dir)
            except:
                failure_msg = "Running GATK IndelRealigner failed: {} {}".format(
                    os.path.basename(align_bam), region)
                logger.exception(failure_msg)
                raise
    return out_file
コード例 #28
0
ファイル: do.py プロジェクト: MCowperthwaite/bcbio-nextgen
def run(cmd, descr, data=None, checks=None, region=None, log_error=True):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: command to run, either a single string or a sequence of string
            arguments.
        descr: description of the step; combined with ``data`` and ``region``
            by ``_descr_str`` for logging.
        data: optional sample information passed to the description and
            diagnostics helpers.
        checks: optional checks handed to ``_do_run``.
        region: optional region included in the description.
        log_error: when False, a failure is re-raised without being logged.

    Raises:
        Whatever ``_do_run`` raises; the exception is always propagated.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        # NOTE(review): basestring only exists on Python 2.
        logger_cl.debug(" ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    else:
        # Only reached on success, so a failed command is reported once (with
        # the False flag) instead of being followed by an unflagged end_cmd.
        diagnostics.end_cmd(cmd_id)
コード例 #29
0
ファイル: do.py プロジェクト: brentp/bcbio-nextgen
def run(cmd, descr, data=None, checks=None):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: command to run, either a single string or a sequence of string
            arguments.
        descr: description of the step; when ``data`` is given, the last
            element of ``data["name"]`` is appended to it.
        data: optional sample dictionary.
        checks: optional checks handed to ``_do_run``.

    Raises:
        Whatever ``_do_run`` raises; the exception is logged and propagated.
    """
    if data:
        descr = "{0} : {1}".format(descr, data["name"][-1])
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        # NOTE(review): basestring only exists on Python 2.
        logger_cl.debug(" ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        logger.exception()
        raise
    else:
        # Only reached on success, so a failed command is reported once (with
        # the False flag) instead of being followed by an unflagged end_cmd.
        diagnostics.end_cmd(cmd_id)
コード例 #30
0
ファイル: ipythontasks.py プロジェクト: kevyin/bcbb
def _setup_logging(args):
    """Configure logging from the task arguments, then yield to the caller.

    Looks at the first and then the last element of ``args[0]`` for a
    configuration: a dictionary with a "config" key supplies
    ``config["config"]``, a dictionary with an "algorithm" key is used as is.
    If neither element matches, logging is set up with ``None``.

    Written as a generator with a single ``yield`` -- presumably wrapped by
    ``contextlib.contextmanager`` outside this block (TODO confirm). Errors
    raised into the generator are logged and re-raised.
    """
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            # ``in`` replaces dict.has_key, which was removed in Python 3.
            if isinstance(config, dict) and "config" in config:
                config = config["config"]
                break
            elif isinstance(config, dict) and "algorithm" in config:
                break
            else:
                config = None
        setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
コード例 #31
0
ファイル: do.py プロジェクト: cauyrd/bcbio-nextgen
def run(cmd, descr, data=None, checks=None, region=None, log_error=True,
        log_stdout=False):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: command to run, either a single string or a sequence of
            arguments (each converted with ``str`` for logging).
        descr: description of the step; combined with ``data`` and ``region``
            by ``_descr_str`` for logging.
        data: optional sample information passed to the description and
            diagnostics helpers.
        checks: optional checks handed to ``_do_run``.
        region: optional region included in the description.
        log_error: when False, a failure is re-raised without being logged.
        log_stdout: forwarded to ``_do_run``.

    Raises:
        Whatever ``_do_run`` raises; the exception is always propagated.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    cmd_id = diagnostics.start_cmd(cmd, descr, data)
    try:
        # NOTE(review): basestring only exists on Python 2.
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    else:
        # Only reached on success, so a failed command is reported once (with
        # the False flag) instead of being followed by an unflagged end_cmd.
        diagnostics.end_cmd(cmd_id)
コード例 #32
0
ファイル: ipythontasks.py プロジェクト: rwness/bcbb
def _setup_logging(args):
    """Configure logging from the task arguments, then yield to the caller.

    Looks at the first and then the last element of ``args[0]`` for a
    configuration: a dictionary with a "config" key supplies
    ``config["config"]``, a dictionary with an "algorithm" key is used as is.
    If neither element matches, logging is set up with ``None``.

    Written as a generator with a single ``yield`` -- presumably wrapped by
    ``contextlib.contextmanager`` outside this block (TODO confirm). Errors
    raised into the generator are logged and re-raised.
    """
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            # ``in`` replaces dict.has_key, which was removed in Python 3.
            if isinstance(config, dict) and "config" in config:
                config = config["config"]
                break
            elif isinstance(config, dict) and "algorithm" in config:
                break
            else:
                config = None
        setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
コード例 #33
0
ファイル: do.py プロジェクト: yangjl/bcbio-nextgen
def run(cmd, descr, data, checks=None):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: command to run, either a single string or a sequence of string
            arguments.
        descr: description of the step; when ``data`` is truthy, the last
            element of ``data["name"]`` is appended to it.
        data: sample dictionary (may be falsy to skip the name suffix).
        checks: optional checks handed to ``_do_run``.

    Raises:
        Whatever ``_do_run`` raises; the exception is logged and propagated.
    """
    if data:
        descr = "{0} : {1}".format(descr, data["name"][-1])
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        # NOTE(review): basestring only exists on Python 2.
        logger_cl.debug(
            " ".join(cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks)
    except:
        diagnostics.end_cmd(cmd_id, False)
        logger.exception()
        raise
    else:
        # Only reached on success, so a failed command is reported once (with
        # the False flag) instead of being followed by an unflagged end_cmd.
        diagnostics.end_cmd(cmd_id)
コード例 #34
0
def run(cmd, descr, data=None, checks=None, region=None, log_error=True,
        log_stdout=False):
    """Run the provided command, logging details and checking for errors.

    Args:
        cmd: command to run, either a single string or a sequence of
            arguments (each converted with ``str`` for logging).
        descr: description of the step; combined with ``data`` and ``region``
            by ``_descr_str`` for logging.
        data: optional sample information passed to the description and
            diagnostics helpers.
        checks: optional checks handed to ``_do_run``.
        region: optional region included in the description.
        log_error: when False, a failure is re-raised without being logged.
        log_stdout: forwarded to ``_do_run``.

    Raises:
        Whatever ``_do_run`` raises; the exception is always propagated.
    """
    descr = _descr_str(descr, data, region)
    logger.debug(descr)
    # TODO: Extract entity information from data input
    cmd_id = diagnostics.start_cmd(descr, data, cmd)
    try:
        # NOTE(review): basestring only exists on Python 2.
        logger_cl.debug(" ".join(str(x) for x in cmd) if not isinstance(cmd, basestring) else cmd)
        _do_run(cmd, checks, log_stdout)
    except:
        diagnostics.end_cmd(cmd_id, False)
        if log_error:
            logger.exception()
        raise
    else:
        # Only reached on success, so a failed command is reported once (with
        # the False flag) instead of being followed by an unflagged end_cmd.
        diagnostics.end_cmd(cmd_id)
コード例 #35
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    Builds and runs a ``smoove call`` command for the samples in ``items``
    and writes ``<sample><ext>-smoove.genotyped.vcf.gz`` into ``work_dir``.
    If a ``-prep.vcf.gz`` file named after the first sample's alignment BAM
    already exists, it is returned instead and nothing is run.

    When smoove fails with a message accepted by ``_allowed_errors``, an
    empty VCF is written in place of the calls; any other failure is logged
    and re-raised.

    Returns:
        Tuple of (VCF output path, exclusion BED path from
        ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    # Reuse an existing "-prep" output when present rather than re-running.
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # NOTE: the command template below is filled with
            # cmd.format(**locals()), so these local names must match its
            # placeholders.
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            # The full_bams parameter is rebound to a space separated string
            # of the prepared BAM paths for use on the command line.
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                # True when the contig name starts or ends with one of the
                # standard patterns (with the "~" and "^" markers removed).
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            # Explicitly list contigs that is_nonalt rejects and that the
            # standard patterns do not already cover.
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # Tolerated failures produce an empty VCF for all samples.
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
コード例 #36
0
ファイル: lumpy.py プロジェクト: chapmanb/bcbio-nextgen
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    Builds and runs a ``smoove call`` command for the samples in ``items``
    and writes ``<sample><ext>-smoove.genotyped.vcf.gz`` into ``work_dir``.
    If a ``-prep.vcf.gz`` file named after the first sample's alignment BAM
    already exists, it is returned instead and nothing is run.

    When smoove fails with a message accepted by ``_allowed_errors``, an
    empty VCF is written in place of the calls; any other failure is logged
    and re-raised.

    Returns:
        Tuple of (VCF output path, exclusion BED path from
        ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    # Reuse an existing "-prep" output when present rather than re-running.
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            # NOTE: the command template below is filled with
            # cmd.format(**locals()), so these local names must match its
            # placeholders.
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            # The full_bams parameter is rebound to a space separated string
            # of the prepared BAM paths for use on the command line.
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                # True when the contig name starts or ends with one of the
                # standard patterns (with the "~" and "^" markers removed).
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            # Explicitly list contigs that is_nonalt rejects and that the
            # standard patterns do not already cover.
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # Hand the error text, not the exception object, to the
                    # message matcher.
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
コード例 #37
0
ファイル: runfn.py プロジェクト: DoaneAS/bcbio-nextgen
def process(args):
    """Look up ``args.name`` in multitasks and run it on the supplied arguments.

    Arguments come either directly from the command line (``args.argfile``
    plus ``args.moreargs``) or from the YAML file named by ``args.argfile``.
    In the YAML case the result is written to an output argument file; for
    CWL runs this is ``cwl.output.json`` in the working directory.

    Raises:
        AttributeError: if multitasks exposes no function named ``args.name``.
    """
    # Use the standard C locale so decimals use periods and output is not localized
    for env_var in ("LC_ALL", "LC", "LANG"):
        os.environ[env_var] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    work_dir, argfile = None, None
    if args.moreargs or args.raw:
        # Raw mode: the "argfile" is itself the first positional argument.
        fnargs = [args.argfile] + args.moreargs
    else:
        with open(args.argfile) as arg_handle:
            fnargs = yaml.safe_load(arg_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile or "%s-out%s" % os.path.splitext(args.argfile)
    work_dir = work_dir or os.getcwd()
    parallel, out_keys = None, {}
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    with utils.chdir(work_dir), contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise
    if not argfile:
        return
    try:
        _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
    except:
        logger.exception()
        raise
    if argfile.endswith(".json"):
        _write_wdl_outputs(argfile, out_keys)
コード例 #38
0
ファイル: alignprep.py プロジェクト: tfmorris/bcbio-nextgen
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.

    Streams bamtofastq output through bgzip into files inside the
    ``align_prep`` directory under ``dirs["work"]``.

    Args:
        bam_file: path to the input BAM file.
        dirs: dictionary of directories; only ``dirs["work"]`` is read.
        config: configuration dictionary; ``config["algorithm"]["num_cores"]``
            is read directly and the whole dictionary is handed to the
            program/resource lookup helpers.
        is_retry: True when re-running after a "deflate failed" error. Forces
            regeneration even when the first output file already exists and
            is forwarded to ``tools.get_bgzip_cmd``.

    Returns:
        List of the output fastq paths (read 1, then read 2) that exist.

    Raises:
        subprocess.CalledProcessError: when the conversion fails, unless this
            is the first attempt and the error text contains "deflate failed".
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            # Clear leftovers from earlier (possibly failed) attempts.
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            # NOTE: the names below are looked up by cmd.format(**locals()),
            # so they must match the placeholders in the command template.
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                # Read 2 is written straight to its final location; only the
                # read 1 file goes through the transaction.
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            # "as" form is valid on Python 2.6+ and Python 3; the comma form
            # is a syntax error on Python 3.
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    # Act on the retry flag so the retry announced in the log actually runs,
    # and hand the produced files back to the caller.
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2] if x is not None and utils.file_exists(x)]
コード例 #39
0
ファイル: purecn.py プロジェクト: chapmanb/bcbio-nextgen
def _run_purecn(paired, work_dir):
    """Call PureCN.R on pre-segmented CNVkit or GATK4 input for the tumor sample.

    The run is skipped when the ``rds`` output is up to date relative to the
    normalized bin file, or when an earlier run left a ``-failed.log`` file.
    A failure accepted by ``_allowed_errors`` is recorded in that log file
    instead of being raised.

    Returns:
        Dictionary of PureCN output files when the ``rds`` file exists,
        otherwise None.
    """
    segmenters = {"cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    tumor = paired.tumor_data
    cnr_file = tz.get_in(["depth", "bins", "normalized"], tumor)
    needs_run = not (utils.file_uptodate(out["rds"], cnr_file) or utils.file_exists(failed_file))
    if needs_run:
        segment_fn = segmenters[cnvkit.bin_approach(tumor)]
        cnr_file, seg_file = segment_fn(cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        variants = heterogeneity.get_variants(tumor, include_germline=False)
        vcf_file = germline.filter_to_pass_and_reject(variants[0]["vrn_file"], paired, out_dir=work_dir)
        with file_transaction(tumor, out_base) as tx_out_base:
            # BSgenome support needs UCSC style naming for human builds
            build = dd.get_genome_build(tumor)
            genome = "hg19" if build in ["GRCh37", "hg19"] else build
            purecn_args = ["PureCN.R", "--seed", "42",
                           "--out", tx_out_base,
                           "--rds", "%s.rds" % tx_out_base,
                           "--sampleid", dd.get_sample_name(tumor),
                           "--genome", genome,
                           "--vcf", vcf_file,
                           "--tumor", cnr_file,
                           "--segfile", seg_file,
                           "--funsegmentation", "Hclust",
                           "--maxnonclonal", "0.3"]
            num_cores = dd.get_num_cores(tumor)
            if num_cores > 1:
                purecn_args.extend(["--cores", str(num_cores)])
            try:
                purecn_cl = " ".join([str(x) for x in purecn_args])
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(), purecn_cl)
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if not _allowed_errors(str(msg)):
                    logger.exception()
                    raise
                logger.info("PureCN failed to find solution for %s: skipping" %
                            dd.get_sample_name(tumor))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            # Move whatever outputs were produced out of the transaction directory
            tx_dir = os.path.dirname(tx_out_base)
            final_dir = os.path.dirname(out_base)
            for fname in all_files:
                tx_path = os.path.join(tx_dir, fname)
                if os.path.exists(tx_path):
                    shutil.move(tx_path, os.path.join(final_dir, fname))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    if out.get("rds") and os.path.exists(out["rds"]):
        return out
    return None
コード例 #40
0
def _setup_logging(args):
    """Find a configuration among the arguments, set up logging and yield.

    A single list or tuple argument is unpacked first. The first argument
    recognized as a nested config supplies ``arg["config"]``; the first one
    recognized as a standard config is used directly.

    Written as a generator with a single ``yield`` -- presumably wrapped by
    ``contextlib.contextmanager`` outside this block (TODO confirm). Errors
    raised into the generator are logged and re-raised.

    Raises:
        NotImplementedError: when no configuration is found.
    """
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for candidate in args:
        if ipython.is_nested_config_arg(candidate):
            config = candidate["config"]
            break
        if ipython.is_std_config_arg(candidate):
            config = candidate
            break
    else:
        config = None
    if config is None:
        raise NotImplementedError("No config in %s:" % args[0])
    setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
コード例 #41
0
ファイル: runfn.py プロジェクト: jielovedata/bcbio-nextgen
def process(args):
    """Run the multitasks function named by args.name.

    Arguments for the function come from one of two places:

    - when args.moreargs or args.raw is set, args.argfile followed by
      args.moreargs is used directly as the argument list;
    - otherwise args.argfile is loaded as YAML and passed through
      config_utils.merge_resources, and an output path is chosen:
      args.outfile if given, else args.argfile with "-out" inserted before
      its extension.

    If the first argument is the string "cwl", the remaining arguments are
    converted with _world_from_cwl and the output path becomes
    cwl.output.json in the work directory.

    The function is called from inside the work directory (the directory of
    args.argfile in the YAML case, falling back to the current directory).
    When an output path was chosen, the result is written with
    _write_out_argfile. Failures in either step are logged and re-raised.

    Raises:
        AttributeError: multitasks has no attribute named args.name.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        not_found = (
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'"
            % args.name)
        raise AttributeError(not_found)

    work_dir = None
    argfile = None
    if args.moreargs or args.raw:
        # Arguments were supplied directly; nothing is read from disk.
        fnargs = [args.argfile] + args.moreargs
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        if args.outfile:
            argfile = args.outfile
        else:
            argfile = "%s-out%s" % os.path.splitext(args.argfile)
    work_dir = work_dir or os.getcwd()

    parallel = None
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")

    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise

    if not argfile:
        return
    try:
        _write_out_argfile(argfile, out, fnargs, parallel, work_dir)
    except:
        logger.exception()
        raise
コード例 #42
0
ファイル: bubbletree.py プロジェクト: nickholz/bcbio-nextgen
def _run_bubbletree(vcf_csv, cnv_csv, data):
    """Create R script and run on input data

    Output paths are derived from vcf_csv with its extension removed:
    <base>-run.R (the generated script), <base>-bubbles.pdf,
    <base>-bubbletree_prev_model.pdf and <base>-bubbletree_prevalence.txt.

    The script is (re)written on every call; Rscript is only invoked when
    the prevalence file does not exist yet. If Rscript fails with an error
    accepted by _allowed_bubbletree_errorstates, the failure text is written
    to the prevalence file instead; any other failure is logged and
    re-raised.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"), "lib", "R", "site-library")
    base = utils.splitext_plus(vcf_csv)[0]
    r_file = "%s-run.R" % base
    bubbles_out = "%s-bubbles.pdf" % base
    prev_model_out = "%s-bubbletree_prev_model.pdf" % base
    freqs_out = "%s-bubbletree_prevalence.txt" % base
    with open(r_file, "w") as out_handle:
        # The template is filled from this function's local variables, so the
        # parameter and local names above are part of the template contract.
        out_handle.write(_script.format(**locals()))
    if not utils.file_exists(freqs_out):
        try:
            do.run(["Rscript", r_file], "Assess heterogeneity with BubbleTree")
        # "except X as name" is accepted by Python 2.6+ and Python 3; the
        # older "except X, name" form is a syntax error on Python 3.
        except subprocess.CalledProcessError as msg:
            if _allowed_bubbletree_errorstates(str(msg)):
                with open(freqs_out, "w") as out_handle:
                    out_handle.write('bubbletree failed:\n %s"\n' % (str(msg)))
            else:
                logger.exception()
                raise
コード例 #43
0
ファイル: runfn.py プロジェクト: vivianjie/bcbio-nextgen
def process(args):
    """Look up args.name on multitasks and call it with the prepared arguments.

    With args.moreargs or args.raw set, the argument list is args.argfile
    plus args.moreargs and no output path is chosen. Otherwise args.argfile
    is parsed as YAML, run through config_utils.merge_resources, and the
    output path is args.outfile, or args.argfile with "-out" placed before
    the extension when args.outfile is not set.

    A leading "cwl" argument switches to CWL handling: the remaining
    arguments go through _world_from_cwl and output is redirected to
    cwl.output.json inside the work directory.

    The call happens with the work directory as the current directory; the
    work directory is the directory part of args.argfile in the YAML case
    and the current directory otherwise. Exceptions from the call, or from
    writing the output with _write_out_argfile, are logged and re-raised.

    Raises:
        AttributeError: when multitasks does not expose args.name.
    """
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError(
            "Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)

    passed_directly = args.moreargs or args.raw
    if passed_directly:
        fnargs = [args.argfile] + args.moreargs
        work_dir, argfile = None, None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        if args.outfile:
            argfile = args.outfile
        else:
            argfile = "%s-out%s" % os.path.splitext(args.argfile)

    if not work_dir:
        # Covers both the direct-argument case and an argfile given without a directory part.
        work_dir = os.getcwd()

    is_cwl = len(fnargs) > 0 and fnargs[0] == "cwl"
    if is_cwl:
        fnargs, parallel = _world_from_cwl(fnargs[1:], work_dir)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel = None

    with utils.chdir(work_dir):
        log.setup_local_logging(parallel={"wrapper": "runfn"})
        try:
            out = fn(fnargs)
        except:
            logger.exception()
            raise

    if not argfile:
        return
    try:
        _write_out_argfile(argfile, out, fnargs, parallel, work_dir)
    except:
        logger.exception()
        raise
コード例 #44
0
ファイル: system.py プロジェクト: jmchilton/bcbio-nextgen
def _get_machine_info(parallel, sys_config, dirs, config):
    """Get machine resource information from the job scheduler via either the command line or the queue.

    When parallel names both a queue and a scheduler, and the lower-cased
    scheduler has a query function registered below, that function is called
    with the queue name and its result is returned. If the query raises, or
    no function is registered for the scheduler, this falls back to running
    the "machine_info" task through prun.start and returns that result.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
            "torque": _torque_info,
            "sge": _sge_info,
        }
        scheduler = parallel["scheduler"].lower()
        if scheduler in sched_info_dict:
            try:
                return sched_info_dict[scheduler](parallel.get("queue", ""))
            except Exception:
                # Best effort: on an ordinary failure, log and fall through to
                # the queue. KeyboardInterrupt/SystemExit are not caught here,
                # so an interrupt does not go on to submit a job.
                logger.exception("Couldn't get machine information from resource query function for queue "
                                 "'{0}' on scheduler \"{1}\"; "
                                 "submitting job to queue".format(parallel.get("queue", ""), parallel["scheduler"]))
        else:
            logger.info("Resource query function not implemented for scheduler \"{0}\"; "
                         "submitting job to queue".format(parallel["scheduler"]))
    # Imported at call time rather than module level, as in the original.
    from bcbio.distributed import prun
    with prun.start(parallel, [[sys_config]], config, dirs) as run_parallel:
        return run_parallel("machine_info", [[sys_config]])
コード例 #45
0
def stop(view):
    """Stop the IPython cluster reached through view, logging any failure.

    Calls ipython_cluster.stop_from_view(view) and then sleeps for 10
    seconds (presumably to let the cluster shut down -- confirm). Anything
    raised along the way is logged and not re-raised.
    """
    shutdown_wait = 10  # seconds
    try:
        ipython_cluster.stop_from_view(view)
        time.sleep(shutdown_wait)
    except:
        logger.exception("Did not stop IPython cluster correctly")
コード例 #46
0
ファイル: ipython.py プロジェクト: dh10/bcbio-nextgen
def stop(view):
    """Ask the IPython cluster behind view to stop, then wait briefly.

    ipython_cluster.stop_from_view(view) is followed by a 10 second sleep;
    the reason for the pause is not visible here (NOTE(review): likely
    shutdown grace time -- confirm). Errors are logged, never propagated.
    """
    pause_seconds = 10
    try:
        ipython_cluster.stop_from_view(view)
        time.sleep(pause_seconds)
    except:
        logger.exception("Did not stop IPython cluster correctly")