Example #1
def bgzip_and_index(in_file, config=None, remove_orig=True, prep_cmd="", tabix_args=None, out_dir=None):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    if config is None:
        config = {}
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if out_dir:
        remove_orig = False
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if (not utils.file_exists(out_file) or not os.path.lexists(out_file)
          or (utils.file_exists(in_file) and not utils.file_uptodate(out_file, in_file))):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        assert os.path.exists(in_file), "Input file %s not found" % in_file
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(config, out_file) as tx_out_file:
                bgzip = tools.get_bgzip_cmd(config)
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                if prep_cmd:
                    prep_cmd = "| %s " % prep_cmd
                cmd = "{cat_cmd} {in_file} {prep_cmd} | {bgzip} -c > {tx_out_file}"
                try:
                    do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
                except subprocess.CalledProcessError:
                    # Race conditions: ignore errors where file has been deleted by another
                    if os.path.exists(in_file) and not os.path.exists(out_file):
                        raise
            if remove_orig:
                try:
                    os.remove(in_file)
                except OSError:  # Handle cases where run in parallel and file has been deleted
                    pass
    tabix_index(out_file, config, tabix_args=tabix_args)
    return out_file
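For context, the shell pattern this helper wraps is just bgzip followed by tabix. A minimal standalone sketch, assuming `bgzip` and `tabix` (htslib) are on the PATH and the input is an uncompressed VCF; the function name and path handling here are illustrative, not bcbio's API:

import subprocess

def bgzip_and_tabix(vcf_path):
    """Minimal bgzip + tabix indexing, without bcbio's transaction handling (sketch)."""
    gz_path = vcf_path + ".gz"
    # bgzip writes block-gzipped output that tabix can index
    with open(gz_path, "wb") as out_handle:
        subprocess.check_call(["bgzip", "-c", vcf_path], stdout=out_handle)
    # -p vcf selects the VCF preset (sequence name and position columns)
    subprocess.check_call(["tabix", "-p", "vcf", gz_path])
    return gz_path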
Example #2
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [config_utils.get_program("bwa", config), "aln",
              "-n 2", "-k 2"]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)
    do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)
Example #3
def _run_on_chrom(chrom, work_bams, names, work_dir, items):
    """Run cn.mops on work BAMs for a specific chromosome.
    """
    local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                 "lib", "R", "site-library")
    batch = sshared.get_cur_batch(items)
    ext = "-%s-cnv" % batch if batch else "-cnv"
    out_file = os.path.join(work_dir, "%s%s-%s.bed" % (os.path.splitext(os.path.basename(work_bams[0]))[0],
                                                       ext, chrom if chrom else "all"))
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(out_file)[0]
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(prep_str=_prep_load_script(work_bams, names, chrom, items),
                                                out_file=tx_out_file,
                                                local_sitelib=local_sitelib))
            rscript = config_utils.get_program("Rscript", items[0]["config"])
            try:
                do.run([rscript, rcode], "cn.mops CNV detection", items[0], log_error=False)
            except subprocess.CalledProcessError as msg:
                # cn.mops errors out if no CNVs found. Just write an empty file.
                if _allowed_cnmops_errorstates(str(msg)):
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write('track name=empty description="No CNVs found"\n')
                else:
                    logger.exception()
                    raise
Example #4
def _run_delly(bam_files, chrom, sv_type, ref_file, work_dir, items):
    """Run delly, calling structural variations for the specified type.
    """
    out_file = os.path.join(work_dir, "%s-svs%s-%s.vcf"
                            % (os.path.splitext(os.path.basename(bam_files[0]))[0], sv_type, chrom))
    cores = min(utils.get_in(items[0], ("config", "algorithm", "num_cores"), 1),
                len(bam_files))
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            if not _has_variant_regions(items, out_file, chrom):
                vcfutils.write_empty_vcf(tx_out_file)
            else:
                exclude = ["-x", prepare_exclude_file(items, out_file, chrom)]
                cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", tx_out_file] + exclude + bam_files
                multi_cmd = "export OMP_NUM_THREADS=%s && " % cores
                try:
                    do.run(multi_cmd + " ".join(cmd), "delly structural variant")
                    # Delly will write nothing if no variants found
                    if not utils.file_exists(tx_out_file):
                        vcfutils.write_empty_vcf(tx_out_file)
                except subprocess.CalledProcessError as msg:
                    # delly returns an error exit code if there are no variants
                    if "No structural variants found" in str(msg):
                        vcfutils.write_empty_vcf(tx_out_file)
                    else:
                        raise
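Stripped of the transaction and empty-VCF handling, the delly call above reduces to a single command with OMP_NUM_THREADS controlling threading. A hedged sketch using the delly 0.x style command line shown in this example (newer delly releases use subcommands instead of -t); names are illustrative:

import os
import subprocess

def run_delly_sketch(sv_type, ref_file, bam_files, out_vcf, cores=1):
    """Call one structural variant type with delly (illustrative sketch only)."""
    env = dict(os.environ, OMP_NUM_THREADS=str(cores))  # delly parallelizes via OpenMP
    cmd = ["delly", "-t", sv_type, "-g", ref_file, "-o", out_vcf] + list(bam_files)
    subprocess.check_call(cmd, env=env)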
Example #5
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
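The five steps listed in the docstring form a single shell pipeline. A simplified, hedged sketch of that pipeline (samtools, bedtools and bwa assumed on the PATH; the read group string, thread count and the single interleaved FASTQ stream are illustrative simplifications of the command built above, not the exact bcbio invocation):

import subprocess

def bam_realign_sketch(in_bam, ref_file, out_bam, cores=1, rg=r"@RG\tID:1\tSM:sample"):
    """Queryname sort -> interleaved FASTQ -> bwa mem -> coordinate-sorted BAM (sketch)."""
    cmd = ("samtools sort -n -O bam -@ {cores} {in_bam} | "
           # name-sorted input keeps mates adjacent, so a single -fq stream stays interleaved
           "bedtools bamtofastq -i /dev/stdin -fq /dev/stdout | "
           "bwa mem -p -t {cores} -R '{rg}' {ref_file} - | "  # -p: interleaved paired reads
           "samtools sort -@ {cores} -O bam -o {out_bam} -").format(
               cores=cores, in_bam=in_bam, rg=rg, ref_file=ref_file, out_bam=out_bam)
    subprocess.check_call(cmd, shell=True)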
Example #6
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
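Outside of the temporary-directory and report-moving logic, the FastQC call itself is short. A minimal sketch, assuming `fastqc` is on the PATH; the directory layout and the --extract flag (to get an unpacked report directory) are illustrative:

import os
import subprocess

def run_fastqc_sketch(bam_file, out_dir, cores=1):
    """Run FastQC on a BAM file and return the unpacked report directory (sketch)."""
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    subprocess.check_call(["fastqc", "--extract", "-t", str(cores), "-f", "bam",
                           "-o", out_dir, bam_file])
    # with --extract, FastQC unpacks a <input>_fastqc directory next to the zip/html
    name = os.path.splitext(os.path.basename(bam_file))[0]
    return os.path.join(out_dir, "%s_fastqc" % name)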
Example #7
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
          names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file
Example #8
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. "
                   % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
Example #9
def _run_kraken(data, ratio):
    """Run kraken, generating a report in the specified directory and parsing metrics.
       Uses only the first reads of each pair.
    """
    logger.info("Fraction of aligned reads is below 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data["config"]["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick",
                               "--preload", "--min-hits", "2", "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
Example #10
def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing
    """
    to_remove = ["ANN", "LOF"]
    to_remove_str = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        cur_remove = []
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    break
                elif line.startswith(to_remove_str):
                    cur_id = line.split("ID=")[-1].split(",")[0]
                    cur_remove.append("INFO/%s" % cur_id)
        with file_transaction(data, out_file) as tx_out_file:
            if cur_remove:
                cur_remove = ",".join(cur_remove)
                cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                       "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
            else:
                cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Create slim VCF")
    return out_file
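Separated from the header scanning and transaction handling, the bcftools part of the command above looks roughly like this. A sketch assuming `bcftools` is on the PATH; the INFO tags and file names are illustrative:

import subprocess

def strip_info_fields(in_vcf_gz, out_vcf_gz, info_ids=("ANN", "LOF")):
    """Keep PASS/unfiltered records and drop selected INFO annotations (sketch)."""
    remove = ",".join("INFO/%s" % x for x in info_ids)
    cmd = ("bcftools view -f 'PASS,.' {inp} | "
           "bcftools annotate -x {remove} -O z -o {out}").format(
               inp=in_vcf_gz, remove=remove, out=out_vcf_gz)
    subprocess.check_call(cmd, shell=True)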
Example #11
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """
    Run macs2 for chip and input samples, avoiding sample-related errors.
    """
    # the output file name needs to include the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("The %s genome doesn't have a pre-set genome size value. "
                         "You can add a specific value using the resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check the ChIP-seq configuration in the "
                         "bcbio-nextgen documentation." % genome_build)

    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please check the message and report the "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "by setting resources as explained in the docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
Example #12
def bgzip_and_index(in_file, config, remove_orig=True, prep_cmd=""):
    """bgzip and tabix index an input file, handling VCF and BED.
    """
    out_file = in_file if in_file.endswith(".gz") else in_file + ".gz"
    if not utils.file_exists(out_file) or not os.path.lexists(out_file):
        assert not in_file == out_file, "Input file is bgzipped but not found: %s" % in_file
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            if prep_cmd:
                cmd = "cat {in_file} | {prep_cmd} | {bgzip} -c > {tx_out_file}"
            else:
                cmd = "{bgzip} -c {in_file} > {tx_out_file}"
            try:
                do.run(cmd.format(**locals()), "bgzip %s" % os.path.basename(in_file))
            except subprocess.CalledProcessError:
                # Race conditions: ignore errors where file has been deleted by another
                if os.path.exists(in_file) and not os.path.exists(out_file):
                    raise
        if remove_orig:
            try:
                os.remove(in_file)
            except OSError:  # Handle cases where run in parallel and file has been deleted
                pass
    tabix_index(out_file, config)
    return out_file
Example #13
def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]
Example #14
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    config = data["config"]
    if in_file:
        bedtools = config_utils.get_program("bedtools", config,
                                            default="bedtools")
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
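The merge itself is a single bedtools call. A sketch assuming `bedtools` is on the PATH and the input BED is coordinate-sorted (bedtools merge requires sorted input); names are illustrative:

import subprocess

def merge_bed(in_bed, out_bed, distance=None):
    """Collapse overlapping (or nearby, with -d) intervals in a sorted BED file (sketch)."""
    dist_opt = "-d %s" % distance if distance else ""
    cmd = "bedtools merge {dist} -i {inp} > {out}".format(dist=dist_opt, inp=in_bed, out=out_bed)
    subprocess.check_call(cmd, shell=True)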
Example #15
def _bgzip_file(in_file, config, work_dir, needs_bgzip, needs_gunzip, needs_convert):
    """Handle bgzip of input file, potentially gunzipping an existing file.
    """
    out_file = os.path.join(work_dir, os.path.basename(in_file) +
                            (".gz" if not in_file.endswith(".gz") else ""))
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            bgzip = tools.get_bgzip_cmd(config)
            is_remote = objectstore.is_remote(in_file)
            in_file = objectstore.cl_input(in_file, unpack=needs_gunzip or needs_convert or needs_bgzip)
            if needs_convert:
                in_file = fastq_convert_pipe_cl(in_file, {"config": config})
            if needs_gunzip and not needs_convert:
                gunzip_cmd = "gunzip -c {in_file} |".format(**locals())
                bgzip_in = "/dev/stdin"
            else:
                gunzip_cmd = ""
                bgzip_in = in_file
            if needs_bgzip:
                do.run("{gunzip_cmd} {bgzip} -c {bgzip_in} > {tx_out_file}".format(**locals()),
                       "bgzip input file")
            elif is_remote:
                bgzip = "| bgzip -c" if needs_convert else ""
                do.run("cat {in_file} {bgzip} > {tx_out_file}".format(**locals()), "Get remote input")
            else:
                raise ValueError("Unexpected inputs: %s %s %s %s" % (in_file, needs_bgzip,
                                                                     needs_gunzip, needs_convert))
    return out_file
Example #16
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Example #17
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
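The calling pipeline in this example is FreeBayes followed by a chain of vcflib tools. A condensed sketch, assuming freebayes, vcffilter, vcfallelicprimitives, vcfstreamsort and bgzip are on the PATH; the options shown mirror the command above but omit bcbio's region and configuration handling:

import subprocess

def freebayes_call_sketch(ref_file, bam_files, out_vcf_gz, min_qual=5):
    """Call variants, drop very low quality records, split complex alleles, sort, bgzip (sketch)."""
    bams = " ".join("-b %s" % b for b in bam_files)
    cmd = ("freebayes -f {ref} {bams} | "
           "vcffilter -f 'QUAL > {q}' -s | "
           "vcfallelicprimitives | vcfstreamsort | "
           "bgzip -c > {out}").format(ref=ref_file, bams=bams, q=min_qual, out=out_vcf_gz)
    subprocess.check_call(cmd, shell=True)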
Example #18
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants", "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                   "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
                    "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
                    "--minVarFreq", "0.0"]
            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example #19
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of an input BAM file; uses unix pipes to avoid disk IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #20
def _piped_input_cl(data, region, tmp_dir, out_base_file, prep_params):
    """Retrieve the command line for streaming input into the preparation step.
    If marking duplicates, this requires writing an intermediate file since
    MarkDuplicates makes multiple passes over an input.
    """
    broad_runner = broad.runner_from_config(data["config"])
    cl = _gatk_extract_reads_cl(data, region, prep_params, tmp_dir)
    if prep_params["dup"] == "picard":
        sel_file = "%s-select%s" % os.path.splitext(out_base_file)
        if not utils.file_exists(sel_file):
            with file_transaction(sel_file) as tx_out_file:
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK: PrintReads {0}".format(region), data)
        dup_metrics = "%s-dup.dup_metrics" % os.path.splitext(out_base_file)[0]
        compression = "5" if prep_params["realign"] == "gatk" else "0"
        cl = broad_runner.cl_picard("MarkDuplicates",
                                    [("INPUT", sel_file),
                                     ("OUTPUT", "/dev/stdout"),
                                     ("METRICS_FILE", dup_metrics),
                                     ("PROGRAM_RECORD_ID", "null"),
                                     ("COMPRESSION_LEVEL", compression),
                                     ("TMP_DIR", tmp_dir)])
    elif not prep_params["dup"]:
        sel_file = data["work_bam"]
    else:
        raise ValueError("Duplication approach not supported with GATK: %s" % prep_params["dup"])
    broad_runner.run_fn("picard_index", sel_file)
    return sel_file, " ".join(cl)
Example #21
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + [
        "--sample",
        dd.get_sample_name(data),
        "--reference",
        dd.get_ref_file(data),
        "--bam",
        dd.get_align_bam(data),
        "--outdir",
        work_dir,
    ]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")
            )
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
Example #22
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts: takes a BAM file,
    converts it to bigWig and then uses that.
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
Example #23
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
Example #24
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Example #25
def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree

    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
        pcf_file = out_file + ".pcf"
        if not utils.file_exists(pcf_file):
            with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
                r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
                with open(r_file, "w") as out_handle:
                    out_handle.write(_amber_seg_script)
                cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file,
                                                          out_file, pcf_file)
                do.run(cmd, "PURPLE: AMBER baf segmentation")
    else:
        assert method == "pon"
        out_file = _run_amber(paired, work_dir)
    return out_file
Example #26
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                        extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
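The soft-filtering step reduces to one bcftools filter call. A minimal sketch, assuming bcftools and bgzip are on the PATH; the expression and filter name are illustrative, taken from the docstring example:

import subprocess

def soft_filter_vcf(in_vcf, out_vcf_gz, expression="%QUAL < 20 || DP < 4", name="lowqual"):
    """Mark failing records in the FILTER column instead of removing them (sketch)."""
    cmd = ("bcftools filter -O v --soft-filter '{name}' -e '{expr}' -m '+' {inp} "
           "| bgzip -c > {out}").format(name=name, expr=expression, inp=in_vcf, out=out_vcf_gz)
    subprocess.check_call(cmd, shell=True)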
Example #27
    def run_gatk(self, params, tmp_dir=None, log_error=True,
                 data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False):
        """Top level interface to running a GATK command.

        ld_preload injects required libraries for Java JNI calls:
        https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow
        """
        needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
        # For old Java requirements use global java 7
        if needs_java7:
            setpath.remove_bcbiopath()
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
            atype_index = params.index("-T") if params.count("-T") > 0 \
                          else params.index("--analysis_type")
            prog = params[atype_index + 1]
            cl = fix_missing_spark_user(cl, prog, params)
            if ld_preload:
                cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl)
            do.run(cl, "GATK: {0}".format(prog), data, region=region,
                   log_error=log_error)
        if needs_java7:
            setpath.prepend_bcbiopath()
Example #28
def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows so use integrated counting solution
    directly rather than converting from CNVkit calculations. If this approach
    is useful, it should be moved upstream to be available to other tools as
    an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["COBALT"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-reference", paired.normal_name, "-reference_bam", paired.normal_bam,
                   "-tumor", paired.tumor_name, "-tumor_bam", paired.tumor_bam,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join([str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file
Example #29
    def run_mutect(self, params, tmp_dir=None):
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_mutect(params, tmp_dir)
            prog = "MuTect"
            do.run(cl, "MuTect: {0}".format(prog), None)
Example #30
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = filter(lambda x: x is not None,
                            list(set([tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items])))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
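The removal itself is a single bedtools subtract once the high depth and blacklist BEDs have been merged. A sketch assuming `bedtools` is on the PATH; file names are illustrative:

import subprocess

def subtract_regions(in_bed, remove_bed, out_bed):
    """Remove the portions of in_bed intervals that overlap remove_bed (sketch)."""
    cmd = "bedtools subtract -nonamecheck -a {a} -b {b} > {out}".format(
        a=in_bed, b=remove_bed, out=out_bed)
    subprocess.check_call(cmd, shell=True)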
Example #31
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(config_utils.get_program("tophat",
                                                                config))
            ready_options = {}
            for k, v in options.iteritems():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = "%s %s" % (sys.executable, str(tophat_ready.bake(*files)))
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Example #32
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which uses a single HTML file, and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                  if data.get("analysis", "").lower()
                  not in ["standard", "smallrna-seq"] else None)
        bam_file = ds_bam if ds_bam else bam_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [
                    config_utils.get_program("fastqc", data["config"]), "-d",
                    tx_tmp_dir, "-t",
                    str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt,
                    bam_file
                ]
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir,
                                             "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(
                        tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                            open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(
                                line.replace(os.path.basename(bam_file),
                                             fastqc_clean_name))
                    shutil.move(
                        os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                        os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move(
                            "%s.zip" % tx_fastqc_out,
                            os.path.join(fastqc_out,
                                         "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
Example #33
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (for parsing additional data) picks up the sample name from that directory, as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = utils.local_path_export()
            cmd = (
                "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                "--skip-duplicated --skip-dup-mode 0 "
                "-nt {num_cores} --java-mem-size={max_mem} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [
                None, False, "None"
            ] else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(
                    bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()),
                   "Qualimap: %s" % dd.get_sample_name(data),
                   env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir,
                                           "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file)
    }
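Without the downsampling, species detection and renaming logic, the core Qualimap call is short. A sketch, assuming `qualimap` is on the PATH; unsetting DISPLAY (for headless runs), memory and core count mirror the command above, other options are omitted:

import subprocess

def qualimap_bamqc_sketch(bam_file, out_dir, cores=1, mem="1G"):
    """Run Qualimap bamqc for basic alignment quality metrics (sketch)."""
    cmd = ("unset DISPLAY && qualimap bamqc -bam {bam} -outdir {out} "
           "-nt {cores} --java-mem-size={mem}").format(bam=bam_file, out=out_dir,
                                                       cores=cores, mem=mem)
    subprocess.check_call(cmd, shell=True)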
Example #34
def _run_bwa_align(fastq_file, ref_file, out_file, config):
    aln_cl = [config_utils.get_program("bwa", config), "aln", "-n 2", "-k 2"]
    aln_cl += _bwa_args_from_config(config)
    aln_cl += [ref_file, fastq_file]
    cmd = "{cl} > {out_file}".format(cl=" ".join(aln_cl), out_file=out_file)
    do.run(cmd, "bwa aln: {f}".format(f=os.path.basename(fastq_file)), None)
Example #35
def _compres_bdg_files(out_dir):
    """Compress all bedGraph (.bdg) files in out_dir with gzip."""
    for fn in glob.glob(os.path.join(out_dir, "*bdg")):
        cmd = "gzip %s" % fn
        do.run(cmd, "compress bdg file: %s" % fn)
Beispiel #36
0
def run(bam_file, data, out_dir):
    """Run viral QC analysis:
       1. Extract the unmapped reads
       2. BWA-MEM to the viral sequences from GDC database https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files
       3. Report viruses that are in more than 50% covered by at least 5x
    """
    source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
    viral_target = "gdc-viral"
    out = {}
    viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
    if viral_refs and utils.file_exists(viral_refs[0]):
        viral_ref = viral_refs[0]
        viral_bam = os.path.join(
            utils.safe_makedir(out_dir),
            "%s-%s.bam" % (dd.get_sample_name(data),
                           utils.splitext_plus(os.path.basename(viral_ref))[0]))
        out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
        cores = dd.get_num_cores(data)
        if not utils.file_uptodate(out_file, bam_file):
            if not utils.file_uptodate(viral_bam, bam_file):
                with file_transaction(data, viral_bam) as tx_out_file:
                    tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                    cmd = ("samtools view -u -f 4 {bam_file} | "
                            "bamtofastq collate=0 | "
                            "bwa mem -t {cores} {viral_ref} - | "
                            "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                            "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                    do.run(cmd.format(**locals()), "Align unmapped reads to viral genome")

            total_reads = _count_reads(bam_file)
            assert total_reads > 0, 'Reads count is {total_reads}, is there a bug in counting the read count? {bam_file}'.format(**locals())
            with file_transaction(data, out_file) as tx_out_file:
                sample_name = dd.get_sample_name(data)
                mosdepth_prefix = os.path.splitext(viral_bam)[0]
                cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
                       "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
                       "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
                       "echo '## Sample: {sample_name}' >> {tx_out_file} && "
                       "echo '#virus\tsize\tdepth\t1x\t5x\t25x\treads\treads_pct' >> {tx_out_file} && "
                       "paste "
                       "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                       "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                       "<(samtools idxstats {viral_bam} | grep -v '*') | "
                       "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                       "sort -n -r -k 5,5 >> {tx_out_file}")
                do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
                if chromhacks.get_EBV(data):
                    ref_file = dd.get_ref_file(data)
                    work_bam = dd.get_work_bam(data)
                    ebv = chromhacks.get_EBV(data)
                    mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
                    cmd = ("mosdepth -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
                            "<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
                            "paste "
                            "<(zcat {mosdepth_prefix}.regions.bed.gz) "
                            "<(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) "
                            "<(samtools idxstats {work_bam} | grep {ebv}) | "
                            "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3 FS $15 FS $15/{total_reads}}}' | "
                            "sort -n -r -k 5,5 >> {tx_out_file}")
                    do.run(cmd.format(**locals()), "Analyse coverage of EBV")

        out["base"] = out_file
        out["secondary"] = []
    return out
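
Step 3 of the docstring, reporting viruses that are at least 50% covered at 5x, is done above with awk over mosdepth's thresholds output. A plain-Python sketch of the same calculation, assuming mosdepth was run with --thresholds 1,5,25 so the gzipped thresholds BED has columns chrom, start, end, region and one base count per threshold, could look like:

import csv
import gzip

def viruses_over_half_covered(thresholds_bed_gz, min_fraction=0.5):
    """Return (contig, size, fraction) tuples where >= min_fraction of bases reach 5x.

    Illustrative sketch only: assumes mosdepth --thresholds 1,5,25 output, i.e.
    columns chrom, start, end, region, bases_1x, bases_5x, bases_25x.
    """
    hits = []
    with gzip.open(thresholds_bed_gz, "rt") as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if not row or row[0].startswith("#"):
                continue
            size = int(row[2]) - int(row[1])
            bases_5x = int(row[5])
            if size > 0 and float(bases_5x) / size >= min_fraction:
                hits.append((row[0], size, float(bases_5x) / size))
    # most complete viral genomes first
    return sorted(hits, key=lambda x: x[2], reverse=True)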
Beispiel #37
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data,
                                         {}).iteritems():
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif isinstance(pfiles, basestring):
                pfiles = [pfiles]
            folders.extend(pfiles)
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = [_check_multiqc_input(d) for d in folders]
            input_dir = _create_list_file(input_dir)
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_dir:
                cmd = "{export_tmp} {multiqc} -f -l {input_dir} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report",
                                                     "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
        out.append(data)
    return [[d] for d in out]
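
This summary step hands MultiQC a list of collected QC paths through -l rather than positional directories, which keeps the command line manageable for large projects. A minimal stand-alone sketch of that pattern, assuming only a multiqc executable on PATH with the documented -f, -l and -o flags:

import os
import subprocess

def run_multiqc_from_list(qc_paths, out_dir):
    """Write QC file paths to a list file and run MultiQC on it.

    Sketch only: assumes a multiqc executable on PATH that supports
    -f (force), -l (treat the input as a file list) and -o (output directory).
    """
    os.makedirs(out_dir, exist_ok=True)
    list_file = os.path.join(out_dir, "list_files.txt")
    with open(list_file, "w") as out_handle:
        for path in sorted(set(qc_paths)):
            if os.path.exists(path):
                out_handle.write(path + "\n")
    subprocess.check_call(["multiqc", "-f", "-l", list_file, "-o", out_dir])
    return os.path.join(out_dir, "multiqc_report.html")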
Beispiel #38
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_jointcaller(d):
            vcinfo = variant.extract_population_vcinfo(d)
        elif dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except Exception:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            l.find("Input contains NaN, infinity or a value too large for dtype") >= 0)

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
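
The error handling above keeps only the last 100 lines of peddy's stderr in a collections.deque and downgrades the failure when a known benign error appears or when every line is the noisy "no intervals found" message. The "inspect the tail of a log for allowed errors" idea is generic; a small stand-alone sketch, with caller-supplied (hypothetical) marker strings, looks like:

import collections

def tail_has_only_allowed_errors(log_path, allowed_substrings, max_lines=100):
    """Return True when the last max_lines of the log contain only allowed errors.

    Generic sketch of the pattern above; allowed_substrings are caller-chosen
    markers of benign failures (the strings below are hypothetical examples).
    """
    tail = collections.deque(maxlen=max_lines)
    with open(log_path) as in_handle:
        for line in in_handle:
            tail.append(line)
    return bool(tail) and all(
        any(marker in line for marker in allowed_substrings) for line in tail)

# Example usage with a hypothetical benign message:
# if tail_has_only_allowed_errors("run-stderr.log", ["no intervals found for"]):
#     ...treat the failure as a skip instead of re-raising...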
Beispiel #39
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files.update(glob.glob(os.path.join(out_dir, "multiqc_data", "*")))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        cmd = ["tar", "-czvpf", tarball, os.path.join(out_dir, indir)]
                        do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
Beispiel #40
0
 def run(self, subcmd, opts, memscale=None):
     jvm_opts = get_picard_opts(self._config, memscale=memscale)
     cmd = ["export", "PATH=%s:\"$PATH\"" % utils.get_java_binpath(), "&&"] + \
           [self._cmd] + jvm_opts + [subcmd] + ["%s=%s" % (x, y) for x, y in opts] + \
           ["VALIDATION_STRINGENCY=SILENT"]
     do.run(utils.clear_java_home() + " && " + " ".join(cmd), "Picard: %s" % subcmd)
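
The wrapper above stitches JVM options, a Picard subcommand and KEY=VALUE pairs into a single shell string. The core string-building step, sketched without the bcbio helpers (get_picard_opts, utils.get_java_binpath and utils.clear_java_home are not shown here, and the defaults below are illustrative only):

def build_picard_cmd(picard_cmd, subcmd, opts, jvm_opts=("-Xms500m", "-Xmx2g")):
    """Assemble a Picard-style command line with KEY=VALUE options.

    Sketch only: the VALIDATION_STRINGENCY default mirrors the wrapper above,
    while jvm_opts are illustrative; real settings come from the configuration.
    """
    parts = [picard_cmd] + list(jvm_opts) + [subcmd]
    parts += ["%s=%s" % (key, value) for key, value in opts]
    parts += ["VALIDATION_STRINGENCY=SILENT"]
    return " ".join(parts)

# build_picard_cmd("picard", "MarkDuplicates",
#                  [("INPUT", "sample.bam"), ("OUTPUT", "sample-dedup.bam"),
#                   ("METRICS_FILE", "sample-dedup.metrics")])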
Beispiel #41
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                     0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "{freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file
Beispiel #42
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" %
                 dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options were found in the YAML file. "
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "cutadapt with this %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
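
Conceptually, the 3' adapter trimming done here by atropos cuts each read at the adapter sequence and drops reads that become too short (the -m 17 above). The toy sketch below shows only the exact-match idea and is not a substitute for atropos/cutadapt, which handle mismatches and partial adapter hits at the read end:

def trim_3prime_adapter(seq, adapter, min_len=17):
    """Cut seq at the first exact adapter occurrence; return None if too short.

    Toy illustration of 3' adapter trimming; real trimmers allow mismatches
    and partial matches at the end of the read.
    """
    pos = seq.find(adapter)
    trimmed = seq if pos == -1 else seq[:pos]
    return trimmed if len(trimmed) >= min_len else None

# trim_3prime_adapter("ACGTACGTACGTACGTACGTTGGAATTCTCGG", "TGGAATTCTCGG")
# -> "ACGTACGTACGTACGTACGT"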
Beispiel #43
0
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(
        work_dir, "%s%s-prep.vcf.gz" %
        (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(
                _prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                     os.path.dirname(tx_out_file)))
            std_excludes = [
                "~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"
            ]

            def _is_std_exclude(n):
                clean_excludes = [
                    x.replace("~", "").replace("^", "") for x in std_excludes
                ]
                return any(
                    [n.startswith(x) or n.endswith(x) for x in clean_excludes])

            exclude_chrs = [
                c.name for c in ref.file_contigs(ref_file)
                if not chromhacks.is_nonalt(c.name)
                and not _is_std_exclude(c.name)
            ]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes +
                                                             exclude_chrs)
            exclude_bed = (
                "--exclude %s" %
                sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tempdir} && "
                "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                "--name {name} --outdir {out_dir} "
                "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling",
                           items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(
                            tx_out_file,
                            config=items[0]["config"],
                            samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception("smoove lumpy calling failed")
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Beispiel #44
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(
                data["genome_build"],
                tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data,
                                  1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"],
                                     data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {
                        "loftee": _get_loftee,
                        "maxentscan": _get_maxentscan,
                        "genesplicer": _get_genesplicer,
                        "spliceregion": _get_spliceregion
                    }
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                        or tz.get_in(
                            ("config", "algorithm", "clinical_reporting"),
                            data)):
                    config_args += ["--pick"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_exac",
                       "--pubmed", "--variant_class"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (
                    perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Beispiel #45
0
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info(
            "peddy is not installed, the sample is not human, or no matching sample "
            "VCFs were found; skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        sites_str = "--sites hg38" if dd.get_genome_build(
            data) == "hg38" else ""
        cmd = (
            "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
            "{vcf_file} {ped_file} 2> {stderr_log}")
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except Exception:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)

            def allowed_errors(l):
                return ((l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0))

            def all_line_errors(l):
                return (l.find("no intervals found for") >= 0)

            if any([allowed_errors(l) for l in to_show]) or all(
                [all_line_errors(l) for l in to_show]):
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s"
                    % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping"
                    )
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Beispiel #46
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(
                vrs, region, out_file, items=items, do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                       """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                       "| bcftools filter -i 'QUAL >= 0' "
                       "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    return out_file
Beispiel #47
0
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels (average coverage > 5000), require at least 50 reads supporting a variant
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Beispiel #48
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(
            vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), (
            "Did not find rtg indexed reference file for validation:\n%s\n"
            "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(
            resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]), {
                "algorithm": {
                    "memory_adjust": {
                        "magnitude": threads,
                        "direction": "increase"
                    }
                }
            })
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = [
            "rtg", "vcfeval", "--threads",
            str(threads), "-b", rm_file, "--bed-regions", interval_bed, "-c",
            vrn_file, "-t", rtg_ref, "-o", out_dir
        ]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += [
            "--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))
        ]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {
        "fp": os.path.join(out_dir, "fp.vcf.gz"),
        "fn": os.path.join(out_dir, "fn.vcf.gz")
    }
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
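
The returned dictionary points at rtg vcfeval's false positive, false negative and true positive VCFs. As a worked example of how those feed into validation metrics, precision = TP / (TP + FP) and recall = TP / (TP + FN); the sketch below simply counts non-header records in the (gzip-readable) bgzipped VCFs, whereas rtg itself also reports these numbers in its own summary output:

import gzip

def count_vcf_records(vcf_gz):
    """Count non-header records in a bgzip/gzip-compressed VCF."""
    with gzip.open(vcf_gz, "rt") as handle:
        return sum(1 for line in handle if line.strip() and not line.startswith("#"))

def precision_recall(out):
    """Compute precision/recall from the {'tp': ..., 'fp': ..., 'fn': ...} dict above."""
    tp = count_vcf_records(out["tp"])
    fp = count_vcf_records(out["fp"])
    fn = count_vcf_records(out["fn"])
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return {"precision": precision, "recall": recall}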
Beispiel #49
0
def consensus(peakfiles, consensusfile, data):
    """call consensus peaks from a set of narrow/broad peakfiles
    we use this method:
    https://bedops.readthedocs.io/en/latest/content/usage-examples/master-list.html
    """
    if utils.file_exists(consensusfile):
        return consensusfile

    try:
        bedops = config_utils.get_program("bedops", data)
    except config_utils.CmdNotFound:
        logger.info("bedops not found, skipping consensus peak calling. do a "
                    "--tools update to install bedops.")
        return None
    try:
        sortbed = config_utils.get_program("sort-bed", data)
    except config_utils.CmdNotFound:
        logger.info("sort-bed not found, skipping consensus peak calling. do "
                    "--tools update to install sort-bed.")
        return None
    try:
        bedmap = config_utils.get_program("bedmap", data)
    except config_utils.CmdNotFound:
        logger.info("bedmap not found, skipping consensus peak calling. do a "
                    "--tools update to install bedmap.")
        return None

    logger.info(f"Calling consensus peaks on {','.join(peakfiles)}")
    logger.info(f"Removing low quality peaks from {','.join(peakfiles)}")
    filteredpeaks = []
    for fn in peakfiles:
        filteredpeak = NamedTemporaryFile(suffix=".bed", delete=False).name
        df = remove_low_quality_peaks(fn, qval=0.05)
        df.to_csv(filteredpeak, index=False, header=False, sep="\t")
        filteredpeaks.append(filteredpeak)
    peakfiles = filteredpeaks

    with file_transaction(consensusfile) as tx_consensus_file:
        with utils.tmpfile(suffix=".bed") as tmpbed:
            message = f"Move all peaks in {' '.join(peakfiles)} to a single file."
            mergepeakscmd = f"{bedops} -u {' '.join(peakfiles)} > {tmpbed}"
            do.run(mergepeakscmd, message)
            iteration = 0
            while os.path.getsize(tmpbed):
                iteration = iteration + 1
                iterationbed = NamedTemporaryFile(suffix=".bed",
                                                  delete=False).name
                with utils.tmpfile(suffix="bed") as mergedbed, \
                     utils.tmpfile(suffix="bed") as intermediatebed, \
                     utils.tmpfile(suffix="bed") as leftoverbed, \
                     utils.tmpfile(suffix="bed") as tmpsolutionbed:
                    mergecmd = (f"{bedops} -m --range 0:-1 {tmpbed} | "
                                f"{bedops} -u --range 0:1 - > "
                                f"{mergedbed}")
                    message = f"Merging non-overlapping peaks, iteration {iteration}."
                    do.run(mergecmd, message)
                    nitems = len(open(mergedbed).readlines())
                    message = f"Considering {nitems} peaks, choosing the highest score for overlapping peaks."
                    highscorecmd = (
                        f"{bedmap} --max-element {mergedbed} {tmpbed} |"
                        f"{sortbed} - > "
                        f"{iterationbed}")
                    do.run(highscorecmd, message)
                    message = f"Checking if there are peaks left to merge."
                    anyleftcmd = (
                        f"{bedops} -n 1 {tmpbed} {iterationbed} > {intermediatebed}"
                    )
                    do.run(anyleftcmd, message)
                    shutil.move(intermediatebed, tmpbed)
                    nitems = len(open(iterationbed).readlines())
                    message = f"Adding {nitems} peaks to consensus peaks."
                    if utils.file_exists(tx_consensus_file):
                        consensuscmd = (
                            f"{bedops} -u {tx_consensus_file} {iterationbed} > {tmpsolutionbed}"
                        )
                        do.run(consensuscmd, message)
                        shutil.move(tmpsolutionbed, tx_consensus_file)
                    else:
                        shutil.move(iterationbed, tx_consensus_file)
    return consensusfile
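
The bedops/bedmap loop implements the master-list idea: merge overlapping peaks, keep the highest-scoring peak within each merged block, and iterate on what remains. A simplified, in-memory equivalent on (chrom, start, end, score) tuples, a greedy selection of the highest-scoring peak among overlaps, is sketched below (illustrative only; it does not reproduce bedops behaviour exactly):

def consensus_peaks(peaks):
    """Greedy consensus: keep the highest-scoring peak, drop overlapping peaks, repeat.

    Simplified in-memory sketch of the master-list approach; peaks are
    (chrom, start, end, score) tuples with half-open coordinates.
    """
    def overlaps(a, b):
        return a[0] == b[0] and a[1] < b[2] and b[1] < a[2]

    remaining = sorted(peaks, key=lambda p: p[3], reverse=True)
    chosen = []
    for peak in remaining:
        if not any(overlaps(peak, kept) for kept in chosen):
            chosen.append(peak)
    return sorted(chosen, key=lambda p: (p[0], p[1]))

# consensus_peaks([("chr1", 100, 200, 50), ("chr1", 150, 250, 80), ("chr1", 400, 500, 10)])
# -> [("chr1", 150, 250, 80), ("chr1", 400, 500, 10)]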
Beispiel #50
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # For multi-sample outputs, ensure consistent order
                samples = (
                    "-s " +
                    ",".join([dd.get_sample_name(d)
                              for d in items])) if len(items) > 1 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                bcbio_py = sys.executable
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    """| {bcbio_py} -c 'from bcbio.variation import freebayes; """
                    """freebayes.call_somatic("{paired.tumor_name}", "{paired.normal_name}")' """
                    "| {fix_ambig} | bcftools view {samples} -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Beispiel #51
0
def umi_transform(data):
    """
    Transform each read by identifying its cell barcode and UMI and putting
    that information into the read name.
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
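
The early exit above peeks at the first read to decide whether the FASTQ has already been transformed. Below is a minimal sketch of that check, assuming only that umis fastqtransform prefixes read names with a UMI_ tag; it stands in for bcbio's open_fastq/is_transformed helpers and is not the project's implementation.

import gzip

def first_read_has_umi(fastq):
    """Return True if the first read name of a (possibly gzipped) FASTQ carries a UMI_ tag."""
    opener = gzip.open if fastq.endswith(".gz") else open
    with opener(fastq, "rt") as in_handle:
        name = in_handle.readline()
    return "UMI_" in name
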
Example #52
0
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory",
                                                          "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(
        work_dir, "%s%s-1.fq.gz" %
        (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = (
                    "F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                    "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join(
                [str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()),
                       "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info(
                        "bamtofastq deflate IO failure preparing %s. Retrying with single core."
                        % (bam_file))
                    needs_retry = True
                else:
                    logger.exception(str(msg))
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [
            x for x in [out_file_1, out_file_2]
            if x is not None and utils.file_exists(x)
        ]
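
The deflate-failure handling above retries the whole conversion once with a single core. A stripped-down sketch of the same pattern follows; the bash commands and the "deflate failed" error text are placeholders, and stderr is captured so the error message can be inspected.

import subprocess

def run_with_fallback(fast_cmd, safe_cmd):
    """Try the fast command; on a known transient failure, fall back to the safer one."""
    try:
        subprocess.run(["bash", "-c", fast_cmd], check=True,
                       capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        if "deflate failed" in (err.stderr or ""):
            # known transient IO failure: rerun with the more conservative command
            subprocess.run(["bash", "-c", safe_cmd], check=True)
        else:
            raise
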
Example #53
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f
    fastq_files = (" ".join([_unpack_fastq(fastq_file), _unpack_fastq(pair_file)])
                   if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd BAM_Unsorted {srna_opts} "
               "--limitOutSJcollapsed 2000000 "
               "--outSAMtype BAM Unsorted "
               "--outSAMmapqUnique 60 "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                "--chimScoreSeparation 5 ")
            if "oncofuse" in dd.get_fusion_caller(data):
                cmd += "--chimOutType Junctions "
            else:
                cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
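
_unpack_fastq above swaps STAR's readFilesCommand for bash process substitution to avoid FIFO problems on shared filesystems. Here is a hedged standalone version of that helper; note that any command built this way must be executed by bash, since plain /bin/sh does not understand the <(...) syntax.

def unpack_fastq(path):
    """Return a plain path, or a bash process substitution for gzipped input."""
    if path and path.endswith(".gz"):
        return "<(gunzip -c %s)" % path
    return path

# e.g. subprocess.run(["bash", "-c", "wc -l " + unpack_fastq("reads.fq.gz")], check=True)
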
Example #54
0
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic". """

    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.5":
        raise IOError(
            "Please install version 2.3.5 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan", config_utils.get_program("varscan", config, "dir"))

    remove_zerocoverage = "grep -v -P '\t0\t\t$'"

    # No need for names in VarScan, hence the "_"

    tumor_bam, _, normal_bam, _ = get_paired_bams(align_bams, items)

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname],
                                                ref_file,
                                                max_read_depth,
                                                config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup".format(**locals()), None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)

        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"

        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        to_combine = []

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan".format(**locals()), None, None)

        if do.file_exists(snp_file):
            to_combine.append(snp_file)

        if do.file_exists(indel_file):
            to_combine.append(indel_file)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        out_file = combine_variant_files([snp_file, indel_file],
                                         out_file,
                                         ref_file,
                                         config,
                                         region=target_regions)

        # Remove cleanup files

        for extra_file in cleanup_files:
            os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        _fix_varscan_vcf(out_file, align_bams)
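
The empty-mpileup guard above avoids hanging VarScan on zero-byte input by emitting a placeholder VCF instead. A minimal, hedged sketch of that guard is shown below; bcbio's write_empty_vcf does more (compression, sample columns), so this is illustrative only.

import os

def write_placeholder_vcf(out_file):
    """Write a header-only VCF so downstream steps see a valid, empty call set."""
    with open(out_file, "w") as out_handle:
        out_handle.write("##fileformat=VCFv4.2\n")
        out_handle.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

def any_empty(files):
    """True if any of the given files has zero size."""
    return any(os.path.getsize(f) == 0 for f in files)
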
Example #55
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + \
                       prediction_args + dbnsfp_args + loftee_args

                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):

                    # In case of clinical reporting, we need one and only one
                    # variant per gene
                    # From the VEP docs:
                    # "Pick once line of consequence data per variant,
                    # including transcript-specific columns. Consequences are
                    # chosen by the canonical, biotype status and length of the
                    # transcript, along with the ranking of the consequence
                    # type according to this table. This is the best method to
                    # use if you are interested only in one consequence per
                    # variant."

                    cmd += ["--pick"]

                    # TODO investigate hgvs reporting but requires indexing the reference file
                    # cmd += ["--hgvs", "--shift-hgvs", "--fasta", dd.get_ref_file(data)]
                perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perllib, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
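
The sed step in the VEP command collapses empty INFO entries (';;') that can break downstream parsers. A pure-Python equivalent is sketched below for illustration; it collapses runs of semicolons in non-header lines, which is slightly stronger than the single-pass sed substitution.

import re

def collapse_empty_info_fields(lines):
    """Yield VCF lines with runs of semicolons reduced to one, leaving header lines untouched."""
    for line in lines:
        if line.startswith("#"):
            yield line
        else:
            yield re.sub(";{2,}", ";", line)
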
Example #56
0
def mutect2_caller(align_bams,
                   items,
                   ref_file,
                   assoc_files,
                   region=None,
                   out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full non open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = [
                "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                "--annotation", "ClippingRankSumTest", "--annotation",
                "DepthPerSampleHC"
            ]
            if gatk_type == "gatk4":
                params += ["--reference", ref_file]
            else:
                params += ["-R", ref_file]
            for a in annotation.get_gatk_annotations(
                    items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            resources = config_utils.get_resources("mutect2",
                                                   items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params,
                                            os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(
                    out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(
                    tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file,
                                             tx_raw_file, ref_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
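
The assertion above gates MuTect2 calling on GATK 3.5 or newer. The same LooseVersion-based check, pulled out as a tiny hedged helper with illustrative names, looks like this:

from distutils.version import LooseVersion

def require_min_version(found, minimum="3.5", tool="GATK"):
    """Raise if the detected tool version is older than the required minimum."""
    if LooseVersion(found) < LooseVersion(minimum):
        raise ValueError("%s %s or newer required for MuTect2 calling, found %s"
                         % (tool, minimum, found))

# require_min_version("3.8")  # passes; require_min_version("3.4") raises
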
Example #57
0
def _run_scalpel_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports()
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(
                    _scalpel_options_from_config(items, config, out_file,
                                                 region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = (
                    "{perl_exports} && "
                    "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
                )
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(
                _scalpel_bed_file_opts(items, config, out_file, region,
                                       tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path,
                                                "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(
                    tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config,
                                      scalpel_tmp_file) as tx_indel_file:
                    cmd = (
                        "{perl_exports} && "
                        "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                        "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                        "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()),
                           "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression(
                "reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
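
The db_file check above makes the expensive scalpel-discovery step resumable: it only reruns when the somatic database is missing, clearing any half-finished work directory first. A generic sketch of that pattern, with placeholder paths and a placeholder runner, follows.

import os
import shutil

def run_if_missing(marker_path, work_dir, run_step):
    """Run the expensive step only when its marker output is absent, clearing stale state first."""
    if not os.path.exists(marker_path):
        if os.path.exists(work_dir):
            shutil.rmtree(work_dir)
        run_step()
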
Example #58
0
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(
                f"{antibody} specified, but not listed as a supported antibody. "
                f"Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your "
                "antibody should be called with narrow or broad peaks, supply 'narrow' "
                "or 'broad' as the antibody. Falling back to 'narrow' peak calling.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info(f"ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    chip_reads = sum([x.aligned for x in bam.idxstats(chip_bam, data)])
    if chip_reads == 0:
        logger.error(
            f"{chip_bam} has 0 reads. Please remove the sample and re-run")
        raise RuntimeWarning(
            f"macs2 terminated - no reads in {chip_bam}. Please remove the sample and re-run"
        )
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please, check the message and report "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
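
The zero-read guard above relies on bcbio's bam.idxstats wrapper. As a hedged alternative using only the samtools CLI: idxstats prints one line per contig with columns name, length, mapped reads and unmapped reads, so summing the third column gives the aligned read count (this requires an indexed BAM).

import subprocess

def aligned_read_count(bam_file):
    """Sum mapped read counts from samtools idxstats output."""
    out = subprocess.run(["samtools", "idxstats", bam_file], check=True,
                         capture_output=True, text=True).stdout
    return sum(int(line.split("\t")[2]) for line in out.splitlines() if line.strip())
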
Example #59
0
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.
    """
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(
        purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", dd.get_sample_name(paired.tumor_data),
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            if vrn_files:
                cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(purple_dir, f))
    out_file_export = os.path.join(
        purple_dir,
        "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data)))
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {
        "variantcaller":
        "purple",
        "call_file":
        out_file_export,
        "vrn_file":
        titancna.to_vcf(out_file_export, "PURPLE", _get_header, _export_to_vcf,
                        paired.tumor_data),
        "plot": {},
        "metrics": {}
    }
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(
            purple_dir, "plot",
            "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(
        purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data))
    with open(purity_file) as in_handle:
        header = in_handle.readline().replace("#", "").split("\t")
        vals = in_handle.readline().split("\t")
        for h, v in zip(header, vals):
            try:
                v = float(v)
            except ValueError:
                pass
            out["metrics"][h] = v
    return out
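
The purity metrics block above reads a one-row, tab-separated table with a commented header. A small standalone sketch of the same parsing is shown below, with strip handling added so trailing newlines do not end up in the keys or values.

def parse_single_row_table(path):
    """Parse a '#'-prefixed header line plus one value line into a dict, coercing numbers."""
    with open(path) as in_handle:
        header = in_handle.readline().lstrip("#").rstrip("\n").split("\t")
        vals = in_handle.readline().rstrip("\n").split("\t")
    out = {}
    for key, val in zip(header, vals):
        try:
            out[key] = float(val)
        except ValueError:
            out[key] = val
    return out
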
Example #60
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " %
           " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                "--chimScoreSeparation 5 "
                "--chimOutType WithinSAM ")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if not srna:
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
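
The is_gzipped check above decides whether to add --readFilesCommand zcat. A minimal sketch of such a check based on the gzip magic bytes rather than the file extension is given below; it is illustrative and not bcbio's utils implementation.

def looks_gzipped(path):
    """Return True if the file starts with the gzip magic bytes 0x1f 0x8b."""
    with open(path, "rb") as in_handle:
        return in_handle.read(2) == b"\x1f\x8b"
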