Example #1
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (which parses the additional data) picks up the sample name from that directory:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However,
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
Example #2
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
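The index builders throughout this listing share a memoization idiom: return early when a sentinel file left by a previous successful run exists. The pattern in general form (ensure_index and build_fn are illustrative names, not bcbio functions):

import os

def ensure_index(out_dir, sentinel, build_fn):
    # Skip the expensive build when the sentinel from a prior run is present;
    # build_fn is a caller-supplied function expected to populate out_dir.
    if os.path.exists(os.path.join(out_dir, sentinel)):
        return out_dir
    os.makedirs(out_dir, exist_ok=True)
    build_fn(out_dir)
    return out_dir

# e.g. ensure_index("index/GRCh37", "rapidx.jfhash", run_rapmap_index)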
Example #3
def _create_combined_fasta(data, out_dir):
    """
    if there are genomes to be disambiguated, create a FASTA file of
    all of the transcripts for all genomes
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
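The final `cat a.fa b.fa > combined.fa` step above can also be done portably in Python; a small equivalent using shutil.copyfileobj for the buffered byte copy:

import shutil

def concat_files(paths, out_path):
    # Byte-for-byte concatenation, same result as `cat paths... > out_path`.
    with open(out_path, "wb") as out_handle:
        for path in paths:
            with open(path, "rb") as in_handle:
                shutil.copyfileobj(in_handle, out_handle)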
Example #4
def run_vcfanno(vcf, anno_type, data):
    """
    Annotate a VCF file using vcfanno. Looks up the proper config/lua scripts
    under the `vcfanno` key in the algorithm section of the datadict,
    skipping if the files cannot be found.
    """
    UNSUPPORTED_TYPE_MESSAGE = (
        "{anno_type} is not a supported vcf annotation type with vcfanno. "
        "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(
            anno_type=anno_type,
            SUPPORTED_ANNOTATION_TYPES=SUPPORTED_ANNOTATION_TYPES))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    base = utils.splitext_plus(vcf)[0]
    out_file = base + "-" + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    basepath = annodir

    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
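run_vcfanno above derives its config paths from the reference location: a vcfanno/ directory one level above the reference, with files named <build>-<anno_type>.conf and .lua. Just that layout logic, extracted (resolve_vcfanno_paths is illustrative, not part of bcbio):

import os

def resolve_vcfanno_paths(ref_file, build, anno_type):
    # Mirror the path convention used in run_vcfanno above.
    annodir = os.path.abspath(os.path.join(os.path.dirname(ref_file), os.pardir, "vcfanno"))
    stem = os.path.join(annodir, "%s-%s" % (build, anno_type))
    return stem + ".conf", stem + ".lua"

# ('/genomes/GRCh37/vcfanno/GRCh37-gemini.conf', '/genomes/GRCh37/vcfanno/GRCh37-gemini.lua')
print(resolve_vcfanno_paths("/genomes/GRCh37/seq/GRCh37.fa", "GRCh37", "gemini"))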
Example #5
def ref_file_from_bam(bam_file, data):
    """Subset a fasta input file to only a fraction of input contigs.
    """
    new_ref = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref")),
                           "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                for contig in [x.contig for x in idxstats(bam_file, data) if x.contig != "*"]:
                    out_handle.write("%s\n" % contig)
            cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
            do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    runner = broad.runner_from_path("picard", data["config"])
    runner.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
Example #6
def _get_input_args(bam_file, data, out_base, background):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so need to generate
    a pileup for hg19 and fix chromosome naming.
    """
    if dd.get_genome_build(data) in ["hg19"]:
        return ["--PileupFile", _create_pileup(bam_file, data, out_base, background)]
    else:
        return ["--BamFile", bam_file]
Example #7
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name ))
    out_file = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir, data["config"])
    data["peaks_file"] = out_file
    return [[data]]
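get_callers() used above is a plain name-to-function dispatch table keyed by the peak_fn value carried in the sample data; a minimal sketch of the shape (the _macs2 stub here is hypothetical):

import os

def _macs2(name, chip_bam, input_bam, genome_build, out_dir, config):
    # Stub standing in for a real peak-caller wrapper; returns its output path.
    return os.path.join(out_dir, "%s_peaks.narrowPeak" % name)

def get_callers():
    # Keys match the "peak_fn" value stored in the sample data.
    return {"macs2": _macs2}

caller_fn = get_callers()["macs2"]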
Example #8
def _run_purecn(paired, work_dir):
    """Run PureCN.R wrapper with pre-segmented CNVkit or GATK4 inputs.
    """
    segfns = {"cnvkit": _segment_normalized_cnvkit, "gatk-cnv": _segment_normalized_gatk}
    out_base, out, all_files = _get_purecn_files(paired, work_dir)
    failed_file = out_base + "-failed.log"
    cnr_file = tz.get_in(["depth", "bins", "normalized"], paired.tumor_data)
    if not utils.file_uptodate(out["rds"], cnr_file) and not utils.file_exists(failed_file):
        cnr_file, seg_file = segfns[cnvkit.bin_approach(paired.tumor_data)](cnr_file, work_dir, paired)
        from bcbio import heterogeneity
        vcf_file = heterogeneity.get_variants(paired.tumor_data, include_germline=False)[0]["vrn_file"]
        vcf_file = germline.filter_to_pass_and_reject(vcf_file, paired, out_dir=work_dir)
        with file_transaction(paired.tumor_data, out_base) as tx_out_base:
            # Use UCSC style naming for human builds to support BSgenome
            genome = ("hg19" if dd.get_genome_build(paired.tumor_data) in ["GRCh37", "hg19"]
                      else dd.get_genome_build(paired.tumor_data))
            cmd = ["PureCN.R", "--seed", "42", "--out", tx_out_base, "--rds", "%s.rds" % tx_out_base,
                   "--sampleid", dd.get_sample_name(paired.tumor_data),
                   "--genome", genome,
                   "--vcf", vcf_file, "--tumor", cnr_file,
                   "--segfile", seg_file, "--funsegmentation", "Hclust", "--maxnonclonal", "0.3"]
            if dd.get_num_cores(paired.tumor_data) > 1:
                cmd += ["--cores", str(dd.get_num_cores(paired.tumor_data))]
            try:
                cmd = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(), utils.get_R_exports(),
                                                             " ".join([str(x) for x in cmd]))
                do.run(cmd, "PureCN copy number calling")
            except subprocess.CalledProcessError as msg:
                if _allowed_errors(str(msg)):
                    logger.info("PureCN failed to find solution for %s: skipping" %
                                dd.get_sample_name(paired.tumor_data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.exception("PureCN copy number calling failed")
                    raise
            for f in all_files:
                if os.path.exists(os.path.join(os.path.dirname(tx_out_base), f)):
                    shutil.move(os.path.join(os.path.dirname(tx_out_base), f),
                                os.path.join(os.path.dirname(out_base), f))
    out = _get_purecn_files(paired, work_dir, require_exist=True)[1]
    return out if (out.get("rds") and os.path.exists(out["rds"])) else None
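_run_purecn tolerates known, non-fatal tool failures by writing the error text to a -failed.log sentinel, which the guard at the top of the function honors on reruns. The control flow, condensed (run_fn and is_allowed are placeholder callables):

import subprocess

def run_with_failure_sentinel(run_fn, failed_file, is_allowed):
    # Known-benign failures are recorded so a rerun skips the step instead of
    # retrying it; anything unexpected still propagates.
    try:
        run_fn()
    except subprocess.CalledProcessError as msg:
        if is_allowed(str(msg)):
            with open(failed_file, "w") as out_handle:
                out_handle.write(str(msg))
        else:
            raise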
Example #9
def get_genome(data):
    """
    get the effective length of the genome, falling back to the length of the genome
    if the effective length is not precomputed
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    genome = dd.get_genome_build(data)
    loaded = macs2.HS
    if genome in loaded:
        return loaded[genome]
    else:
        return sum(fasta.sequence_length(dd.get_ref_file(data)).values())
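The fallback in get_genome is a dictionary lookup with a computed default: a table of precomputed effective genome sizes, else the summed contig lengths of the reference. A self-contained sketch (the sizes follow the deepTools effective genome size table and are illustrative only):

# Illustrative effective sizes (deepTools documentation values).
EFFECTIVE_SIZES = {"hg38": 2913022398, "mm10": 2652783500}

def effective_genome_size(build, contig_lengths):
    # Use the precomputed effective length when known, else sum contigs.
    return EFFECTIVE_SIZES.get(build, sum(contig_lengths.values()))

assert effective_genome_size("toy", {"chr1": 1000, "chr2": 500}) == 1500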
Example #10
def rapmap_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", data["config"])
    gtf_fa = create_combined_fasta(data, out_dir)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating RapMap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #11
def is_human(data, builds=None):
    """Check if human, optionally with build number, search by name or extra GL contigs.
    """
    def has_build37_contigs(data):
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if contig.name.startswith("GL") or contig.name.find("_gl") >= 0:
                if contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]:
                    return True
        return False
    if not builds and tz.get_in(["genome_resources", "aliases", "human"], data):
        return True
    if not builds or "37" in builds:
        target_builds = ["hg19", "GRCh37"]
        if any([dd.get_genome_build(data).startswith(b) for b in target_builds]):
            return True
        elif has_build37_contigs(data):
            return True
    if not builds or "38" in builds:
        target_builds = ["hg38"]
        if any([dd.get_genome_build(data).startswith(b) for b in target_builds]):
            return True
    return False
Example #12
def is_human(data, builds=None):
    """Check if human, optionally with build number, search by name or extra GL contigs.
    """
    def has_build37_contigs(data):
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if contig.name.startswith("GL") or contig.name.find("_gl") >= 0:
                if contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]:
                    return True
        return False
    if not builds and tz.get_in(["genome_resources", "aliases", "human"], data):
        return True
    if not builds or "37" in builds:
        target_builds = ["hg19", "GRCh37"]
        if dd.get_genome_build(data) in target_builds:
            return True
        elif has_build37_contigs(data):
            return True
    if not builds or "38" in builds:
        target_builds = ["hg38"]
        if dd.get_genome_build(data) in target_builds:
            return True
    return False
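Both is_human variants fall back to a contig-name fingerprint when the build string is uninformative: build-37 references carry unplaced GL contigs such as GL000192.1 (GRCh37) or chrUn_gl000220 (hg19). That heuristic in isolation:

def looks_like_build37(contig_names):
    # GL-style unplaced contigs are characteristic of GRCh37/hg19 references.
    return any(name.startswith("GL") or "_gl" in name for name in contig_names)

assert looks_like_build37(["1", "2", "GL000192.1"])
assert not looks_like_build37(["chr1", "chr2"])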
Example #13
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name ))
    # chip_bam = _prepare_bam(chip_bam, dd.get_variant_regions(data), data['config'])
    # input_bam = _prepare_bam(input_bam, dd.get_variant_regions(data), data['config'])
    out_file = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                         dd.get_chip_method(data), data["config"])
    data["peaks_file"] = out_file
    return [[data]]
Example #14
def run_arriba(data):
    build = dd.get_genome_build(data)
    if build not in SUPPORTED_BUILDS:
        logger.info(f"{build} not supported for arriba, skipping.")
        return data

    arriba_dir = os.path.join(dd.get_work_dir(data), "arriba",
                              dd.get_sample_name(data))
    utils.safe_makedir(arriba_dir)
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_ref_file(data)
    gtf = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    arriba = config_utils.get_program("arriba", data)
    fusion_file = os.path.join(arriba_dir, "fusions.tsv")
    discarded_fusion_file = os.path.join(arriba_dir, "fusions.discarded.tsv")
    blacklist_file = get_arriba_blacklist_file(data)
    contigs = get_contigs(data)
    contig_list = ",".join(contigs)
    if utils.file_exists(fusion_file):
        data["arriba"] = {
            "fusions": fusion_file,
            "discarded": discarded_fusion_file
        }
        return data

    with file_transaction(fusion_file) as tx_fusion_file, \
         file_transaction(discarded_fusion_file) as tx_discarded_fusion_file:
        cmd = (
            f"{arriba} -x {bam_file} -g {gtf} -a {ref_file} -o {tx_fusion_file} "
            f"-O {tx_discarded_fusion_file} -T -P "
            f"-i {contig_list} ")
        if blacklist_file:
            logger.info(
                f"arriba blacklist file found, running blacklisting with {blacklist_file}."
            )
            cmd += f"-b {blacklist_file} "
        else:
            logger.info(
                "arriba blacklist file not found, disabling blacklist filtering."
            )
            cmd += "-f blacklist "
        if dd.get_known_fusions(data):
            cmd += (f"-k {dd.get_known_fusions(data)} ")
        message = f"Running arriba on {dd.get_sample_name(data)}."
        do.run(cmd, message)

    data["arriba"] = {
        "fusions": fusion_file,
        "discarded": discarded_fusion_file
    }
    return data
Example #15
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #16
def sailfish_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data, out_dir)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #17
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    # chip_bam = _prepare_bam(chip_bam, dd.get_variant_regions(data), data['config'])
    # input_bam = _prepare_bam(input_bam, dd.get_variant_regions(data), data['config'])
    out_file = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data),
                         out_dir, dd.get_chip_method(data), data["config"])
    data["peaks_file"] = out_file
    return [[data]]
Example #18
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    ### TODO PUT MEMOIZATION HERE
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #19
def rapmap_pseudoindex(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "pseudoindex", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #20
def get_genome(data):
    """
    get the effective length of the genome, falling back to the length of the genome
    if the effective length is not precomputed
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    genome = dd.get_genome_build(data)
    loaded = macs2.HS
    if genome in loaded:
        return loaded[genome]
    else:
        return sum(fasta.sequence_length(dd.get_ref_file(data)).values())
Example #21
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    ### TODO PUT MEMOIZATION HERE
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #22
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]),
                                             "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"],
                                            "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"],
                                             "abundance.h5")
        if os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt")):
            data["quant"]["fusion"] = os.path.join(data["kallisto_quant"],
                                                   "fusion.txt")
        else:
            data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Since this is hg38 we will fall "
                        "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Falling back to the "
                        "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]),
                                             "abundance.h5")
    return [[data]]
Example #23
def ref_file_from_bam(bam_file, data):
    """Subset a fasta input file to only a fraction of input contigs.
    """
    new_ref = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs",
                                        "ref")),
        "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                for contig in [
                        x.contig for x in idxstats(bam_file, data)
                        if x.contig != "*"
                ]:
                    out_handle.write("%s\n" % contig)
            cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data),
                                                      contig_file, tx_out_file)
            do.run(cmd,
                   "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    runner = broad.runner_from_path("picard", data["config"])
    runner.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
Example #24
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = data.get("work_bam")
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                          dd.get_chip_method(data), data["resources"], data)
    greylistdir = greylisting(data)
    data.update({"peaks_files": out_files})
    # data["input_bam_filter"] = input_bam
    if greylistdir:
        data["greylist"] = greylistdir
    return [[data]]
Example #25
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    chip_bam = _prepare_bam(chip_bam, encode_bed, data['config'])
    input_bam = _prepare_bam(input_bam, encode_bed, data['config'])
    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                          dd.get_chip_method(data), data["config"])
    data.update({"peaks_files": out_files})
    return [[data]]
Example #26
def sailfish_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = _create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #27
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = data.get("work_bam")
    input_bam = data.get("work_bam_input", None)
    caller_fn = get_callers()[data["peak_fn"]]
    name = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                          dd.get_chip_method(data), data["resources"], data)
    greylistdir = greylisting(data)
    data.update({"peaks_files": out_files})
    # data["input_bam_filter"] = input_bam
    if greylistdir:
        data["greylist"] = greylistdir
    return [[data]]
Example #28
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.
    """
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", dd.get_sample_name(paired.tumor_data),
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            if vrn_files:
                cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data)))
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "vrn_file": titancna.to_vcf(out_file_export, "PURPLE", _get_header, _export_to_vcf,
                                       paired.tumor_data),
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"), ("minor_allele", "minor_allele"), ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data))
    with open(purity_file) as in_handle:
        header = in_handle.readline().replace("#", "").split("\t")
        vals = in_handle.readline().split("\t")
        for h, v in zip(header, vals):
            try:
                v = float(v)
            except ValueError:
                pass
            out["metrics"][h] = v
    return out
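The metrics block at the end of _run_purple parses PURPLE's two-line purity table by zipping the header row against the value row and coercing numeric fields. That parsing step on its own, with toy input (parse_purity_table is an illustrative name):

def parse_purity_table(header_line, value_line):
    # Zip header keys with values; numeric columns become floats.
    header = header_line.replace("#", "").rstrip("\n").split("\t")
    vals = value_line.rstrip("\n").split("\t")
    out = {}
    for key, val in zip(header, vals):
        try:
            out[key] = float(val)
        except ValueError:
            out[key] = val
    return out

assert parse_purity_table("#purity\tploidy\tstatus\n", "0.82\t2.1\tPASS\n") == \
       {"purity": 0.82, "ploidy": 2.1, "status": "PASS"}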
Example #29
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
Example #30
def _sample_template(sample, out_dir):
    """R code to get QC for one sample"""
    bam_fn = dd.get_work_bam(sample)
    genome = dd.get_genome_build(sample)
    if genome in supported:
        peaks = sample.get("peaks_files", []).get("main")
        if peaks:
            r_code = ("library(ChIPQC);\n"
                      "sample = ChIPQCsample(\"{bam_fn}\","
                      "\"{peaks}\", "
                      "annotation = \"{genome}\","
                      ");\n"
                      "ChIPQCreport(sample);\n")
            r_code_fn = os.path.join(out_dir, "chipqc.r")
            with open(r_code_fn, 'w') as inh:
                inh.write(r_code.format(**locals()))
            return r_code_fn
Example #31
def _sample_template(sample, out_dir):
    """R code to get QC for one sample"""
    bam_fn = dd.get_work_bam(sample)
    genome = dd.get_genome_build(sample)
    if genome in supported:
        peaks = sample.get("peaks_files", []).get("main")
        if peaks:
            r_code = ("library(ChIPQC);\n"
                      "sample = ChIPQCsample(\"{bam_fn}\","
                      "\"{peaks}\", "
                      "annotation = \"{genome}\","
                      ");\n"
                      "ChIPQCreport(sample);\n")
            r_code_fn = os.path.join(out_dir, "chipqc.r")
            with open(r_code_fn, 'w') as inh:
                inh.write(r_code.format(**locals()))
            return r_code_fn
Example #32
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {"dataset": "1000g.phase3",
                  "nvars": "100k",
                  "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"}
    with file_transaction(data, out_base) as tx_out_base:
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:
            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0 or
                        l.find("No reads found in any of the regions") >= 0)
            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                            dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                raise
        else:
            # Fix any sample name problems, for pileups
            shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
            with open(tx_out_base + ".selfSM.orig") as in_handle:
                with open(tx_out_base + ".selfSM", "w") as out_handle:
                    sample_name = None
                    for line in in_handle:
                        if line.startswith("DefaultSampleName"):
                            line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                        # work around bug in finding SM from BAM RG at end of line
                        if len(line.strip().split("\t")) == 1:
                            sample_name = line.strip()
                            line = None
                        elif sample_name:
                            parts = line.split("\t")
                            parts[0] = sample_name
                            line = "\t".join(parts)
                            sample_name = None
                        if line:
                            out_handle.write(line)
            for e in exts + [".selfSM"]:
                if os.path.exists(tx_out_base + e):
                    shutil.copy(tx_out_base + e, out_base + e)
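The .selfSM rewrite at the end of _generate_estimates works around a VerifyBamID2 quirk where the sample name from the BAM read group lands on a line of its own: the placeholder name is replaced and the stray name spliced back into the following row. The same logic as a standalone generator (fix_selfsm_lines is illustrative):

def fix_selfsm_lines(lines, sample_name):
    pending = None
    for line in lines:
        if line.startswith("DefaultSampleName"):
            line = line.replace("DefaultSampleName", sample_name)
        if len(line.strip().split("\t")) == 1 and line.strip():
            # A bare one-column line is the stray sample name; hold it back.
            pending = line.strip()
            continue
        if pending:
            # Splice the held-back name into the first column of this row.
            parts = line.split("\t")
            parts[0] = pending
            line = "\t".join(parts)
            pending = None
        yield line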
Example #33
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #34
def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools"""
    cmd = ("{bam_coverage} -b {bam_input} -o {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
    size = int(get_genome(dd.get_genome_build(data)))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
Example #35
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data)
            or dd.get_fusion_mode(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Since this is hg38 we will fall "
                    "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Falling back to the "
                    "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
Example #36
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(
        os.path.abspath(
            os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir,
                         "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if conf_file in conf_checkers and not conf_checkers[conf_file](data):
            logger.warn(
                "Skipping vcfanno configuration: %s. Not all input files found."
                % conf_file)
        elif not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
Example #37
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """
    Annotate a VCF file using vcfanno. Looks up the proper config/lua scripts
    under the `vcfanno` key in the algorithm section of the datadict,
    skipping if the files cannot be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(
        vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file

    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath
                       or basepath, lua_fns)
    return out_file
Example #38
def kallisto_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index")
    out_stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    index_dir = os.path.join(out_dir, out_stem)
    out_file = os.path.join(index_dir, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #39
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #40
def kallisto_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index")
    out_stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    index_dir = os.path.join(out_dir, out_stem)
    out_file = os.path.join(index_dir, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #41
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #42
def determine_indexes_to_make(samples):
    """
    Return the subset of samples with distinct indexes, so that each index
    is only made once.
    """
    samples = [to_single_data(x) for x in samples]
    indexes = set()
    tomake = []
    for data in samples:
        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
        if dd.get_disambiguate(data):
            out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
        combined_file = out_stem + ".fa"
        if combined_file not in indexes:
            tomake.append(data)
            indexes.add(combined_file)
    return tomake
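The dedup in determine_indexes_to_make is first-wins uniqueness by a derived key (the combined FASTA path), ensuring each transcriptome index is built only once per genome/disambiguation combination. The idiom, generalized:

def unique_by_key(items, key_fn):
    # Keep the first item seen for each key so shared work happens once.
    seen = set()
    out = []
    for item in items:
        key = key_fn(item)
        if key not in seen:
            seen.add(key)
            out.append(item)
    return out

assert unique_by_key(["a.fa", "b.fa", "a.fa"], key_fn=lambda x: x) == ["a.fa", "b.fa"]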
Example #43
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        out = tz.get_in([category, dd.get_genome_build(data)], _COORDS, {})
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            if os.path.basename(priority_file).find("civic") >= 0:
                for chrom, start, end, gene in _civic_regions(priority_file, vtypes, dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif os.path.basename(priority_file).find(".bed") >= 0:
                for line in utils.open_gzipsafe(priority_file):
                    parts = line.strip().split("\t")
                    if len(parts) >= 4:
                        chrom, start, end, gene = parts[:4]
                        out[gene] = (chrom, int(start), int(end))
        yield category, out
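The BED branch of get_coords is a minimal four-column reader: chrom, start, end plus a gene name, with shorter lines skipped. Extracted for clarity (bed_regions is an illustrative name; the coordinates in the assert are just example data):

def bed_regions(lines):
    # Map gene name -> (chrom, start, end), ignoring lines with < 4 columns.
    out = {}
    for line in lines:
        parts = line.strip().split("\t")
        if len(parts) >= 4:
            chrom, start, end, gene = parts[:4]
            out[gene] = (chrom, int(start), int(end))
    return out

assert bed_regions(["chr17\t7668402\t7687550\tTP53\n", "track name=x\n"]) == \
       {"TP53": ("chr17", 7668402, 7687550)}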
Example #44
def _get_input_args(bam_file, data, out_base):
    """Retrieve input args, depending on genome build.

    VerifyBamID2 only handles GRCh37 (1, 2, 3) not hg19, so need to generate
    a pileup for hg19 and fix chromosome naming.
    """
    if dd.get_genome_build(data) in ["hg19"]:
        out_file = "%s-mpileup.txt" % out_base
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                mpileup_cl = samtools.prep_mpileup(
                    [bam_file],
                    dd.get_ref_file(data),
                    data["config"],
                    want_bcf=False,
                    target_regions=_get_autosomal_bed(data, tx_out_file))
                cl = ("{mpileup_cl} | sed 's/^chr//' > {tx_out_file}")
                do.run(cl.format(**locals()), "Create pileup from BAM input")
        return ["--PileupFile", out_file]
    else:
        return ["--BamFile", bam_file]
Example #45
def process_intervals(data):
    """Prepare intervals file"""
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file
    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("base")
    interval_file_r = utils.R_package_script("PureCN",
                                             "extdata/IntervalFile.R",
                                             env="base")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(
        data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    tools_off = dd.get_tools_off(data)
    if tools_off and "purecn_offtarget" in tools_off:
        offtarget_flag = ""
    else:
        offtarget_flag = "--off-target"
    cmd = [
        rscript, interval_file_r, "--in-file", bed_file, "--fasta", ref_file,
        "--out-file", ready_file, offtarget_flag, "--genome", genome,
        "--export", optimized_bed, "--mappability", mappability_resource
    ]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (utils.R_sitelib(
            env="base"), utils.get_R_exports(env="base"), " ".join(
                [str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as msg:
        logger.info("PureCN failed to prepare intervals")
    logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
Example #46
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None

    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file,
                                pair_file != "") as (tobam_cl, tx_out_file):
            # If a single index present, index_dir points to that
            index_file = None
            if index_dir and os.path.isfile(index_dir):
                index_dir = os.path.dirname(index_dir)
                index_file = os.path.join(
                    index_dir,
                    "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = (
                "minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                "{fastq_file} {pair_file} | ")
            do.run(
                cmd.format(**locals()) + tobam_cl,
                "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
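# A sketch of the minimap2 command the function formats, with hypothetical
# values: -x sr selects the short-read preset and -R attaches the read group
# so the sorted BAM carries sample metadata.
preset = "sr"
rg_info = r"@RG\tID:sample1\tSM:sample1"  # hypothetical read group
num_cores = 4
index_file = "hg38-sr.mmi"  # falls back to the genome FASTA when no .mmi exists
fastq_file, pair_file = "sample1_1.fq.gz", "sample1_2.fq.gz"
cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
       "{fastq_file} {pair_file}")
print(cmd.format(**locals()))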
Example #47
0
def create_combined_tx2gene(data):
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    items = disambiguate.split([data])
    tx2gene_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if file_exists(out_file):
            tx2gene_files.append(out_file)
        else:
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
            tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file

    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
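# The shell cat above just concatenates the per-genome CSVs; a self-contained
# pure-Python equivalent of that step, using in-memory stand-ins for the files:
import io

tx2gene_handles = [io.StringIO("ENST0000001,GENE1\n"),  # hypothetical contents
                   io.StringIO("ENSMUST0000001,Gene1\n")]
combined = io.StringIO()
for handle in tx2gene_handles:
    combined.write(handle.read())
print(combined.getvalue())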
Example #48
0
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not conf_files:
        conf_files = _default_conf_files(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    if any([x in dd.get_tools_on(data)
            for x in ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]]):
        if annotate_gemini(data) and "gemini" not in conf_files:
            conf_files.append("gemini")
    out = []
    annodir = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                                            os.pardir, "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
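# Configuration names resolve against the installed genome tree as
# <ref_dir>/../config/vcfanno/<name>.conf, with a matching .lua picked up
# when present. A sketch with a hypothetical installation path:
import os

ref_file = "/data/genomes/Hsapiens/hg38/seq/hg38.fa"  # hypothetical
annodir = os.path.normpath(os.path.join(os.path.dirname(ref_file),
                                        os.pardir, "config", "vcfanno"))
conffn = os.path.join(annodir, "gemini" + ".conf")
luafn = os.path.splitext(conffn)[0] + ".lua"
print(conffn)  # /data/genomes/Hsapiens/hg38/config/vcfanno/gemini.conf
print(luafn)   # /data/genomes/Hsapiens/hg38/config/vcfanno/gemini.lua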
Example #49
0
def get_coords(data):
    """Retrieve coordinates of genes of interest for prioritization.

    Can read from CIViC input data or a supplied BED file of chrom, start, end
    and gene information.
    """
    for category, vtypes in [("LOH", {"LOSS", "HETEROZYGOSITY"}),
                             ("amplification", {"AMPLIFICATION"})]:
        out = tz.get_in([category, dd.get_genome_build(data)], _COORDS, {})
        priority_file = dd.get_svprioritize(data)
        if priority_file:
            if os.path.basename(priority_file).find("civic") >= 0:
                for chrom, start, end, gene in _civic_regions(
                        priority_file, vtypes, dd.get_disease(data)):
                    out[gene] = (chrom, start, end)
            elif os.path.basename(priority_file).find(".bed") >= 0:
                for line in utils.open_gzipsafe(priority_file):
                    parts = line.strip().split("\t")
                    if len(parts) >= 4:
                        chrom, start, end, gene = parts[:4]
                        out[gene] = (chrom, int(start), int(end))
        yield category, out
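# The BED branch reduces to a four-column parse; a standalone sketch with a
# hypothetical region (coordinates illustrative, not authoritative):
out = {}
line = "chr7\t55019017\t55211628\tEGFR\n"
parts = line.strip().split("\t")
if len(parts) >= 4:
    chrom, start, end, gene = parts[:4]
    out[gene] = (chrom, int(start), int(end))
print(out)  # {'EGFR': ('chr7', 55019017, 55211628)}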
Example #50
0
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if conf_fns:
        if not anno_type:
            anno_type = "gemini"
        out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
        if not utils.file_exists(out_file):
            out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
        return vcfutils.bgzip_and_index(out_file, data["config"])
Example #51
0
def run_vcfanno(vcf, anno_type, data, data_basepath=None):
    """
    annotated a VCF file using vcfanno, looks up the proper config/lua scripts
    under the `vcfanno` key under the algorithm section of the datadict,
    skipping if the files cannot be found
    """
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conffn = os.path.join(annodir, anno_type + ".conf")
    luafn = os.path.join(annodir, anno_type + ".lua")
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf

    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file

    out_file = vcfanno(vcf, out_file, conffn, data, data_basepath or basepath, luafn)
    return out_file
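# Both run_vcfanno variants name their output by stripping the VCF's possibly
# doubled extension and appending the annotation type. A minimal stand-in for
# utils.splitext_plus (assumed behavior, not the bcbio implementation):
import os

def splitext_plus(fname):
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2"):
        extra, ext2 = os.path.splitext(base)
        base, ext = extra, ext2 + ext
    return base, ext

vcf, anno_type = "sample.vcf.gz", "gemini"  # hypothetical inputs
out_file = splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
print(out_file)  # sample-annotated-gemini.vcf.gz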
Example #52
0
def support_gemini_orig(data):
    return dd.get_genome_build(data) in {"hg19", "GRCh37"}
Example #53
0
def _merge_target_information(samples, metrics_dir):
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
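# The resulting target_info.yaml is a small flat document consumed by MultiQC;
# a hedged sketch of its shape with hypothetical values:
import yaml

info = {
    "genome_info": {"name": "hg38", "size": 3099734149},
    "variants_regions_info": {"bed": "targets.bed", "size": 64000000,
                              "regions": 214000, "genes": 19000},
    "coverage_interval": "regional",
}
print(yaml.safe_dump(info, default_flow_style=False))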
Example #55
0
def get_build_string(data):
    build_string = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        build_string = "-".join([build_string] + (dd.get_disambiguate(data) or []))
    return build_string
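# With disambiguation enabled, the build string joins the primary genome with
# the disambiguating genomes; a tiny sketch with hypothetical builds:
build_string = "hg38"
disambiguate = ["mm10"]
if disambiguate:
    build_string = "-".join([build_string] + disambiguate)
print(build_string)  # hg38-mm10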
Example #57
0
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats
    """
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(
        purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["PURPLE",
                   "-amber", os.path.dirname(het_file),
                   "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-ref_genome", "hg38" if dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor_sample", dd.get_sample_name(paired.tumor_data),
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(purple_dir, f))
    out_file_export = os.path.join(
        purple_dir,
        "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data)))
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {
        "variantcaller": "purple",
        "call_file": out_file_export,
        "plot": {},
        "metrics": {}
    }
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(
            purple_dir, "plot",
            "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(
        purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data))
    with open(purity_file) as in_handle:
        header = in_handle.readline().replace("#", "").rstrip().split("\t")
        vals = in_handle.readline().rstrip().split("\t")
        for h, v in zip(header, vals):
            try:
                v = float(v)
            except ValueError:
                pass
            out["metrics"][h] = v
    return out
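# The purity-file parsing expects a two-line, tab-separated table with a
# commented header; the same zip-and-coerce logic on hypothetical values:
import io

purity_handle = io.StringIO("#Purity\tNormFactor\tScore\n0.71\t0.88\t1.04\n")
header = purity_handle.readline().replace("#", "").rstrip().split("\t")
vals = purity_handle.readline().rstrip().split("\t")
metrics = {}
for h, v in zip(header, vals):
    try:
        v = float(v)
    except ValueError:
        pass
    metrics[h] = v
print(metrics)  # {'Purity': 0.71, 'NormFactor': 0.88, 'Score': 1.04}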