Example #1
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [
        utils.to_single_data(x) for x in validate.summarize_grading(items)
    ]
    out = {"validate": items[0]["validate"], "variants": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            cur_name = "%s-%s" % (names[0], dd.get_variantcaller(data))
            if cur_name not in added:
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     "calls")), "%s.vcf.gz" % cur_name)
                added.add(cur_name)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data["vrn_file"]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"]["calls"].append(out_file)
    return [out]
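
Everything in this section leans on bcbio's utils.copy_plus helper. As a rough mental model only (an illustrative sketch of the behavior these examples rely on, not bcbio's actual implementation; the index extension list is an assumption), it copies a file together with any companion index files sitting next to it:

import os
import shutil

def copy_plus_sketch(orig, new):
    # Hypothetical stand-in for utils.copy_plus: copy the file itself,
    # then copy any companion indexes found alongside it (assumed extensions).
    shutil.copyfile(orig, new)
    for ext in [".idx", ".gbi", ".tbi", ".bai"]:
        if os.path.exists(orig + ext):
            shutil.copyfile(orig + ext, new + ext)
    return new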
Example #2
def main(cosmic_version, bcbio_genome_dir):
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cosmic-prep"))
    os.chdir(work_dir)

    for genome_build, bcbio_build, add_chr in [("GRCh37", "GRCh37", False), ("GRCh38", "hg38", True)]:
        bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", bcbio_build)
        if not os.path.exists(bcbio_base):
            continue
        bcbio_ref = os.path.join(bcbio_base, "seq", "%s.fa" % bcbio_build)
        sorted_inputs = []
        for fname in get_cosmic_files(genome_build, cosmic_version):
            sorted_inputs.append(sort_to_ref(fname, bcbio_ref, add_chr=add_chr))
        out_dir = utils.safe_makedir(os.path.join("v%s" % cosmic_version, "bcbio_ready", bcbio_build))
        out_file = os.path.join(out_dir, "cosmic.vcf.gz")
        ready_cosmic = combine_cosmic(sorted_inputs, bcbio_ref, out_file)
        variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
        utils.copy_plus(ready_cosmic, os.path.join(variation_dir, os.path.basename(ready_cosmic)))
        print("Created COSMIC v%s resource in %s" % (cosmic_version,
                                                     os.path.join(variation_dir, os.path.basename(ready_cosmic))))
        if bcbio_build == "GRCh37":
            bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", "hg19")
            if not os.path.exists(bcbio_base):
                continue
            out_dir = utils.safe_makedir(os.path.join("v%s" % cosmic_version, "bcbio_ready", "hg19"))
            out_file = os.path.join(out_dir, "cosmic.vcf.gz")
            hg19_cosmic = map_coords_to_ucsc(ready_cosmic, bcbio_ref, out_file)
            variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
            utils.copy_plus(hg19_cosmic, os.path.join(variation_dir, os.path.basename(hg19_cosmic)))
            print("Created COSMIC v%s resource in %s" % (cosmic_version,
                                                         os.path.join(variation_dir, os.path.basename(hg19_cosmic))))
Example #3
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
Example #4
def main(cosmic_version, bcbio_genome_dir, overwrite=False, clean=False):
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cosmic-prep"))
    os.chdir(work_dir)

    for genome_build, bcbio_build, add_chr in [("GRCh37", "GRCh37", False), ("GRCh38", "hg38", True)]:
        bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", bcbio_build)
        installed_file = os.path.join(bcbio_base, "variation", f"cosmic-v{cosmic_version}.vcf.gz")
        installed_link = os.path.join(bcbio_base, "variation", "cosmic.vcf.gz")
        logging.info(f"Beginning COSMIC v{cosmic_version} prep for {genome_build}.")
        if not os.path.exists(bcbio_base):
            continue
        if os.path.exists(installed_file):
            if not overwrite:
                logging.info(f"{installed_file} exists, please use the --overwrite flag to overwrite the existing files if you want to reinstall.")
                continue
            else:
                logging.info(f"{installed_file} exists, removing.")
                remove_installed(installed_file, installed_link)
        bcbio_ref = os.path.join(bcbio_base, "seq", f"{bcbio_build}.fa")
        cosmic_vcf_files = get_cosmic_vcf_files(genome_build, cosmic_version, clean)
        sorted_inputs = []
        for fname in cosmic_vcf_files:
            sorted_inputs.append(sort_to_ref(fname, bcbio_ref, add_chr=add_chr))
        out_dir = utils.safe_makedir(os.path.join(f"v{cosmic_version}", "bcbio_ready", bcbio_build))
        out_file = os.path.join(out_dir, "cosmic.vcf.gz")
        ready_cosmic = combine_cosmic(sorted_inputs, bcbio_ref, out_file)
        variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
        utils.copy_plus(ready_cosmic, installed_file)
        logging.info(f"Created COSMIC v{cosmic_version} resource in {installed_file}.")
        logging.info(f"Linking {installed_file} as {installed_link}.")
        make_links(installed_file, installed_link)
        update_version_file(bcbio_base, cosmic_version)
        logging.info(f"Finished COSMIC v{cosmic_version} prep for {genome_build}.")
        # prepare hg19 from the GRCh37 file
        if bcbio_build == "GRCh37":
            genome_build = "hg19"
            logging.info(f"Prepping COSMIC v{cosmic_version} for {genome_build} from the GRCh37 preparation.")
            bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", genome_build)
            if not os.path.exists(bcbio_base):
                continue
            installed_file = os.path.join(bcbio_base, "variation", f"cosmic-v{cosmic_version}.vcf.gz")
            installed_link = os.path.join(bcbio_base, "variation", "cosmic.vcf.gz")
            if os.path.exists(installed_file):
                if not overwrite:
                    logging.info(f"{installed_file} exists, please use the --overwrite flag to overwrite the existing files if you want to reinstall.")
                    continue
                else:
                    logging.info(f"{installed_file} exists, removing.")
                    remove_installed(installed_file, installed_link)
            out_dir = utils.safe_makedir(os.path.join(f"v{cosmic_version}", "bcbio_ready", genome_build))
            out_file = os.path.join(out_dir, f"cosmic-v{cosmic_version}.vcf.gz")
            logging.info(f"Translating GRCh37 chromosome names to hg19 chromosome names.")
            hg19_cosmic = map_coords_to_ucsc(ready_cosmic, bcbio_ref, out_file)
            variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
            utils.copy_plus(hg19_cosmic, installed_file)
            logging.info(f"Created COSMIC v{cosmic_version} resource in {installed_file}.")
            logging.info(f"Linking {installed_file} as {installed_link}.")
            make_links(installed_file, installed_link)
            update_version_file(bcbio_base, cosmic_version)
            logging.info(f"Finished COSMIC v{cosmic_version} prep for {genome_build}.")
Example #5
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED
    """
    try:
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" %
                                   (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
            "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
            "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {ready_gene_file}) | "
            "{distance_filter} | cut -f 1-{max_column} | "
            "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
Example #6
def coverage_region_detailed_stats(target_name,
                                   bed_file,
                                   data,
                                   out_dir,
                                   extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    else:
        ready_depth = tz.get_in(["depth", target_name], data)
        if ready_depth:
            cov_file = ready_depth["regions"]
            dist_file = ready_depth["dist"]
        else:
            mosdepth_cov = run_mosdepth(data, target_name, bed_file)
            cov_file = mosdepth_cov.regions
            dist_file = mosdepth_cov.dist
        out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
        out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
        if not utils.file_uptodate(out_cov_file, cov_file):
            utils.copy_plus(cov_file, out_cov_file)
            utils.copy_plus(dist_file, out_dist_file)
        cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
        if extra_cutoffs:
            cutoffs = sorted(list(cutoffs | extra_cutoffs))
        out_files = _calculate_percentiles(out_dist_file, cutoffs, out_dir,
                                           data)
        return [os.path.abspath(x) for x in out_files]
Example #7
def _make_examples(bam_file, data, ref_file, region, out_file, work_dir):
    """Create example pileup images to feed into variant calling.
    """
    region_bed = strelka2.get_region_bed(region, [data],
                                         out_file,
                                         want_gzip=False)
    log_dir = utils.safe_makedir(os.path.join(work_dir, "log"))
    example_dir = utils.safe_makedir(os.path.join(work_dir, "examples"))
    if len(
            glob.glob(
                os.path.join(example_dir, "%s.tfrecord*.gz" %
                             dd.get_sample_name(data)))) == 0:
        with tx_tmpdir(data) as tx_example_dir:
            cmd = [
                "dv_make_examples.py", "--cores",
                dd.get_num_cores(data), "--ref", ref_file, "--reads", bam_file,
                "--regions", region_bed, "--logdir", log_dir, "--examples",
                tx_example_dir, "--sample",
                dd.get_sample_name(data)
            ]
            do.run(cmd,
                   "DeepVariant make_examples %s" % dd.get_sample_name(data))
            for fname in glob.glob(
                    os.path.join(tx_example_dir, "%s.tfrecord*.gz" %
                                 dd.get_sample_name(data))):
                utils.copy_plus(
                    fname, os.path.join(example_dir, os.path.basename(fname)))
    return example_dir
Example #8
def _calculate_percentiles(cov_file, dist_file, cutoffs, out_dir, data):
    """Calculate percentage over over specified cutoff range.

    XXX Does not calculate the per-bin coverage estimations which we had
    earlier with sambamba depth. Instead has a global metric of percent coverage
    which provides a more defined look at coverage changes by depth.
    """
    if not utils.file_exists(dist_file) or not utils.file_exists(cov_file):
        return []
    sample = dd.get_sample_name(data)
    out_total_file = append_stem(dist_file, "_total_summary")
    if not utils.file_exists(out_total_file):
        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                writer.writerow(["cutoff_reads", "bases_pct", "sample"])
                with open(dist_file) as in_handle:
                    for line in in_handle:
                        count, pct = line.strip().split()
                        count = int(count)
                        pct = "%.1f" % (float(pct) * 100.0)
                        if count >= min(cutoffs) and count <= max(cutoffs):
                            writer.writerow(
                                ["percentage%s" % count, pct, sample])
                    if min(cutoffs) < count:
                        writer.writerow(
                            ["percentage%s" % min(cutoffs), pct, sample])
    # To move metrics to multiqc, we will remove the older files
    # once bcbreport accepts these, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_total_fixed = os.path.join(os.path.dirname(out_total_file),
                                   "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_total_file, out_total_fixed)
    return [out_total_fixed]
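
For orientation, the _total_summary file written above is a tab-separated table with one row per cutoff that falls inside the requested range; the sample name and percentages below are invented purely for illustration:

cutoff_reads    bases_pct   sample
percentage1     99.8        NA12878
percentage10    97.5        NA12878
percentage100   62.3        NA12878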
Example #9
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in items]
    out = {"sv": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
            if cur_name not in added and ext.startswith(".vcf"):
                added.add(cur_name)
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "sv", "calls")),
                    "%s%s" % (cur_name, ext))
                utils.copy_plus(data["sv"]["vrn_file"], out_file)
                out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                out["sv"]["calls"].append(out_file)
    return [out]
Example #10
def organize_noalign(data):
    """CWL target to skip alignment and organize input data.
    """
    data = utils.to_single_data(data[0])
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
    utils.copy_plus(data["files"][0], work_bam)
    bam.index(work_bam, data["config"])
    data["align_bam"] = work_bam
    return data
Example #11
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        raw_file = os.path.join(work_dir, "results", "variants",
                                "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example #12
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {
        "validate": validate.combine_validations(items),
        "variants": {
            "calls": [],
            "gvcf": [],
            "samples": []
        }
    }
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variant calling in bcbio;
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(
                data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     out_key)), "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Example #13
def _symlink_or_copy_grabix(in_file, out_file, data):
    """We cannot symlink in CWL, but may be able to use inputs or copy
    """
    if cwlutils.is_cwl_run(data):
        # Has grabix indexes, we're okay to go
        if utils.file_exists(in_file + ".gbi"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
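
A minimal usage sketch for the helper above; the paths are invented and data is assumed to be a bcbio sample dictionary from the pipeline (the keys consulted by cwlutils.is_cwl_run are not shown here):

# 'data' is a bcbio sample dict; file names are hypothetical
in_file = "/work/align_prep/sample1.fastq.gz"       # assumed to carry a .gbi grabix index
out_file = "/work/align_prep/sample1-ready.fastq.gz"
ready = _symlink_or_copy_grabix(in_file, out_file, data)
# In a CWL run with an existing .gbi index the input path is returned as-is;
# otherwise the file and indexes are copied, or symlinked outside of CWL.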
Example #14
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse percent bases per region and summarize it at seven
    region percentile points, with the percent of bases covered
    above each completeness cutoff (5, 10, 20, 50 ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(
            out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))

        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print("cutoff_reads\tbases_pct\tsample", file=out_handle)
                for k in pct_bases:
                    print("\t".join(map(str, [k, pct_bases[k], sample])),
                          file=out_handle)
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print("cutoff_reads\tregion_pct\tbases_pct\tsample",
                      file=out_handle)
                for k in pct:
                    print("\t".join(map(str, [k[0], k[1], pct[k], sample])),
                          file=out_handle)
    # To move metrics to multiqc, we will remove the older files
    # once bcbreport accepts these, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_file_fixed = os.path.join(os.path.dirname(out_file),
                                  "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file),
                                   "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
Example #15
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [
        utils.to_single_data(x)
        for x in vcvalidate.summarize_grading(items, "svvalidate")
    ]
    out = {
        "sv": {
            "calls": [],
            "prioritize": {
                "tsv": [],
                "raw": []
            }
        },
        "svvalidate": vcvalidate.combine_validations(items, "svvalidate")
    }
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(
                        utils.safe_makedir(
                            os.path.join(dd.get_work_dir(data), "sv",
                                         "calls")), "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(
                        out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [
            x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get(
                "sv", []) if x["variantcaller"] == "sv-prioritize"
        ]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(
                prioritysv[0]["raw_files"].values())
    return [out]
Example #16
def _make_examples(bam_file, data, ref_file, region_bed, out_file, work_dir):
    """Create example pileup images to feed into variant calling.
    """
    log_dir = utils.safe_makedir(os.path.join(work_dir, "log"))
    example_dir = utils.safe_makedir(os.path.join(work_dir, "examples"))
    if len(glob.glob(os.path.join(example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data)))) == 0:
        with tx_tmpdir(data) as tx_example_dir:
            cmd = ["dv_make_examples.py", "--cores", dd.get_num_cores(data), "--ref", ref_file,
                   "--reads", bam_file, "--regions", region_bed, "--logdir", log_dir,
                   "--examples", tx_example_dir, "--sample", dd.get_sample_name(data)]
            do.run(cmd, "DeepVariant make_examples %s" % dd.get_sample_name(data))
            for fname in glob.glob(os.path.join(tx_example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data))):
                utils.copy_plus(fname, os.path.join(example_dir, os.path.basename(fname)))
    return example_dir
Example #17
def _link_bam_file(in_file, new_dir, data):
    """Provide symlinks of BAM file and existing indexes if needed.
    """
    new_dir = utils.safe_makedir(new_dir)
    out_file = os.path.join(new_dir, os.path.basename(in_file))
    if data.get("cwl_keys"):
        # Has indexes, we're okay to go with the original file
        if utils.file_exists(in_file + ".bai"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
Example #18
def _handle_precalled(data):
    """Copy in external pre-called variants fed into analysis.
    """
    if data.get("vrn_file"):
        vrn_file = data["vrn_file"]
        if isinstance(vrn_file, (list, tuple)):
            assert len(vrn_file) == 1
            vrn_file = vrn_file[0]
        precalled_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "precalled"))
        ext = utils.splitext_plus(vrn_file)[-1]
        orig_file = os.path.abspath(vrn_file)
        our_vrn_file = os.path.join(precalled_dir, "%s-precalled%s" % (dd.get_sample_name(data), ext))
        utils.copy_plus(orig_file, our_vrn_file)
        data["vrn_file"] = our_vrn_file
    return data
Example #19
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse percent bases per region and summarize it at seven
    region percentile points, with the percent of bases covered
    above each completeness cutoff (5, 10, 20, 50 ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))

        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print("cutoff_reads\tbases_pct\tsample", file=out_handle)
                for k in pct_bases:
                    print("\t".join(map(str, [k, pct_bases[k], sample])), file=out_handle)
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print("cutoff_reads\tregion_pct\tbases_pct\tsample", file=out_handle)
                for k in pct:
                    print("\t".join(map(str, [k[0], k[1], pct[k], sample])), file=out_handle)
    # To move metrics to multiqc, we will remove the older files
    # once bcbreport accepts these, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_file_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
Example #20
def _link_bam_file(in_file, new_dir, data):
    """Provide symlinks of BAM file and existing indexes if needed.
    """
    new_dir = utils.safe_makedir(new_dir)
    out_file = os.path.join(new_dir, os.path.basename(in_file))
    if not utils.file_exists(out_file):
        out_file = os.path.join(new_dir, "%s-prealign.bam" % dd.get_sample_name(data))
    if data.get("cwl_keys"):
        # Has indexes, we're okay to go with the original file
        if utils.file_exists(in_file + ".bai"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
Example #21
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
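
The branching above reduces to a small decision table; restated here for readability (the same logic as the code, with the trimming and remote-input cases abbreviated):

# input file                         needs_bgzip   needs_gunzip
# empty local file                   False         False
# local .gz needing convert/trim     True          True
# local .gz otherwise                decided by _check_gzipped_input
# .bz2                               True          True
# remote, no align_split_size set    False         False
# anything else (plain fastq)        True          False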
Example #22
def organize_noalign(data):
    """CWL target to skip alignment and organize input data.
    """
    data = utils.to_single_data(data[0])
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
    if data.get("files"):
        if data["files"][0].endswith(".cram"):
            work_bam = cram.to_bam(data["files"][0], work_bam, data)
        else:
            assert data["files"][0].endswith(".bam"), data["files"][0]
            utils.copy_plus(data["files"][0], work_bam)
        bam.index(work_bam, data["config"])
    else:
        work_bam = None
    data["align_bam"] = work_bam
    return data
Example #23
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variant calling in bcbio;
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Example #24
def _handle_precalled(data):
    """Copy in external pre-called variants fed into analysis.

    Symlinks for non-CWL runs where we want to ensure VCF present
    in a local directory.
    """
    if data.get("vrn_file") and not cwlutils.is_cwl_run(data):
        vrn_file = data["vrn_file"]
        if isinstance(vrn_file, (list, tuple)):
            assert len(vrn_file) == 1
            vrn_file = vrn_file[0]
        precalled_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "precalled"))
        ext = utils.splitext_plus(vrn_file)[-1]
        orig_file = os.path.abspath(vrn_file)
        our_vrn_file = os.path.join(precalled_dir, "%s-precalled%s" % (dd.get_sample_name(data), ext))
        utils.copy_plus(orig_file, our_vrn_file)
        data["vrn_file"] = our_vrn_file
    return data
Example #25
def main(cosmic_version, bcbio_genome_dir):
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cosmic-prep"))
    os.chdir(work_dir)

    for genome_build, bcbio_build, add_chr in [("GRCh37", "GRCh37", False),
                                               ("GRCh38", "hg38", True)]:
        bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens",
                                  bcbio_build)
        if not os.path.exists(bcbio_base):
            continue
        bcbio_ref = os.path.join(bcbio_base, "seq", "%s.fa" % bcbio_build)
        sorted_inputs = []
        for fname in get_cosmic_files(genome_build, cosmic_version):
            sorted_inputs.append(sort_to_ref(fname, bcbio_ref,
                                             add_chr=add_chr))
        out_dir = utils.safe_makedir(
            os.path.join("v%s" % cosmic_version, "bcbio_ready", bcbio_build))
        out_file = os.path.join(out_dir, "cosmic.vcf.gz")
        ready_cosmic = combine_cosmic(sorted_inputs, bcbio_ref, out_file)
        variation_dir = utils.safe_makedir(
            os.path.join(bcbio_base, "variation"))
        utils.copy_plus(
            ready_cosmic,
            os.path.join(variation_dir, os.path.basename(ready_cosmic)))
        print("Created COSMIC v%s resource in %s" %
              (cosmic_version,
               os.path.join(variation_dir, os.path.basename(ready_cosmic))))
        if bcbio_build == "GRCh37":
            bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens",
                                      "hg19")
            if not os.path.exists(bcbio_base):
                continue
            out_dir = utils.safe_makedir(
                os.path.join("v%s" % cosmic_version, "bcbio_ready", "hg19"))
            out_file = os.path.join(out_dir, "cosmic.vcf.gz")
            hg19_cosmic = map_coords_to_ucsc(ready_cosmic, bcbio_ref, out_file)
            variation_dir = utils.safe_makedir(
                os.path.join(bcbio_base, "variation"))
            utils.copy_plus(
                hg19_cosmic,
                os.path.join(variation_dir, os.path.basename(hg19_cosmic)))
            print("Created COSMIC v%s resource in %s" %
                  (cosmic_version,
                   os.path.join(variation_dir, os.path.basename(hg19_cosmic))))
Example #26
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED
    """
    try:
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                       (max_distance, max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd(os.path.dirname(out_file))
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" %
                                   (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    exports = "export TMPDIR=%s && %s" % (os.path.dirname(out_file), utils.local_path_export())
    bcbio_py = sys.executable
    gsort = config_utils.get_program("gsort", data)
    cmd = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
           "{gsort} - {fai_file} | "
            "bedtools closest -g {fai_file} "
            "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
            "{distance_filter} | cut -f 1-{max_column} | "
            "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
Example #27
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file,
                  work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            if workflow_file:
                has_variants = True
                _run_workflow(items[0], workflow_file, tx_work_dir)
            else:
                has_variants = False
                vcfutils.write_empty_vcf(
                    out_file, items[0]["config"],
                    [dd.get_sample_name(d) for d in items])
        if has_variants:
            raw_file = os.path.join(
                work_dir, "results", "variants", "genome.vcf.gz"
                if joint.want_gvcf(items) else "variants.vcf.gz")
            utils.copy_plus(raw_file, out_file)
            # Remove files with relative symlinks
            utils.remove_plus(
                os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example #28
def coverage_region_detailed_stats(bed_file,
                                   data,
                                   out_dir,
                                   extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    else:
        cov_file, dist_file = _run_mosdepth(bed_file, data)
        out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
        out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
        if not utils.file_uptodate(out_cov_file, cov_file):
            utils.copy_plus(cov_file, out_cov_file)
            utils.copy_plus(dist_file, out_dist_file)
        cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
        if extra_cutoffs:
            cutoffs = sorted(list(cutoffs | extra_cutoffs))
        out_files = _calculate_percentiles(out_cov_file, out_dist_file,
                                           cutoffs, out_dir, data)
        return [os.path.abspath(x) for x in out_files]
Example #29
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in vcvalidate.summarize_grading(items, "svvalidate")]
    out = {"sv": {"calls": [],
                  "supplemental": [],
                  "prioritize": {"tsv": [],
                                 "raw": []}},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                cur_name = _useful_basename(data)
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "sv", "calls")),
                                            "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
            if data["sv"].get("supplemental"):
                out["sv"]["supplemental"].extend([x for x in data["sv"]["supplemental"] if x])
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get("sv", [])
                      if x["variantcaller"] == "sv-prioritize"]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(prioritysv[0]["raw_files"].values())
    return [out]
Example #30
def coverage_region_detailed_stats(target_name, bed_file, data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    if bed_file and utils.file_exists(bed_file):
        ready_depth = tz.get_in(["depth", target_name], data)
        if ready_depth:
            cov_file = ready_depth["regions"]
            dist_file = ready_depth["dist"]
            thresholds_file = ready_depth.get("thresholds")
            out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
            out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
            out_thresholds_file = os.path.join(out_dir, os.path.basename(thresholds_file)) \
                if thresholds_file and os.path.isfile(thresholds_file) else None
            if not utils.file_uptodate(out_cov_file, cov_file):
                utils.copy_plus(cov_file, out_cov_file)
                utils.copy_plus(dist_file, out_dist_file)
                if out_thresholds_file:
                    utils.copy_plus(thresholds_file, out_thresholds_file)
            return [out_cov_file, out_dist_file] + ([out_thresholds_file] if out_thresholds_file else [])
    return []
Example #31
def _report_summary(samples, out_dir):
    """
    Run coverage report with bcbiocov package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature",
                                     "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(
                    qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

        logger.info("summarize target information")
        samples = _merge_target_information(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(
                        fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" %
                                 rmd_file)
            cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples
Example #32
def _report_summary(samples, out_dir):
    """
    Run coverage report with bcbiocov package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples