Ejemplo n.º 1
0
def run(bam_file, data, out_dir):
    """Run qualimap bamqc to assess alignment quality metrics.

    Results are written to ``<out_dir>/<sample name>``. The qualimap command
    is skipped when either the HTML report or the PDF report already exists
    in that directory.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that holds the per-sample results directory

    :returns: (dict) always empty; the qualimap report is not parsed here
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
        else:
            # Fall back to the full BAM when downsample returns nothing.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir,
                                       dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        # NOTE: the placeholders below are filled from local variable names
        # via locals(); renaming a local silently breaks the command.
        cmd = (
            "unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
            "--skip-duplicated --skip-dup-mode 0 "
            "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        # The key path must be a sequence of keys. A bare string is walked
        # character by character, so the lookup always fell back to the
        # default and mouse builds were never recognised.
        elif (tz.get_in(("genome_build",), data, "") or "").startswith(
                ("mm", "GRCm")):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(
            dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()),
               "Qualimap: %s" % dd.get_sample_name(data))

    # Metrics are not parsed from the report here; callers get an empty dict.
    return dict()
Ejemplo n.º 2
0
def run(bam_file, data, out_dir):
    """Run qualimap bamqc for a sample and report where the results live.

    Qualimap writes into ``<out_dir>/<sample name>`` through a file
    transaction. The command is skipped when ``genome_results.txt`` or the PDF
    report already exists there. ``genome_results.txt`` is always copied into
    ``out_dir`` afterwards.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that holds the per-sample results directory

    :returns: (dict) ``"base"`` is the copied results file, ``"secondary"`` is
              whatever ``_find_qualimap_secondary_files`` returns
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    qm_resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(qm_resources.get("options", ""))
    results_name = "genome_results.txt"
    results_file = os.path.join(results_dir, results_name)
    utils.safe_makedir(results_dir)
    pdf_name = "qualimapReport.pdf"

    already_done = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_name)))
    if not already_done:
        tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
        if "qualimap_full" in tools_on:
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if downsampled:
                bam_file = downsampled
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_name)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap_bin = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(qm_resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            env_prefix = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            template = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                        "--skip-duplicated --skip-dup-mode 0 "
                        "-nt {num_cores} {options}")
            # Values for the template placeholders; optional ones are added
            # together with the command fragment that uses them.
            fields = {"export": env_prefix,
                      "qualimap": qualimap_bin,
                      "bam_file": bam_file,
                      "tx_results_dir": tx_results_dir,
                      "num_cores": num_cores,
                      "options": options}

            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            else:
                species = None
            if species is not None:
                template += " -gd {species}"
                fields["species"] = species

            coverage = dd.get_coverage(data)
            if coverage in [None, False, "None"]:
                regions = dd.get_variant_regions_merged(data)
            else:
                regions = coverage
            if regions:
                merged = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                fields["bed6_regions"] = _bed_to_bed6(merged, out_dir)
                template += " -gff {bed6_regions}"

            bcbio_env = utils.get_bcbio_env()
            do.run(template.format(**fields), "Qualimap: %s" % sample_name, env=bcbio_env)

            # Rewrite the reported BAM name so it matches the sample name.
            tx_results_file = os.path.join(tx_results_dir, results_name)
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(rename_cmd, "Fix Name Qualimap for {}".format(sample_name))

    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, results_name)
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file,
            "secondary": secondary}
Ejemplo n.º 3
0
def starts_by_depth(bam_file, data, sample_size=10000000):
    """
    Return a set of x, y points where x is the number of reads sequenced and
    y is the number of unique start sites identified
    If sample size < total reads in a file the file will be downsampled.

    Unmapped reads are skipped. A point is recorded every ``binsize`` mapped
    reads (roughly 1% of ``sample_size``), plus one final point after the last
    read.

    :param bam_file: (str) path of the BAM file to read
    :param data: (dict) sample information, passed through to ``bam.downsample``
    :param sample_size: (int) target number of reads for downsampling

    :returns: (pandas.DataFrame) columns ``reads`` and ``starts``
    """
    # Integer bin size: true division on Python 3 yields a fractional bin
    # whenever sample_size is not a multiple of 100, which makes the modulo
    # check below skip most checkpoints.
    binsize = int(sample_size) // 100 + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    # Other callers of bam.downsample guard against a falsy return and fall
    # back to the original file; do the same here instead of opening None.
    downsampled = bam.downsample(bam_file, data, sample_size) or bam_file
    with bam.open_samfile(downsampled) as samfile:
        for read in samfile:
            if read.is_unmapped:
                continue
            counted += 1
            seen_starts.add((read.tid, read.pos))
            if counted % binsize == 0:
                num_reads.append(counted)
                starts.append(len(seen_starts))
        # Final point covering the reads seen since the last checkpoint.
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
Ejemplo n.º 4
0
def run(bam_file, data, out_dir):
    """Produce a qualimap bamqc report for a BAM file and parse its metrics.

    The qualimap command only runs when neither the HTML report nor the PDF
    report is already present in ``out_dir``. Unless ``qualimap_full`` is
    listed in the ``tools_on`` algorithm setting, the BAM is first handed to
    ``bam.downsample`` with a target of 1e7.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that receives the qualimap output

    :returns: result of ``_parse_qualimap_metrics`` for the HTML report path
    """
    qm_resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(qm_resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_name = "qualimapReport.pdf"

    # Nothing to run when a report from an earlier run is already there.
    if utils.file_exists(report_file) or utils.file_exists(os.path.join(out_dir, pdf_name)):
        return _parse_qualimap_metrics(report_file)

    tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
    if "qualimap_full" in tools_on:
        logger.info("Full qualimap analysis for %s may be slow." % bam_file)
    else:
        downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        if downsampled:
            bam_file = downsampled
    if "PDF" in options:
        options = "%s -outfile %s" % (options, pdf_name)
    utils.safe_makedir(out_dir)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    qualimap_bin = config_utils.get_program("qualimap", data["config"])
    max_mem = config_utils.adjust_memory(qm_resources.get("memory", "1G"),
                                         num_cores)
    path_export = utils.local_path_export()

    template = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
                "-nt {num_cores} --java-mem-size={max_mem} {options}")
    # Values for the template placeholders; optional ones are added together
    # with the command fragment that uses them.
    fields = {"export": path_export,
              "qualimap": qualimap_bin,
              "bam_file": bam_file,
              "out_dir": out_dir,
              "num_cores": num_cores,
              "max_mem": max_mem,
              "options": options}

    species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
    if species in ["HUMAN", "MOUSE"]:
        template += " -gd {species}"
        fields["species"] = species

    regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    if regions:
        fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
        template += " -gff {bed6_regions}"

    do.run(template.format(**fields), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
Ejemplo n.º 5
0
def _prep_subsampled_bams(data, work_dir):
    """Return the BAM file(s) to analyse; currently the full BAM, without subsampling.

    The intended strategy was to minimize run times by pre-extracting useful
    reads (split reads and discordants from samblaster) mixed with subsampled
    normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    with correctly aligned reads subsampled to 100 million based on speedseq
    defaults and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently does not downsample as new versions do not get good sensitivity with
    downsampled BAMs. The downsample/merge/index steps that used to follow the
    return were unreachable and have been removed.

    :param data: (dict) sample information passed to ``sshared.get_split_discordants``
    :param work_dir: (str) working directory passed to ``sshared.get_split_discordants``

    :returns: (list) single-element list holding the full BAM path
    """
    # The split-read and discordant BAMs are still produced by the helper but
    # are not needed while subsampling is disabled.
    full_bam, _sr_bam, _disc_bam = sshared.get_split_discordants(data, work_dir)
    return [full_bam]
Ejemplo n.º 6
0
def _run_fastqc(bam_file, data, fastqc_out):
    """Generate a FastQC report in ``fastqc_out`` (if missing) and parse its summary.

    The input is passed to ``bam.downsample`` with a 1e7 target to avoid
    excessive processing times with large files, unless
    ``data["analysis"]`` is "standard" (case-insensitive).

    :param bam_file: (str) path of the BAM file to check
    :param data: (dict) sample information; reads ``analysis``, ``config`` and ``name``
    :param fastqc_out: (str) final directory for the FastQC output

    :returns: result of ``FastQCParser(fastqc_out).get_fastqc_summary()``
    """
    report_html = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(report_html):
        parent_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(parent_dir)
        if data.get("analysis", "").lower() in ["standard"]:
            ds_bam = None
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7)
        input_bam = ds_bam or bam_file
        threads = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, parent_dir) as tmp_dir, utils.chdir(tmp_dir):
            fastqc_cmd = [config_utils.get_program("fastqc", data["config"]),
                          "-t", str(threads), "-o", tmp_dir, "-f", "bam", input_bam]
            do.run(fastqc_cmd, "FastQC: %s" % data["name"][-1])
            bam_stem = os.path.splitext(os.path.basename(input_bam))[0]
            raw_outdir = os.path.join(tmp_dir, "%s_fastqc" % bam_stem)
            # Drop the zipped copy of the report if one was produced.
            zip_path = "%s.zip" % raw_outdir
            if os.path.exists(zip_path):
                os.remove(zip_path)
            if not os.path.exists(report_html):
                # Replace any partial output directory with the fresh results.
                if os.path.exists(fastqc_out):
                    shutil.rmtree(fastqc_out)
                shutil.move(raw_outdir, fastqc_out)
        # The downsampled BAM is only a temporary input for FastQC.
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    return FastQCParser(fastqc_out).get_fastqc_summary()
Ejemplo n.º 7
0
def run(bam_file, data, out_dir):
    """Produce a qualimap bamqc report for a BAM file and parse its metrics.

    The qualimap command only runs when neither the HTML report nor the PDF
    report is already present in ``out_dir``. Unless ``qualimap_full`` is
    listed in the ``tools_on`` algorithm setting, the BAM is first handed to
    ``bam.downsample`` with a target of 1e7.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that receives the qualimap output

    :returns: result of ``_parse_qualimap_metrics`` for the HTML report path
    """
    qm_resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(qm_resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_name = "qualimapReport.pdf"

    # Nothing to run when a report from an earlier run is already there.
    if utils.file_exists(report_file) or utils.file_exists(os.path.join(out_dir, pdf_name)):
        return _parse_qualimap_metrics(report_file)

    tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
    if "qualimap_full" in tools_on:
        logger.info("Full qualimap analysis for %s may be slow." % bam_file)
    else:
        downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        if downsampled:
            bam_file = downsampled
    if "PDF" in options:
        options = "%s -outfile %s" % (options, pdf_name)
    utils.safe_makedir(out_dir)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    qualimap_bin = config_utils.get_program("qualimap", data["config"])
    max_mem = config_utils.adjust_memory(qm_resources.get("memory", "1G"),
                                         num_cores)

    template = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
                "-nt {num_cores} --java-mem-size={max_mem} {options}")
    # Values for the template placeholders; optional ones are added together
    # with the command fragment that uses them.
    fields = {"qualimap": qualimap_bin,
              "bam_file": bam_file,
              "out_dir": out_dir,
              "num_cores": num_cores,
              "max_mem": max_mem,
              "options": options}

    species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
    if species in ["HUMAN", "MOUSE"]:
        template += " -gd {species}"
        fields["species"] = species

    regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    if regions:
        fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
        template += " -gff {bed6_regions}"

    do.run(template.format(**fields), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)
Ejemplo n.º 8
0
def _run_fastqc(bam_file, data, fastqc_out):
    """Generate a FastQC report in ``fastqc_out`` (if missing) and parse its summary.

    The input is passed to ``bam.downsample`` with a 1e7 target to avoid
    excessive processing times with large files, unless
    ``data["analysis"]`` is "standard" (case-insensitive).

    :param bam_file: (str) path of the BAM file to check
    :param data: (dict) sample information; reads ``analysis``, ``config`` and ``name``
    :param fastqc_out: (str) final directory for the FastQC output

    :returns: result of ``FastQCParser(fastqc_out).get_fastqc_summary()``
    """
    report_html = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(report_html):
        parent_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(parent_dir)
        if data.get("analysis", "").lower() in ["standard"]:
            ds_bam = None
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7)
        input_bam = ds_bam or bam_file
        threads = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, parent_dir) as tmp_dir, utils.chdir(tmp_dir):
            fastqc_cmd = [config_utils.get_program("fastqc", data["config"]),
                          "-t", str(threads), "-o", tmp_dir, "-f", "bam", input_bam]
            do.run(fastqc_cmd, "FastQC: %s" % data["name"][-1])
            bam_stem = os.path.splitext(os.path.basename(input_bam))[0]
            raw_outdir = os.path.join(tmp_dir, "%s_fastqc" % bam_stem)
            # Drop the zipped copy of the report if one was produced.
            zip_path = "%s.zip" % raw_outdir
            if os.path.exists(zip_path):
                os.remove(zip_path)
            if not os.path.exists(report_html):
                # Replace any partial output directory with the fresh results.
                if os.path.exists(fastqc_out):
                    shutil.rmtree(fastqc_out)
                shutil.move(raw_outdir, fastqc_out)
        # The downsampled BAM is only a temporary input for FastQC.
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    return FastQCParser(fastqc_out).get_fastqc_summary()
Ejemplo n.º 9
0
def run(bam_file, data, out_dir):
    """Run qualimap bamqc for a sample and report where the results live.

    Qualimap writes into ``<out_dir>/<sample name>`` through a file
    transaction. The command is skipped when ``genome_results.txt`` or the PDF
    report already exists there. ``genome_results.txt`` is always copied into
    ``out_dir`` afterwards.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that holds the per-sample results directory

    :returns: (dict) ``"base"`` is the copied results file, ``"secondary"`` is
              whatever ``_find_qualimap_secondary_files`` returns
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    qm_resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(qm_resources.get("options", ""))
    results_name = "genome_results.txt"
    results_file = os.path.join(results_dir, results_name)
    utils.safe_makedir(results_dir)
    pdf_name = "qualimapReport.pdf"

    already_done = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_name)))
    if not already_done:
        tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
        if "qualimap_full" in tools_on:
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if downsampled:
                bam_file = downsampled
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_name)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap_bin = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(qm_resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            env_prefix = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            template = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                        "--skip-duplicated --skip-dup-mode 0 "
                        "-nt {num_cores} {options}")
            # Values for the template placeholders; optional ones are added
            # together with the command fragment that uses them.
            fields = {"export": env_prefix,
                      "qualimap": qualimap_bin,
                      "bam_file": bam_file,
                      "tx_results_dir": tx_results_dir,
                      "num_cores": num_cores,
                      "options": options}

            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            else:
                species = None
            if species is not None:
                template += " -gd {species}"
                fields["species"] = species

            coverage = dd.get_coverage(data)
            if coverage in [None, False, "None"]:
                regions = dd.get_variant_regions_merged(data)
            else:
                regions = coverage
            if regions:
                merged = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                fields["bed6_regions"] = _bed_to_bed6(merged, out_dir)
                template += " -gff {bed6_regions}"

            bcbio_env = utils.get_bcbio_env()
            do.run(template.format(**fields), "Qualimap: %s" % sample_name, env=bcbio_env)

            # Rewrite the reported BAM name so it matches the sample name.
            tx_results_file = os.path.join(tx_results_dir, results_name)
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(rename_cmd, "Fix Name Qualimap for {}".format(sample_name))

    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, results_name)
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file,
            "secondary": secondary}
Ejemplo n.º 10
0
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results

    The qualimap command is skipped when ``rnaseq_qc_results.txt`` already
    exists in ``<out_dir>/<sample name>``. Metrics are collected either way,
    and the results file is copied into ``out_dir``.

    :param bam_file: (str) path of the RNA-seq BAM file
    :param data: (dict) sample information
    :param out_dir: (str) directory that holds the per-sample results directory

    :returns: (dict) with keys ``"base"`` (copied results file),
              ``"secondary"`` (from ``_find_qualimap_secondary_files``) and
              ``"metrics"`` (from ``_parse_metrics``)
    """
    # Maps the configured strandedness onto the library label handed to
    # _rnaseq_qualimap_cmd.
    library_by_strandedness = {
        "firststrand": "strand-specific-forward",
        "secondstrand": "strand-specific-reverse",
        "unstranded": "non-strand-specific",
        "auto": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    sample_name = dd.get_sample_name(data)
    results_dir = os.path.join(out_dir, sample_name)
    results_name = "rnaseq_qc_results.txt"
    results_file = os.path.join(results_dir, results_name)
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = library_by_strandedness[dd.get_strandedness(data)]

    # don't run qualimap on the full bam by default
    tools_on = tz.get_in(("config", "algorithm", "tools_on"), data, [])
    if "qualimap_full" in tools_on:
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")
    else:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        if downsampled:
            bam_file = downsampled

    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            qualimap_cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir,
                                                gtf_file, library)
            do.run(qualimap_cmd, "Qualimap for {}".format(sample_name))
            # Rewrite the reported BAM name so it matches the sample name.
            tx_results_file = os.path.join(tx_results_dir, results_name)
            rename_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                sample_name, tx_results_file)
            do.run(rename_cmd, "Fix Name Qualimap for {}".format(sample_name))

    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics["Average_insert_size"] = salmon.estimate_fragment_size(data)
    metrics = _parse_metrics(metrics)

    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, results_name)
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {
        "base": base_results_file,
        "secondary": secondary,
        "metrics": metrics
    }
Ejemplo n.º 11
0
def _prep_subsampled_bams(data, work_dir):
    """Build a subsampled BAM merged with split reads and discordants from samblaster.

    This attempts to minimize run times by pre-extracting useful reads mixed
    with subsampled normal pairs to estimate paired end distributions:

    https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ

    Correctly aligned reads are subsampled to 100 million, based on speedseq
    defaults and evaluations on NA12878 whole genome data:

    https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102

    XXX Currently not used as new versions of delly do not get good sensitivity
    with downsampled BAMs.

    :param data: (dict) sample information; ``data["config"]`` is passed to
                 ``bam.merge`` and ``bam.index``
    :param work_dir: (str) working directory for the intermediate BAM files

    :returns: (list) single-element list holding the merged, indexed BAM path
    """
    split_bam, discordant_bam = sshared.get_split_discordants(data, work_dir)
    proper_pairs_bam = bam.downsample(
        dd.get_align_bam(data), data, 1e8,
        read_filter="-F 'not secondary_alignment and proper_pair'",
        always_run=True, work_dir=work_dir)
    # Final name inserts "-final" between the base name and the extension.
    base, ext = utils.splitext_plus(proper_pairs_bam)
    merged_bam = "{0}-final{1}".format(base, ext)
    config = data["config"]
    if not utils.file_exists(merged_bam):
        bam.merge([proper_pairs_bam, split_bam, discordant_bam], merged_bam, config)
    bam.index(merged_bam, config)
    return [merged_bam]
Ejemplo n.º 12
0
def _run_qualimap(bam_file, data, out_dir):
    """Produce a qualimap bamqc report (if missing) and parse its metrics.

    When the HTML report is absent, the BAM is handed to ``bam.downsample``
    with a target of 1e7 and qualimap is run into ``out_dir``.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; reads ``config`` and ``name``
    :param out_dir: (str) directory that receives the qualimap output

    :returns: result of ``_parse_qualimap_metrics`` for the HTML report path
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if os.path.exists(report_file):
        return _parse_qualimap_metrics(report_file)

    downsampled = bam.downsample(bam_file, data, 1e7)
    if downsampled:
        bam_file = downsampled
    utils.safe_makedir(out_dir)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    qualimap_bin = config_utils.get_program("qualimap", data["config"])
    qm_resources = config_utils.get_resources("qualimap", data["config"])
    max_mem = config_utils.adjust_memory(qm_resources.get("memory", "1G"),
                                         num_cores)

    template = (
        "unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
        "-nt {num_cores} --java-mem-size={max_mem}")
    # Values for the template placeholders; optional ones are added together
    # with the command fragment that uses them.
    fields = {"qualimap": qualimap_bin,
              "bam_file": bam_file,
              "out_dir": out_dir,
              "num_cores": num_cores,
              "max_mem": max_mem}

    species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
    if species in ["HUMAN", "MOUSE"]:
        template += " -gd {species}"
        fields["species"] = species

    regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    if regions:
        fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
        template += " -gff {bed6_regions}"

    do.run(template.format(**fields), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)
Ejemplo n.º 13
0
def _run_qsignature_generator(bam_file, data, out_dir):
    """Run SignatureGenerator to create a normalized VCF, later used as input of qsignature_summary.

    Nothing is run unless the sample's mixup check setting starts with
    "qsignature". With "qsignature_full" the whole BAM is used with a larger
    JVM heap and read limit; otherwise the BAM is first reduced with
    ``_slice_chr22``.

    :param bam_file: (str) path of the bam_file
    :param data: (dict) sample information; ``data["config"]`` and
                 ``data["name"]`` are read here
    :param out_dir: (str) path of the output

    :returns: (dict) ``{'qsig_vcf': <path>}`` with the normalized VCF file, or
              an empty dict when the check is not enabled, no qsignature
              positions file is available, or the qsignature program is not
              found
    :raises IOError: when the VCF expected next to the input BAM is missing
                     after the command has run
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if not (mixup_check and mixup_check.startswith("qsignature")):
        return {}
    if not position:
        logger.info("There is no qsignature for this species: %s" %
                    tz.get_in(['genome_build'], data))
        return {}

    if mixup_check == "qsignature_full":
        slice_bam = bam_file
        jvm_opts = "-Xms750m -Xmx8g"
        limit_reads = 100000000
    else:
        slice_bam = _slice_chr22(bam_file, data)
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000

    qsig = config_utils.get_program("qsignature", data["config"])
    if not qsig:
        return {}
    utils.safe_makedir(out_dir)

    # Swap only the trailing "bam" for "qsig.vcf". A plain str.replace rewrote
    # every "bam" substring, mangling names that contain "bam" elsewhere.
    bam_name = os.path.basename(slice_bam)
    if bam_name.endswith("bam"):
        out_name = bam_name[:-len("bam")] + "qsig.vcf"
    else:
        # Inputs without the usual suffix keep the historical naming.
        out_name = bam_name.replace("bam", "qsig.vcf")
    out_file = os.path.join(out_dir, out_name)
    log_file = os.path.join(out_dir, "qsig.log")
    cores = dd.get_cores(data)
    base_cmd = ("{qsig} {jvm_opts} "
                "org.qcmg.sig.SignatureGenerator "
                "--noOfThreads {cores} "
                "-log {log_file} -i {position} "
                "-i {down_file} ")
    if not os.path.exists(out_file):
        # Fall back to the un-downsampled BAM when downsample returns nothing.
        down_file = bam.downsample(slice_bam, data, limit_reads) or slice_bam
        # The generated VCF is expected next to the BAM that was processed.
        file_qsign_out = "{0}.qsig.vcf".format(down_file)
        cmd = base_cmd.format(qsig=qsig, jvm_opts=jvm_opts, cores=cores,
                              log_file=log_file, position=position,
                              down_file=down_file)
        do.run(cmd, "qsignature vcf generation: %s" % data["name"][-1])
        if not os.path.exists(file_qsign_out):
            raise IOError("File doesn't exist %s" % file_qsign_out)
        with file_transaction(data, out_file) as file_txt_out:
            shutil.move(file_qsign_out, file_txt_out)
    return {'qsig_vcf': out_file}
Ejemplo n.º 14
0
def run(bam_file, data, out_dir):
    """Run qualimap bamqc to assess alignment quality metrics.

    Results are written to ``<out_dir>/<sample name>``. The qualimap command
    is skipped when either the HTML report or the PDF report already exists
    in that directory.

    :param bam_file: (str) path of the BAM file to assess
    :param data: (dict) sample information; ``data["config"]`` is read for the
                 qualimap resources, program location and core count
    :param out_dir: (str) directory that holds the per-sample results directory

    :returns: (dict) always empty; the qualimap report is not parsed here
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Fall back to the full BAM when downsample returns nothing.
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        # NOTE: the placeholders below are filled from local variable names
        # via locals(); renaming a local silently breaks the command.
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        # The key path must be a sequence of keys. A bare string is walked
        # character by character, so the lookup always fell back to the
        # default and mouse builds were never recognised.
        elif (tz.get_in(("genome_build",), data, "") or "").startswith(("mm", "GRCm")):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))

    # Metrics are not parsed from the report here; callers get an empty dict.
    return dict()
Ejemplo n.º 15
0
def _run_qsignature_generator(bam_file, data, out_dir):
    """Run SignatureGenerator to create a normalized VCF, later used as input of qsignature_summary.

    Nothing is run unless the sample's mixup check setting starts with
    "qsignature". With "qsignature_full" the whole BAM is used with a larger
    JVM heap and read limit; otherwise the BAM is first reduced with
    ``_slice_chr22``.

    :param bam_file: (str) path of the bam_file
    :param data: (dict) sample information; ``data["config"]`` and
                 ``data["name"]`` are read here
    :param out_dir: (str) path of the output

    :returns: (dict) ``{'qsig_vcf': <path>}`` with the normalized VCF file, or
              an empty dict when the check is not enabled, no qsignature
              positions file is available, or the qsignature program is not
              found
    :raises IOError: when the VCF expected next to the input BAM is missing
                     after the command has run
    """
    position = dd.get_qsig_file(data)
    mixup_check = dd.get_mixup_check(data)
    if not (mixup_check and mixup_check.startswith("qsignature")):
        return {}
    if not position:
        logger.info("There is no qsignature for this species: %s"
                    % tz.get_in(['genome_build'], data))
        return {}

    if mixup_check == "qsignature_full":
        slice_bam = bam_file
        jvm_opts = "-Xms750m -Xmx8g"
        limit_reads = 100000000
    else:
        slice_bam = _slice_chr22(bam_file, data)
        jvm_opts = "-Xms750m -Xmx2g"
        limit_reads = 20000000

    qsig = config_utils.get_program("qsignature", data["config"])
    if not qsig:
        return {}
    utils.safe_makedir(out_dir)

    # Swap only the trailing "bam" for "qsig.vcf". A plain str.replace rewrote
    # every "bam" substring, mangling names that contain "bam" elsewhere.
    bam_name = os.path.basename(slice_bam)
    if bam_name.endswith("bam"):
        out_name = bam_name[:-len("bam")] + "qsig.vcf"
    else:
        # Inputs without the usual suffix keep the historical naming.
        out_name = bam_name.replace("bam", "qsig.vcf")
    out_file = os.path.join(out_dir, out_name)
    log_file = os.path.join(out_dir, "qsig.log")
    cores = dd.get_cores(data)
    base_cmd = ("{qsig} {jvm_opts} "
                "org.qcmg.sig.SignatureGenerator "
                "--noOfThreads {cores} "
                "-log {log_file} -i {position} "
                "-i {down_file} ")
    if not os.path.exists(out_file):
        # Fall back to the un-downsampled BAM when downsample returns nothing.
        down_file = bam.downsample(slice_bam, data, limit_reads) or slice_bam
        # The generated VCF is expected next to the BAM that was processed.
        file_qsign_out = "{0}.qsig.vcf".format(down_file)
        cmd = base_cmd.format(qsig=qsig, jvm_opts=jvm_opts, cores=cores,
                              log_file=log_file, position=position,
                              down_file=down_file)
        do.run(cmd, "qsignature vcf generation: %s" % data["name"][-1])
        if not os.path.exists(file_qsign_out):
            raise IOError("File doesn't exist %s" % file_qsign_out)
        with file_transaction(data, out_file) as file_txt_out:
            shutil.move(file_qsign_out, file_txt_out)
    return {'qsig_vcf': out_file}
Ejemplo n.º 16
0
def process_lane(item):
    """Prepare the input files of a lane, reducing the reads for a test run.

    The input files are looked up with ``get_fastq_files``. When
    ``item["test_run"]`` is truthy the first file is downsampled to
    ``NUM_DOWNSAMPLE`` reads: BAM inputs go through ``bam.downsample``, any
    other input goes through ``fastq.downsample`` in quick mode.

    :param item: (dict) lane information; ``item["rgnames"]["lane"]`` is
                 required for logging and ``item["test_run"]`` is optional
    :returns: (list) one-element list with ``item``, whose ``"files"`` key
              has been set to the ``(file1, file2)`` tuple
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            # bam.downsample can hand back a falsy value instead of a path
            # (presumably when no downsampled file was produced -- confirm
            # against its implementation). Keep the original BAM in that case
            # rather than replacing the lane input with None.
            downsampled = bam.downsample(file1, item, NUM_DOWNSAMPLE)
            if downsampled:
                file1 = downsampled
        else:
            file1, file2 = fastq.downsample(file1, file2, item, NUM_DOWNSAMPLE, quick=True)
    item["files"] = (file1, file2)
    return [item]
Ejemplo n.º 17
0
def run(bam_file, data, fastqc_out):
    """Run FastQC on the input, store the report in ``fastqc_out`` and parse it.

    Nothing is re-run when ``fastqc_out/fastqc_report.html`` already exists.
    Otherwise the input is downsampled to 1e7 reads (skipped for the
    "standard" and "smallrna-seq" analyses), FastQC is executed inside a
    transactional temporary directory, and the combined HTML report, the
    ``fastqc_data.txt`` file (with the input file name replaced by the sample
    name) and the optional zip archive are moved into ``fastqc_out``.

    :param bam_file: (str) path of the input file; treated as BAM when the
                     name ends with "bam", as fastq otherwise
    :param data: (dict) sample information and configuration
    :param fastqc_out: (str) directory receiving the FastQC results

    :returns: summary produced by ``FastQCParser.get_fastqc_summary``
    :raises ValueError: when FastQC did not produce the combined HTML report
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        skip_downsample = data.get("analysis", "").lower() in ["standard", "smallrna-seq"]
        if not skip_downsample:
            downsampled = bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
            if downsampled is not None:
                bam_file = downsampled
        input_format = "fastq"
        if bam_file.endswith("bam"):
            input_format = "bam"
        input_basename = os.path.basename(bam_file)
        fastqc_name = utils.splitext_plus(input_basename)[0]
        sample_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir, utils.chdir(tx_tmp_dir):
            fastqc_args = [config_utils.get_program("fastqc", data["config"]),
                           "-d", tx_tmp_dir,
                           "-t", str(num_cores),
                           "--extract",
                           "-o", tx_tmp_dir,
                           "-f", input_format,
                           bam_file]
            cmd = "%s %s %s" % (utils.java_freetype_fix(),
                                utils.local_path_export(),
                                " ".join(str(arg) for arg in fastqc_args))
            do.run(cmd, "FastQC: %s" % dd.get_sample_name(data))
            tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
            tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
            if not os.path.exists(sentry_file):
                if not os.path.exists(tx_combo_file):
                    raise ValueError("FastQC failed to produce output HTML file: %s" % os.listdir(tx_tmp_dir))
                utils.safe_makedir(fastqc_out)
                # Reports should show the sample name rather than the input file name.
                original_data = os.path.join(tx_fastqc_out, "fastqc_data.txt")
                renamed_data = os.path.join(tx_fastqc_out, "_fastqc_data.txt")
                with open(original_data, 'r') as in_handle, open(renamed_data, 'w') as out_handle:
                    for line in in_handle:
                        out_handle.write(line.replace(input_basename, sample_name))
                shutil.move(renamed_data, os.path.join(fastqc_out, 'fastqc_data.txt'))
                shutil.move(tx_combo_file, sentry_file)
                tx_zip_file = "%s.zip" % tx_fastqc_out
                if os.path.exists(tx_zip_file):
                    shutil.move(tx_zip_file, os.path.join(fastqc_out, "%s.zip" % sample_name))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    summary = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return summary
Ejemplo n.º 18
0
def prepare_sample(data):
    """Prepare the input files of a sample, reducing the reads for a test run.

    The input files are looked up with ``get_fastq_files``. When
    ``data["test_run"]`` is truthy the input is downsampled to
    ``NUM_DOWNSAMPLE`` reads: BAM inputs go through ``bam.downsample`` (and
    the second file is reset to ``None``), any other input goes through
    ``fastq.downsample`` in quick mode.

    :param data: (dict) sample information; ``data["rgnames"]["sample"]`` is
                 required for logging and ``data["test_run"]`` is optional
    :returns: (list) ``[[data]]`` where ``data["files"]`` has been set to
              ``[file1, file2]``
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    file1, file2 = get_fastq_files(data)
    if data.get("test_run", False):
        if bam.is_bam(file1):
            # bam.downsample can hand back a falsy value instead of a path
            # (presumably when no downsampled file was produced -- confirm
            # against its implementation). Keep the original BAM in that case
            # rather than replacing the sample input with None.
            downsampled = bam.downsample(file1, data, NUM_DOWNSAMPLE)
            if downsampled:
                file1 = downsampled
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, data, NUM_DOWNSAMPLE, quick=True)
    data["files"] = [file1, file2]
    return [[data]]
Ejemplo n.º 19
0
def _run_fastqc(bam_file, data, fastqc_out):
    """Run FastQC, store the report in ``fastqc_out`` and parse its metrics.

    Nothing is re-run when ``fastqc_out/fastqc_report.html`` already exists.
    Otherwise the input is downsampled to 1e7 reads (skipped for the
    "standard" and "smallrna-seq" analyses) and FastQC is executed inside a
    transactional temporary directory. Two output layouts are handled: a
    combined ``<name>_fastqc.html`` file, in which case the HTML report and
    ``fastqc_data.txt`` are moved into ``fastqc_out``; or, when that HTML
    file is absent, the whole extracted ``<name>_fastqc`` directory replaces
    ``fastqc_out``.

    :param bam_file: (str) path of the input file; treated as BAM when the
                     name ends with "bam", as fastq otherwise
    :param data: (dict) sample information and configuration
    :param fastqc_out: (str) directory receiving the FastQC results

    :returns: summary produced by ``FastQCParser.get_fastqc_summary``
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard", "smallrna-seq"]
                  else None)
        # Fall back to the full input when no downsampled file was returned.
        bam_file = ds_bam if ds_bam else bam_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [
                    config_utils.get_program("fastqc", data["config"]), "-t",
                    str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt,
                    bam_file
                ]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                tx_fastqc_out = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir,
                                             "%s_fastqc.html" % fastqc_name)
                # The zip archive is not kept; only the extracted files are used.
                if os.path.exists("%s.zip" % tx_fastqc_out):
                    os.remove("%s.zip" % tx_fastqc_out)
                if not os.path.exists(sentry_file) and os.path.exists(
                        tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Move to an explicit file path: moving into the directory
                    # raises shutil.Error when a fastqc_data.txt left over from
                    # an interrupted run is already present there.
                    shutil.move(os.path.join(tx_fastqc_out, "fastqc_data.txt"),
                                os.path.join(fastqc_out, "fastqc_data.txt"))
                    shutil.move(tx_combo_file, sentry_file)
                elif not os.path.exists(sentry_file):
                    # No combined HTML file: use the extracted directory as the
                    # output, replacing any previous content.
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
    parser = FastQCParser(fastqc_out, data["name"][-1])
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
Ejemplo n.º 20
0
def _run_fastqc(bam_file, data, fastqc_out):
    """Run FastQC on a BAM file, store the report in ``fastqc_out`` and parse it.

    Nothing is re-run when ``fastqc_out/fastqc_report.html`` already exists.
    Otherwise the BAM is downsampled to 1e7 reads (skipped for the "standard"
    analysis) and FastQC is executed inside a transactional temporary
    directory. When a combined ``<name>_fastqc.html`` file is produced, it
    and ``fastqc_data.txt`` are placed in ``fastqc_out``; otherwise the whole
    extracted ``<name>_fastqc`` directory replaces ``fastqc_out``. The
    downsampled BAM is deleted afterwards.

    :param bam_file: (str) path of the input BAM file
    :param data: (dict) sample information and configuration
    :param fastqc_out: (str) directory receiving the FastQC results

    :returns: summary produced by ``FastQCParser.get_fastqc_summary``
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = None
        if data.get("analysis", "").lower() not in ["standard"]:
            ds_bam = bam.downsample(bam_file, data, 1e7)
        if ds_bam:
            bam_file = ds_bam
        fastqc_name, _ = os.path.splitext(os.path.basename(bam_file))
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir, utils.chdir(tx_tmp_dir):
            fastqc = config_utils.get_program("fastqc", data["config"])
            cl = [fastqc, "-t", str(num_cores), "--extract",
                  "-o", tx_tmp_dir, "-f", "bam", bam_file]
            do.run(cl, "FastQC: %s" % data["name"][-1])
            tx_fastqc_out = os.path.join(tx_tmp_dir, "%s_fastqc" % fastqc_name)
            tx_combo_file = os.path.join(tx_tmp_dir, "%s_fastqc.html" % fastqc_name)
            # The zip archive is not kept; only the extracted files are used.
            tx_zip_file = "%s.zip" % tx_fastqc_out
            if os.path.exists(tx_zip_file):
                os.remove(tx_zip_file)
            if not os.path.exists(sentry_file):
                if os.path.exists(tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    shutil.copy(os.path.join(tx_fastqc_out, "fastqc_data.txt"), fastqc_out)
                    shutil.move(tx_combo_file, sentry_file)
                else:
                    # No combined HTML file: the extracted directory becomes
                    # the output, replacing any previous content.
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(tx_fastqc_out, fastqc_out)
        # Clean up the temporary downsampled BAM once FastQC is done with it.
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    return FastQCParser(fastqc_out).get_fastqc_summary()
Ejemplo n.º 21
0
def process_lane(item):
    """Prepare the input files of a lane, reducing the reads for a test run.

    The input files are looked up with ``get_fastq_files``. When
    ``item["test_run"]`` is truthy the input is downsampled to
    ``NUM_DOWNSAMPLE`` reads: BAM inputs go through ``bam.downsample`` (and
    the second file is reset to ``None``), any other input goes through
    ``fastq.downsample`` in quick mode.

    :param item: (dict) lane information; ``item["rgnames"]["lane"]`` is
                 required for logging and ``item["test_run"]`` is optional
    :returns: (list) ``[[item]]`` where ``item["files"]`` has been set to
              ``[file1, file2]``
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    file1, file2 = get_fastq_files(item)
    if item.get("test_run", False):
        if bam.is_bam(file1):
            # bam.downsample can hand back a falsy value instead of a path
            # (presumably when no downsampled file was produced -- confirm
            # against its implementation). Keep the original BAM in that case
            # rather than replacing the lane input with None.
            downsampled = bam.downsample(file1, item, NUM_DOWNSAMPLE)
            if downsampled:
                file1 = downsampled
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1, file2, item,
                                            NUM_DOWNSAMPLE, quick=True)
    item["files"] = [file1, file2]
    return [[item]]
Ejemplo n.º 22
0
def prepare_sample(data):
    """Prepare the input files of a sample, reducing the reads for a test run.

    The input files are looked up with ``get_fastq_files``. When
    ``data["test_run"]`` is truthy the input is downsampled to
    ``NUM_DOWNSAMPLE`` reads: BAM inputs go through ``bam.downsample`` (and
    the second file is reset to ``None``), any other input goes through
    ``fastq.downsample`` in quick mode.

    :param data: (dict) sample information; ``data["rgnames"]["sample"]`` is
                 required for logging and ``data["test_run"]`` is optional
    :returns: (list) ``[[data]]`` where ``data["files"]`` has been set to
              ``[file1, file2]``
    """
    NUM_DOWNSAMPLE = 10000
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    file1, file2 = get_fastq_files(data)
    if data.get("test_run", False):
        if bam.is_bam(file1):
            # bam.downsample can hand back a falsy value instead of a path
            # (presumably when no downsampled file was produced -- confirm
            # against its implementation). Keep the original BAM in that case
            # rather than replacing the sample input with None.
            downsampled = bam.downsample(file1, data, NUM_DOWNSAMPLE)
            if downsampled:
                file1 = downsampled
            file2 = None
        else:
            file1, file2 = fastq.downsample(file1,
                                            file2,
                                            data,
                                            NUM_DOWNSAMPLE,
                                            quick=True)
    data["files"] = [file1, file2]
    return [[data]]
Ejemplo n.º 23
0
def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap bamqc on a BAM file and parse the resulting report.

    Nothing is re-run when ``out_dir/qualimapReport.html`` already exists.
    Otherwise the BAM is downsampled to 1e7 reads (the full file is used when
    no downsampled file is returned) and ``qualimap bamqc`` is executed. The
    ``-gd`` option is added for HUMAN and MOUSE ensembl aliases, and ``-gff``
    when merged variant regions are available.

    :param bam_file: (str) path of the input BAM file
    :param data: (dict) sample information and configuration
    :param out_dir: (str) directory receiving the qualimap results

    :returns: result of ``_parse_qualimap_metrics`` on the HTML report
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if os.path.exists(report_file):
        return _parse_qualimap_metrics(report_file)

    downsampled = bam.downsample(bam_file, data, 1e7)
    if downsampled:
        bam_file = downsampled
    utils.safe_makedir(out_dir)
    config = data["config"]
    num_cores = config["algorithm"].get("num_cores", 1)
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         num_cores)
    # Collect the command template pieces and their values side by side.
    template = ["unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
                "-nt {num_cores} --java-mem-size={max_mem}"]
    values = {"qualimap": qualimap, "bam_file": bam_file, "out_dir": out_dir,
              "num_cores": num_cores, "max_mem": max_mem}
    species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
    if species in ["HUMAN", "MOUSE"]:
        template.append(" -gd {species}")
        values["species"] = species
    regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    if regions:
        values["bed6_regions"] = _bed_to_bed6(regions, out_dir)
        template.append(" -gff {bed6_regions}")
    do.run("".join(template).format(**values), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)