Example #1
0
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #2
0
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    if not analysis.startswith("smallrna-seq"):
        to_run.append("samtools")
        to_run.append("gemini")
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
Example #3
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"],
                                  data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(
                dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data),
                                                 data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1
                    and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(
                    dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #4
0
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([
            tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]
    ]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
        if dd.get_chip_method(data) == "atac":
            to_run.append("ataqv")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if dd.has_variantcalls(data):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["peddy"]
            if "contamination" not in dd.get_tools_off(data):
                to_run += ["contamination"]
        if vcfutils.get_paired_phenotype(data):
            if "viral" not in dd.get_tools_off(data):
                to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
Example #5
0
def parallel_multiplier(items):
    """Use more resources (up to available limits) if we have multiple QC samples/svcallers.
    """
    machines = []
    for data in (xs[0] for xs in items):
        machines.append(max(1, len(get_svcallers(data)), len(dd.get_algorithm_qc(data))))
    return sum(machines)
Example #6
0
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example #7
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #8
0
def parallel_multiplier(items):
    """Use more resources (up to available limits) if we have multiple QC samples/svcallers.
    """
    machines = []
    for data in (xs[0] for xs in items):
        machines.append(max(1, len(get_svcallers(data)), len(dd.get_algorithm_qc(data))))
    return sum(machines)
Example #9
0
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data)
                or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example #10
0
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses defaults if previously set.
    """
    if dd.get_algorithm_qc(data):
        return dd.get_algorithm_qc(data)
    analysis = data["analysis"].lower()
    to_run = []
    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any([tool in dd.get_tools_on(data)
            for tool in ["qualimap", "qualimap_full"]]):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq") or analysis == "smallrna-seq":
        if "qualimap" not in dd.get_tools_off(data):
            if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
                to_run.append("qualimap_rnaseq")
            else:
                logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
        to_run.append("atropos")
    if "coverage_qc" not in dd.get_tools_off(data):
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if "coverage_qc" not in dd.get_tools_off(data):
            to_run += ["coverage", "picard"]
        to_run += ["qsignature", "variants"]
        if vcfanno.is_human(data):
            to_run += ["contamination", "peddy"]
        if vcfutils.get_paired_phenotype(data):
            to_run += ["viral"]
        if damage.should_filter([data]):
            to_run += ["damage"]
    if dd.get_umi_consensus(data):
        to_run += ["umi"]
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")
    to_run = [tool for tool in to_run if tool not in dd.get_tools_off(data)]
    to_run.sort()
    return to_run
Example #11
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #12
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #13
0
def split_for_qc(data):
    """CWL: split an input sample into QC steps for parallel runs.
    """
    data = utils.to_single_data(data)
    to_analyze, extras = _split_samples_by_qc([data])
    out = []
    for data in to_analyze:
        data = utils.to_single_data(data)
        out.append({"cur_qc": dd.get_algorithm_qc(data)[0]})
    return out
Example #14
0
def remove_align_qc_tools(data):
    """Remove alignment based QC tools we don't need for data replicates.

    When we do multiple variant calling on a sample file (somatic/germline),
    avoid re-running QC.
    """
    align_qc = set(["qsignature", "coverage", "picard", "samtools", "fastqc"])
    data["config"]["algorithm"]["qc"] = [t for t in dd.get_algorithm_qc(data)
                                         if t not in align_qc]
    return data
Example #15
0
def remove_align_qc_tools(data):
    """Remove alignment based QC tools we don't need for data replicates.

    When we do multiple variant calling on a sample file (somatic/germline),
    avoid re-running QC.
    """
    align_qc = set(["qsignature", "coverage", "picard", "samtools", "fastqc"])
    data["config"]["algorithm"]["qc"] = [t for t in dd.get_algorithm_qc(data)
                                         if t not in align_qc]
    return data
Example #16
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    if data["analysis"].lower().startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif data["analysis"].lower().startswith("chip-seq"):
        work_bam = data["raw_bam"]
    elif not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data) is not None and work_bam:
        logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        data["summary"] = _run_qc_tools(work_bam, data)
        if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
            data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #17
0
def _get_input_files(samples, base_dir, tx_out_dir):
    """Retrieve input files, keyed by sample and QC method name.

    Stages files into the work directory to ensure correct names for
    MultiQC sample assessment when running with CWL.
    """
    in_files = collections.defaultdict(list)
    for data in samples:
        sum_qc = tz.get_in(["summary", "qc"], data, {})
        if sum_qc in [None, "None"]:
            sum_qc = {}
        elif isinstance(sum_qc, six.string_types):
            sum_qc = {dd.get_algorithm_qc(data)[0]: sum_qc}
        elif not isinstance(sum_qc, dict):
            raise ValueError("Unexpected summary qc: %s" % sum_qc)
        for program, pfiles in sum_qc.items():
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles.get("secondary", [])
            # CWL: presents output files as single file plus associated secondary files
            elif isinstance(pfiles, six.string_types):
                if os.path.exists(pfiles):
                    pfiles = [
                        os.path.join(basedir, f)
                        for basedir, subdir, filenames in os.walk(
                            os.path.dirname(pfiles)) for f in filenames
                    ]
                else:
                    pfiles = []
            in_files[(dd.get_sample_name(data), program)].extend(pfiles)
    staged_files = []
    for (sample, program), files in in_files.items():
        cur_dir = utils.safe_makedir(
            os.path.join(base_dir, "inputs", sample, program))
        for f in files:
            if _check_multiqc_input(f) and _is_good_file_for_multiqc(f):
                if _in_temp_directory(f) or any(
                    [cwlutils.is_cwl_run(d) for d in samples]):
                    staged_f = os.path.join(cur_dir, os.path.basename(f))
                    shutil.copy(f, staged_f)
                    staged_files.append(staged_f)
                else:
                    staged_files.append(f)
    staged_files.extend(get_qsig_multiqc_files(samples))
    # Back compatible -- to migrate to explicit specifications in input YAML
    if not any([cwlutils.is_cwl_run(d) for d in samples]):
        staged_files += ["trimmed", "htseq-count/*summary"]
        # Add in created target_info file
        if os.path.isfile(
                os.path.join(base_dir, "report", "metrics",
                             "target_info.yaml")):
            staged_files += [
                os.path.join(base_dir, "report", "metrics", "target_info.yaml")
            ]
    return sorted(list(set(staged_files)))
Example #18
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature,
                          qualimap, samtools, picard, srna, umi, variant,
                          viral)
    tools = {
        "fastqc": fastqc.run,
        "small-rna": srna.run,
        "samtools": samtools.run,
        "qualimap": qualimap.run,
        "qualimap_rnaseq": qualimap.run_rnaseq,
        "qsignature": qsignature.run,
        "coverage": coverage.run,
        "damage": damage.run,
        "variants": variant.run,
        "kraken": kraken.run,
        "picard": picard.run,
        "umi": umi.run,
        "viral": viral.run
    }
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            if "base" in out:
                if "metrics" in out:
                    metrics.update(out.pop("metrics"))
                qc_files = out
            else:
                metrics.update(out)
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #19
0
def pipeline_summary(data):
    """Provide summary information on processing sample.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    if dd.get_ref_file(data) is not None and work_bam and work_bam.endswith(".bam"):
        logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        if data["analysis"].lower().startswith("smallrna-seq"):
            work_bam = data["clean_fastq"]
        elif data["analysis"].lower().startswith("chip-seq"):
            work_bam = data["raw_bam"]
        data["summary"] = _run_qc_tools(work_bam, data)
    return [[data]]
Example #20
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    work_bam = data.get("align_bam")
    if data["analysis"].lower().startswith("smallrna-seq"):
        work_bam = data["clean_fastq"]
    elif data["analysis"].lower().startswith("chip-seq"):
        work_bam = data["raw_bam"]
    elif not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data) is not None and work_bam:
        logger.info(
            "QC: %s %s" %
            (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
        data["summary"] = _run_qc_tools(work_bam, data)
        if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
            data["summary"]["qc"] = data["summary"]["qc"].get(
                dd.get_algorithm_qc(data)[0])
    return [[data]]
Example #21
0
def _split_samples_by_qc(samples):
    """Split data into individual quality control steps for a run.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        qcs = dd.get_algorithm_qc(data)
        if not dd.get_align_bam(data) or not qcs:
            extras.append([data])
        else:
            for qc in qcs:
                add = copy.deepcopy(data)
                add["config"]["algorithm"]["qc"] = [qc]
                to_process.append([add])
    return to_process, extras
Example #22
0
def _split_samples_by_qc(samples):
    """Split data into individual quality control steps for a run.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        qcs = dd.get_algorithm_qc(data)
        if not dd.get_align_bam(data) or not qcs:
            extras.append([data])
        else:
            for qc in qcs:
                add = copy.deepcopy(data)
                add["config"]["algorithm"]["qc"] = [qc]
                to_process.append([add])
    return to_process, extras
Example #23
0
def _split_samples_by_qc(samples):
    """Split data into individual quality control steps for a run.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        qcs = dd.get_algorithm_qc(data)
        # kraken doesn't need bam
        if qcs and (dd.get_align_bam(data) or dd.get_work_bam(data)
                    or tz.get_in(["config", "algorithm", "kraken"], data)):
            for qc in qcs:
                add = copy.deepcopy(data)
                add["config"]["algorithm"]["qc"] = [qc]
                to_process.append([add])
        else:
            extras.append([data])
    return to_process, extras
Example #24
0
def _split_samples_by_qc(samples):
    """Split data into individual quality control steps for a run.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        qcs = dd.get_algorithm_qc(data)
        # kraken doesn't need bam
        if qcs and (dd.get_align_bam(data) or dd.get_work_bam(data) or
                    tz.get_in(["config", "algorithm", "kraken"], data)):
            for qc in qcs:
                add = copy.deepcopy(data)
                add["config"]["algorithm"]["qc"] = [qc]
                to_process.append([add])
        else:
            extras.append([data])
    return to_process, extras
Example #25
0
def _get_input_files(samples, base_dir, tx_out_dir):
    """Retrieve input files, keyed by sample and QC method name.

    Stages files into the work directory to ensure correct names for
    MultiQC sample assessment when running with CWL.
    """
    in_files = collections.defaultdict(list)
    for data in samples:
        sum_qc = tz.get_in(["summary", "qc"], data, {})
        if sum_qc in [None, "None"]:
            sum_qc = {}
        elif isinstance(sum_qc, six.string_types):
            sum_qc = {dd.get_algorithm_qc(data)[0]: sum_qc}
        elif not isinstance(sum_qc, dict):
            raise ValueError("Unexpected summary qc: %s" % sum_qc)
        for program, pfiles in sum_qc.items():
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles.get("secondary", [])
            # CWL: presents output files as single file plus associated secondary files
            elif isinstance(pfiles, six.string_types):
                if os.path.exists(pfiles):
                    pfiles = [os.path.join(basedir, f) for basedir, subdir, filenames in os.walk(os.path.dirname(pfiles)) for f in filenames]
                else:
                    pfiles = []
            in_files[(dd.get_sample_name(data), program)].extend(pfiles)
    staged_files = []
    for (sample, program), files in in_files.items():
        cur_dir = utils.safe_makedir(os.path.join(base_dir, "inputs", sample, program))
        for f in files:
            if _check_multiqc_input(f) and _is_good_file_for_multiqc(f):
                if _in_temp_directory(f) or any([cwlutils.is_cwl_run(d) for d in samples]):
                    staged_f = os.path.join(cur_dir, os.path.basename(f))
                    shutil.copy(f, staged_f)
                    staged_files.append(staged_f)
                else:
                    staged_files.append(f)
    staged_files.extend(get_qsig_multiqc_files(samples))
    # Back compatible -- to migrate to explicit specifications in input YAML
    if not any([cwlutils.is_cwl_run(d) for d in samples]):
        staged_files += ["trimmed", "htseq-count/*summary"]
        # Add in created target_info file
        if os.path.isfile(os.path.join(base_dir, "report", "metrics", "target_info.yaml")):
            staged_files += [os.path.join(base_dir, "report", "metrics", "target_info.yaml")]
    return sorted(list(set(staged_files)))
Example #26
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": _run_coverage_qc,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run}
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            if "base" in out:
                qc_files = out
            else:
                metrics.update(out)
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}