Example #1
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [
        x for x in (tz.get_in(hla_key, data) or []) if x and x != "None"
    ]
    merged_hlas = None
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_file in utils.flatten(hla_sample_files):
            rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
            if rehla:
                hlatype = rehla.group("hlatype")
                out_files[hlatype].append(hla_file)
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "align",
                             dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(
                    hla_outdir,
                    "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files],
                                         out_file, data)
                merged_hlas.append(out_file)
    data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    return data
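All of the examples on this page lean on a flatten helper (bcbio's utils.flatten, or the bare flatten imported in later snippets). As a rough sketch, assuming the helper simply collapses arbitrarily nested lists and tuples into one flat list, it behaves roughly like the following (the name flatten_sketch is illustrative, not bcbio's actual implementation):

def flatten_sketch(items):
    """Recursively collapse nested lists/tuples into a single flat list."""
    out = []
    for x in items:
        if isinstance(x, (list, tuple)):
            out.extend(flatten_sketch(x))
        else:
            out.append(x)
    return out

# flatten_sketch([["a.fq"], [["b.fq", "c.fq"]]]) -> ["a.fq", "b.fq", "c.fq"]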
Example #2
def _get_output_cwl_keys(fnargs):
    """Retrieve output_cwl_keys from potentially nested input arguments.
    """
    for d in utils.flatten(fnargs):
        if isinstance(d, dict) and d.get("output_cwl_keys"):
            return d["output_cwl_keys"]
    raise ValueError("Did not find output_cwl_keys in %s" % (pprint.pformat(fnargs)))
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] +
                                              dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(
                        ("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #4
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    out = {"sv": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(
                        utils.safe_makedir(
                            os.path.join(dd.get_work_dir(data), "sv",
                                         "calls")), "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(
                        out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    return [out]
Example #5
def _write_wdl_outputs(argfile, out_keys):
    """Write variables as WDL compatible output files.

    Writes individual files prefixed with 'wdl.output' that can be read
    by WDL standard library functions:

    https://github.com/broadinstitute/wdl/blob/develop/SPEC.md#outputs
    """
    out_basename = "wdl.output.%s.txt"
    with open(argfile) as in_handle:
        outputs = json.load(in_handle)
    record_name, record_attrs = _get_record_attrs(out_keys)
    if record_name:
        recs = outputs[record_name]
        with open(out_basename % record_name, "w") as out_handle:
            writer = csv.writer(out_handle)
            if not isinstance(recs, (list, tuple)):
                recs = [recs]
            recs = list(utils.flatten(recs))
            keys = sorted(list(set(reduce(operator.add, [r.keys() for r in recs]))))
            writer.writerow(keys)
            for rec in recs:
                writer.writerow([_cwlvar_to_wdl(rec.get(k)) for k in keys])
    else:
        for key in out_keys:
            with open(out_basename % key, "w") as out_handle:
                vals = _cwlvar_to_wdl(outputs.get(key))
                if not isinstance(vals, (list, tuple)):
                    vals = [vals]
                for val in vals:
                    if isinstance(val, (list, tuple)):
                        val = "\t".join([str(x) for x in val])
                    out_handle.write(str(val) + "\n")
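In the non-record branch above, each value is written on its own line and nested lists are tab-joined, so WDL's read_lines/read_tsv functions can parse the result. A self-contained sketch of just that formatting loop, using made-up values:

vals = ["sample1.vcf.gz", ["reads.bam", "reads.bam.bai"]]  # hypothetical values
with open("wdl.output.example.txt", "w") as out_handle:
    for val in vals:
        if isinstance(val, (list, tuple)):
            val = "\t".join(str(x) for x in val)
        out_handle.write(str(val) + "\n")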
Example #6
def _get_output_cwl_keys(fnargs):
    """Retrieve output_cwl_keys from potentially nested input arguments.
    """
    for d in utils.flatten(fnargs):
        if isinstance(d, dict) and d.get("output_cwl_keys"):
            return d["output_cwl_keys"]
    raise ValueError("Did not find output_cwl_keys in %s" % (pprint.pformat(fnargs)))
Example #7
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] +
                                              dd.get_batches(data)):
                    if os.path.basename(f).startswith(
                        ("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #8
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name,
                                                       "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #9
def _write_wdl_outputs(argfile, out_keys):
    """Write variables as WDL compatible output files.

    Writes individual files prefixed with 'wdl.output' that can be read
    by WDL standard library functions:

    https://github.com/broadinstitute/wdl/blob/develop/SPEC.md#outputs
    """
    out_basename = "wdl.output.%s.txt"
    with open(argfile) as in_handle:
        outputs = json.load(in_handle)
    record_name, record_attrs = _get_record_attrs(out_keys)
    if record_name:
        recs = outputs[record_name]
        with open(out_basename % record_name, "w") as out_handle:
            writer = csv.writer(out_handle)
            if not isinstance(recs, (list, tuple)):
                recs = [recs]
            recs = list(utils.flatten(recs))
            keys = sorted(list(set(reduce(operator.add, [r.keys() for r in recs]))))
            writer.writerow(keys)
            for rec in recs:
                writer.writerow([_cwlvar_to_wdl(rec.get(k)) for k in keys])
    else:
        for key in out_keys:
            with open(out_basename % key, "w") as out_handle:
                vals = _cwlvar_to_wdl(outputs.get(key))
                if not isinstance(vals, (list, tuple)):
                    vals = [vals]
                for val in vals:
                    if isinstance(val, (list, tuple)):
                        val = "\t".join([str(x) for x in val])
                    out_handle.write(str(val) + "\n")
Example #10
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and CWL
    pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(
        os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    validated, out = _group_validate_samples(samples, vkey)
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(
                    vitems,
                    key=lambda x: x.get("lane", dd.get_sample_name(x))):
                validations = [
                    variant.get(vkey) for variant in data.get("variants", [])
                ]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(
                                    validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get(
                                "summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(
                                    list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(
                plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
Example #11
def test_disambiguate(self):
    in_files = self.config["input_bamdiff"]
    disambiguate = sam.Disambiguate(self.config)
    output = list(flatten(disambiguate(in_files)))
    out_md5 = map(self._get_md5, output)
    correct_files = self._correct_files(output)
    correct_md5 = map(self._get_md5, correct_files)
    self.assertTrue(out_md5 == correct_md5)
Example #12
def _get_vcf_samples(calls):
    all_samples = set([])
    for f in utils.flatten(calls):
        cur = set(vcfutils.get_samples(f))
        if cur:
            if not all_samples:
                all_samples = cur
            else:
                all_samples &= set(cur)
    return list(all_samples)
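The pattern above keeps only samples present in every VCF by intersecting the per-file sample sets. A minimal standalone illustration, with plain sets standing in for vcfutils.get_samples() results:

per_file_samples = [{"NA12878", "NA12891"}, {"NA12878", "NA24385"}]  # stand-in data
all_samples = set()
for cur in per_file_samples:
    if cur:
        if not all_samples:
            all_samples = cur
        else:
            all_samples &= cur
print(sorted(all_samples))  # ['NA12878']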
Example #13
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)

    return [[data] for data in samples]
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and CWL
    pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    _summarize_combined(samples, vkey)
    validated, out = _group_validate_samples(samples, vkey,
                                             (["metadata", "validate_batch"], ["metadata", "batch"], ["description"]))
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(vitems, key=lambda x: x.get("lane", dd.get_sample_name(x)) or ""):
                validations = [variant.get(vkey) for variant in data.get("variants", [])
                               if isinstance(variant, dict)]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get("summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if isinstance(variant, dict) and variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
Example #15
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {
        "validate": validate.combine_validations(items),
        "variants": {
            "calls": [],
            "gvcf": [],
            "samples": []
        }
    }
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(
                data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(
                    utils.safe_makedir(
                        os.path.join(dd.get_work_dir(data), "variants",
                                     out_key)), "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
Example #16
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files += glob.glob(os.path.join(out_dir, "multiqc_config.yaml"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
Example #17
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #18
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #19
def _get_vcf_samples(calls, data):
    have_full_file = False
    all_samples = set([])
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for test_name in [dd.get_sample_name(data)] + dd.get_batches(data):
                if os.path.basename(f).startswith("%s-" % test_name):
                    all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #20
def cufflinks_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores,
                                 samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #21
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def _merge_metadata(samples):
    """Merge all metadata into CSV file"""
    samples = list(utils.flatten(samples))
    out_dir = dd.get_work_dir(samples[0])
    logger.info("summarize metadata")
    out_file = os.path.join(out_dir, "metadata.csv")
    sample_metrics = collections.defaultdict(dict)
    for s in samples:
        m = tz.get_in(['metadata'], s)
        if isinstance(m, six.string_types):
            m = json.loads(m)
        if m:
            for me in list(m.keys()):
                if isinstance(m[me], list) or isinstance(m[me], dict) or isinstance(m[me], tuple):
                    m.pop(me, None)
            sample_metrics[dd.get_sample_name(s)].update(m)
    pd.DataFrame(sample_metrics).transpose().to_csv(out_file)
    return out_file
Example #23
def _merge_metadata(samples):
    """Merge all metadata into CSV file"""
    samples = list(utils.flatten(samples))
    out_dir = dd.get_work_dir(samples[0])
    logger.info("summarize metadata")
    out_file = os.path.join(out_dir, "metadata.csv")
    sample_metrics = collections.defaultdict(dict)
    for s in samples:
        m = tz.get_in(['metadata'], s)
        if isinstance(m, six.string_types):
            m = json.loads(m)
        if m:
            for me in list(m.keys()):
                if isinstance(m[me], list) or isinstance(m[me], dict) or isinstance(m[me], tuple):
                    m.pop(me, None)
            sample_metrics[dd.get_sample_name(s)].update(m)
    pd.DataFrame(sample_metrics).transpose().to_csv(out_file)
    return out_file
Example #24
def _get_coverage_per_region(name):
    """
    Parse coverage file if it exists to get average value.
    """
    fns = tz.get_in(["summary", "qc", "coverage"], name, {})
    if fns:
        fns = utils.flatten(fns.values())
        fn = [fn for fn in fns if fn.find("coverage_fixed.bed") > -1]
        if fn:
            fn = fn[0]
            if utils.file_exists(fn):
                logger.debug("Reading meanCoverage for: %s" % fn)
                try:
                    dt = pd.read_csv(fn, sep="\t", index_col=False)
                    if "meanCoverage" in dt:
                        if len(dt["meanCoverage"]) > 0:
                            return "%.3f" % (sum(map(float, dt['meanCoverage'])) / len(dt['meanCoverage']))
                except TypeError:
                    logger.debug("%s has no lines in coverage.bed" % name)
    return "NA"
Example #25
def _get_coverage_per_region(name):
    """
    Parse coverage file if it exists to get average value.
    """
    fns = tz.get_in(["summary", "qc", "coverage"], name, {})
    if fns:
        fns = utils.flatten(fns.values())
        fn = [fn for fn in fns if fn.find("coverage_fixed.bed") > -1]
        if fn:
            fn = fn[0]
            if utils.file_exists(fn):
                logger.debug("Reading meanCoverage for: %s" % fn)
                try:
                    dt = pd.read_csv(fn, sep="\t", index_col=False)
                    if "meanCoverage" in dt:
                        if len(dt["meanCoverage"]) > 0:
                            return "%.3f" % (
                                sum(map(float, dt['meanCoverage'])) /
                                len(dt['meanCoverage']))
                except TypeError:
                    logger.debug("%s has no lines in coverage.bed" % name)
    return "NA"
Example #26
def main(config_file, fastq_dir):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    barcode_info = config["barcodes"]
    print "Processing %s." % (fastq_dir)
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print "Found %s in %s. " % (in_files, fastq_dir)
    print "Combining paired-end files, if found."
    pairs = combine_pairs(in_files)
    print "Calulcated pairs: %s." % (pairs)
    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print "Detected barcode: %s" % barcode
        if barcode not in barcode_info.keys():
            print "barcode %s not found in the YAML file, skipping." % (
                barcode)
            continue
        print "Sample ID: %s" % (barcode_info[barcode][0])
        type = barcode_info[barcode][1]
        print "Sample type: %s" % (barcode_info[barcode][1])
        to_trim = config["to_trim"][type]
        cutadapt_dir = "cutadapt"
        print("Trimming off %s and any bases before it from %s." %
              (to_trim[0], pair[0]))
        out_dir = os.path.join(cutadapt_dir, os.path.basename(pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s." %
                  (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for pair in out_files:
        if len(pair) > 1:
            filter_reads_by_length(pair[0], pair[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(pair[0], "fastq-sanger")
Example #27
def main(config_file, fastq_dir):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    barcode_info = config["barcodes"]
    print "Processing %s." % (fastq_dir)
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print "Found %s in %s. " % (in_files, fastq_dir)
    print "Combining paired-end files, if found."
    pairs = combine_pairs(in_files)
    print "Calulcated pairs: %s." % (pairs)
    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print "Detected barcode: %s" % barcode
        if barcode not in barcode_info.keys():
            print "barcode %s not found in the YAML file, skipping." % (barcode)
            continue
        print "Sample ID: %s" % (barcode_info[barcode][0])
        type = barcode_info[barcode][1]
        print "Sample type: %s" % (barcode_info[barcode][1])
        to_trim = config["to_trim"][type]
        cutadapt_dir = "cutadapt"
        print ("Trimming off %s and any bases before it from %s."
               % (to_trim[0], pair[0]))
        out_dir = os.path.join(cutadapt_dir, os.path.basename(pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print ("Trimming off %s and any bases before it from %s."
                   % (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for pair in out_files:
        if len(pair) > 1:
            filter_reads_by_length(pair[0], pair[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(pair[0], "fastq-sanger")
Example #28
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in (tz.get_in(hla_key, data) or []) if x and x != "None"]
    merged_hlas = None
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_file in utils.flatten(hla_sample_files):
            rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
            if rehla:
                hlatype = rehla.group("hlatype")
                out_files[hlatype].append(hla_file)
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
    data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    return data
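The HLA type is pulled out of each FASTQ name with the named regex group above. A quick check of that pattern against a hypothetical filename:

import re

hla_file = "S1.hla.HLA-A.fq"  # hypothetical input filename
m = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
if m:
    print(m.group("hlatype"))  # HLA-A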
def update_summary_qc(data, key, base=None, secondary=None):
    """
    updates summary_qc, keyed by key. key is generally the program the quality
    control metrics came from. if key already exists, the specified
    base/secondary files are added as secondary files to the existing
    key, removing duplicates.

    stick files into summary_qc if you want them propagated forward
    and available for multiqc
    """
    summary = deepish_copy(get_summary_qc(data, {}))
    files = [[base], [secondary],
             tz.get_in([key, "base"], summary, []),
             tz.get_in([key, "secondary"], summary, [])]
    files = list(set([x for x in flatten(files) if x]))
    base = tz.first(files)
    secondary = list(tz.drop(1, files))
    if base and secondary:
        summary[key] = {"base": base, "secondary": secondary}
    elif base:
        summary[key] = {"base": base}
    data = set_summary_qc(data, summary)
    return data
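The base/secondary split in update_summary_qc comes from toolz's first/drop over a de-duplicated, flattened file list. A small sketch of that slicing step on its own (toolz imported explicitly here; the snippet above assumes it is already available as tz):

import toolz as tz

files = sorted(set(x for x in ["qc.txt", None, "qc.png", "qc.txt"] if x))  # illustrative inputs
base = tz.first(files)               # first entry becomes the "base" file
secondary = list(tz.drop(1, files))  # the rest become "secondary" files
# base == "qc.png", secondary == ["qc.txt"]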
Example #30
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
Example #31
def _get_rv_adapters(data):
    builtin = [RV_ADAPTERS[x] for x in dd.get_adapters(data) if x in FW_ADAPTERS]
    return flatten(builtin + dd.get_custom_trim(data))
Example #32
def _get_rv_adapters(data):
    builtin = [
        RV_ADAPTERS[x] for x in dd.get_adapters(data) if x in FW_ADAPTERS
    ]
    return flatten(builtin + dd.get_custom_trim(data))
Example #33
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [
        cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples
    ]
    work_samples = _report_summary(work_samples,
                                   os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(
                            samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report",
                                                     "*.R*"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
                file_list_final = _save_uploaded_file_list(
                    samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(
                        file_list_final)
        out.append([data])
    return out