def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in (tz.get_in(hla_key, data) or []) if x and x != "None"]
    merged_hlas = None
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_file in utils.flatten(hla_sample_files):
            rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
            if rehla:
                hlatype = rehla.group("hlatype")
                out_files[hlatype].append(hla_file)
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
    data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    return data
def _get_output_cwl_keys(fnargs):
    """Retrieve output_cwl_keys from potentially nested input arguments.
    """
    for d in utils.flatten(fnargs):
        if isinstance(d, dict) and d.get("output_cwl_keys"):
            return d["output_cwl_keys"]
    raise ValueError("Did not find output_cwl_keys in %s" % (pprint.pformat(fnargs)))
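# Hedged usage sketch (not part of the original module): the nested fnargs below is
# invented example data, showing that the first dict carrying "output_cwl_keys" wins
# no matter how deeply the arguments are nested before flattening.
#
#   fnargs = [[{"description": "sample1"}],
#             [{"description": "sample2", "output_cwl_keys": ["align_bam", "summary__qc"]}]]
#   _get_output_cwl_keys(fnargs)  # -> ["align_bam", "summary__qc"]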
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    if os.path.basename(f).startswith(("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    out = {"sv": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "sv", "calls")),
                                            "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    return [out]
def _write_wdl_outputs(argfile, out_keys):
    """Write variables as WDL compatible output files.

    Writes individual files prefixed with 'wdl.output' that can be read by
    WDL standard library functions:

    https://github.com/broadinstitute/wdl/blob/develop/SPEC.md#outputs
    """
    out_basename = "wdl.output.%s.txt"
    with open(argfile) as in_handle:
        outputs = json.load(in_handle)
    record_name, record_attrs = _get_record_attrs(out_keys)
    if record_name:
        recs = outputs[record_name]
        with open(out_basename % record_name, "w") as out_handle:
            writer = csv.writer(out_handle)
            if not isinstance(recs, (list, tuple)):
                recs = [recs]
            recs = list(utils.flatten(recs))
            keys = sorted(list(set(reduce(operator.add, [r.keys() for r in recs]))))
            writer.writerow(keys)
            for rec in recs:
                writer.writerow([_cwlvar_to_wdl(rec.get(k)) for k in keys])
    else:
        for key in out_keys:
            with open(out_basename % key, "w") as out_handle:
                vals = _cwlvar_to_wdl(outputs.get(key))
                if not isinstance(vals, (list, tuple)):
                    vals = [vals]
                for val in vals:
                    if isinstance(val, (list, tuple)):
                        val = "\t".join([str(x) for x in val])
                    out_handle.write(str(val) + "\n")
def _get_vcf_samples(calls, items):
    have_full_file = False
    all_samples = set([])
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for data in items:
                for i, test_name in enumerate([dd.get_sample_name(data)] + dd.get_batches(data)):
                    if os.path.basename(f).startswith(("%s-" % test_name, "%s." % test_name)):
                        # Prefer matches to single samples (gVCF) over joint batches
                        if i == 0:
                            sample_matches = True
                        if sample_matches and i > 0:
                            continue
                        else:
                            all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and
    CWL pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    validated, out = _group_validate_samples(samples, vkey)
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(vitems, key=lambda x: x.get("lane", dd.get_sample_name(x))):
                validations = [variant.get(vkey) for variant in data.get("variants", [])]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get("summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
def test_disambiguate(self):
    in_files = self.config["input_bamdiff"]
    disambiguate = sam.Disambiguate(self.config)
    output = list(flatten(disambiguate(in_files)))
    # Wrap map in list() so the md5 comparison works on Python 3, where map
    # returns an iterator; behavior is unchanged on Python 2.
    out_md5 = list(map(self._get_md5, output))
    correct_files = self._correct_files(output)
    correct_md5 = list(map(self._get_md5, correct_files))
    self.assertTrue(out_md5 == correct_md5)
def _get_vcf_samples(calls):
    all_samples = set([])
    for f in utils.flatten(calls):
        cur = set(vcfutils.get_samples(f))
        if cur:
            if not all_samples:
                all_samples = cur
            else:
                all_samples &= set(cur)
    return list(all_samples)
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
    return [[data] for data in samples]
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and
    CWL pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    _summarize_combined(samples, vkey)
    validated, out = _group_validate_samples(samples, vkey,
                                             (["metadata", "validate_batch"], ["metadata", "batch"],
                                              ["description"]))
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(vitems, key=lambda x: x.get("lane", dd.get_sample_name(x)) or ""):
                validations = [variant.get(vkey) for variant in data.get("variants", [])
                               if isinstance(variant, dict)]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get("summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if isinstance(variant, dict) and variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files += glob.glob(os.path.join(out_dir, "multiqc_config.yaml"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def _get_vcf_samples(calls, data):
    have_full_file = False
    all_samples = set([])
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if not all_samples:
                    all_samples = cur
                else:
                    all_samples &= set(cur)
        else:
            for test_name in [dd.get_sample_name(data)] + dd.get_batches(data):
                if os.path.basename(f).startswith("%s-" % test_name):
                    all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
def cufflinks_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def _merge_metadata(samples):
    """Merge all metadata into CSV file"""
    samples = list(utils.flatten(samples))
    out_dir = dd.get_work_dir(samples[0])
    logger.info("summarize metadata")
    out_file = os.path.join(out_dir, "metadata.csv")
    sample_metrics = collections.defaultdict(dict)
    for s in samples:
        m = tz.get_in(['metadata'], s)
        if isinstance(m, six.string_types):
            m = json.loads(m)
        if m:
            for me in list(m.keys()):
                if isinstance(m[me], list) or isinstance(m[me], dict) or isinstance(m[me], tuple):
                    m.pop(me, None)
            sample_metrics[dd.get_sample_name(s)].update(m)
    pd.DataFrame(sample_metrics).transpose().to_csv(out_file)
    return out_file
def _get_coverage_per_region(name):
    """Parse coverage file if it exists to get average value.
    """
    fns = tz.get_in(["summary", "qc", "coverage"], name, {})
    if fns:
        fns = utils.flatten(fns.values())
        fn = [fn for fn in fns if fn.find("coverage_fixed.bed") > -1]
        if fn:
            fn = fn[0]
            if utils.file_exists(fn):
                logger.debug("Reading meanCoverage for: %s" % fn)
                try:
                    dt = pd.read_csv(fn, sep="\t", index_col=False)
                    if "meanCoverage" in dt:
                        if len(dt["meanCoverage"]) > 0:
                            return "%.3f" % (sum(map(float, dt['meanCoverage'])) / len(dt['meanCoverage']))
                except TypeError:
                    logger.debug("%s has no lines in coverage.bed" % name)
    return "NA"
def main(config_file, fastq_dir):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    barcode_info = config["barcodes"]
    print("Processing %s." % fastq_dir)
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print("Found %s in %s." % (in_files, fastq_dir))
    print("Combining paired-end files, if found.")
    pairs = combine_pairs(in_files)
    print("Calculated pairs: %s." % pairs)
    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print("Detected barcode: %s" % barcode)
        if barcode not in barcode_info.keys():
            print("barcode %s not found in the YAML file, skipping." % barcode)
            continue
        print("Sample ID: %s" % barcode_info[barcode][0])
        type = barcode_info[barcode][1]
        print("Sample type: %s" % barcode_info[barcode][1])
        to_trim = config["to_trim"][type]
        cutadapt_dir = "cutadapt"
        print("Trimming off %s and any bases before it from %s." % (to_trim[0], pair[0]))
        out_dir = os.path.join(cutadapt_dir, os.path.basename(pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s." % (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for pair in out_files:
        if len(pair) > 1:
            filter_reads_by_length(pair[0], pair[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(pair[0], "fastq-sanger")
def update_summary_qc(data, key, base=None, secondary=None):
    """Update summary_qc, keyed by key.

    key is generally the program the quality control metrics came from. If key
    already exists, the specified base/secondary files are added as secondary
    files to the existing key, removing duplicates.

    Stick files into summary_qc if you want them propagated forward and
    available for multiqc.
    """
    summary = deepish_copy(get_summary_qc(data, {}))
    files = [[base], [secondary],
             tz.get_in([key, "base"], summary, []),
             tz.get_in([key, "secondary"], summary, [])]
    files = list(set([x for x in flatten(files) if x]))
    base = tz.first(files)
    secondary = list(tz.drop(1, files))
    if base and secondary:
        summary[key] = {"base": base, "secondary": secondary}
    elif base:
        summary[key] = {"base": base}
    data = set_summary_qc(data, summary)
    return data
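# Hedged usage sketch (hypothetical file paths; assumes the get_summary_qc/set_summary_qc
# accessors this function relies on are available in the same module): repeated calls with
# the same key fold new files into "secondary", deduplicating via the flatten/set round trip.
#
#   data = update_summary_qc(data, "samtools",
#                            base="qc/sample1/samtools_stats.txt",
#                            secondary=["qc/sample1/samtools_idxstats.txt"])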
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)
        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
        if any([cwlutils.is_cwl_run(d) for d in samples]):
            for indir in ["inputs", "report"]:
                tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                if not utils.file_exists(tarball):
                    with utils.chdir(out_dir):
                        cmd = ["tar", "-czvpf", tarball, indir]
                        do.run(cmd, "Compress multiqc inputs: %s" % indir)
                samples[0]["summary"]["multiqc"]["secondary"].append(tarball)
    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)
    return [[data] for data in samples]
def _get_rv_adapters(data):
    builtin = [RV_ADAPTERS[x] for x in dd.get_adapters(data) if x in FW_ADAPTERS]
    return flatten(builtin + dd.get_custom_trim(data))
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = [cwlutils.unpack_tarballs(utils.deepish_copy(x), x) for x in samples]
    work_samples = _report_summary(work_samples, os.path.join(out_dir, "report"))
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    path_export = utils.local_path_export()
                    cmd = "{path_export}{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out}"
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(_group_by_samplename(samples)):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.yaml"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                data_files.append(file_list)
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
                file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
                if file_list_final:
                    data["summary"]["multiqc"]["secondary"].append(file_list_final)
        out.append([data])
    return out