def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    cfg_file = os.path.join(out_dir, "multiqc_config.yaml")
    cfg = {"table_columns_visible": {}}
    # Qualimap reports insert size and GC content itself; hide the
    # duplicated bcbio/FastQC columns when any sample runs qualimap.
    uses_qualimap = any("qualimap" in dd.get_tools_on(d)
                        or "qualimap_full" in dd.get_tools_on(d)
                        for d in samples)
    if uses_qualimap:
        cfg["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        cfg["table_columns_visible"]["FastQC"] = {"percent_gc": False}
    cfg["bcftools"] = {"write_separate_table": True}
    # Fixed leading modules, then bcftools (split if germline calling ran),
    # then the remaining QC tools.
    module_order = ["bcbio", "samtools", "goleft_indexcov", "peddy"]
    has_germline = any(
        "germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
        or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
        for s in samples)
    if has_germline:
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.append({"bcftools": {
            "name": "Bcftools (somatic)",
            "info": "Bcftools stats for somatic variant calls only.",
            "path_filters": ["*_bcftools_stats.txt"],
            "write_general_stats": True,
        }})
        module_order.append({"bcftools": {
            "name": "Bcftools (germline)",
            "info": "Bcftools stats for germline variant calls only.",
            "path_filters": ["*_bcftools_stats_germline.txt"],
            "write_general_stats": False,
        }})
    else:
        module_order.append("bcftools")
    module_order += ["picard", "qualimap", "snpeff", "fastqc", "preseq"]
    cfg["module_order"] = module_order
    preseq_samples = [s for s in samples
                      if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        cfg["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(cfg_file, "w") as handle:
        yaml.safe_dump(cfg, handle, default_flow_style=False, allow_unicode=False)
    return cfg_file
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    conf = {"table_columns_visible": {}}
    # bcbio and FastQC duplicate these metrics when qualimap also reports them.
    if any("qualimap" in dd.get_tools_on(s) or "qualimap_full" in dd.get_tools_on(s)
           for s in samples):
        conf["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        conf["table_columns_visible"]["FastQC"] = {"percent_gc": False}
    conf["bcftools"] = {"write_separate_table": True}
    # When germline calling was performed (tumor-only somatic with germline
    # extraction, or paired somatic with germline calling for the normal),
    # split somatic and germline variant stats into separate multiqc
    # submodules: somatic goes into General Stats, germline into its own table.
    germline_run = any("germline" in (get_active_vcinfo(s) or {})
                       or dd.get_phenotype(s) == "germline"
                       for s in samples)
    if germline_run:
        bcftools_entries = [
            {"bcftools": {"name": "Bcftools (somatic)",
                          "info": "Bcftools stats for somatic variant calls only.",
                          "path_filters": ["*_bcftools_stats.txt"],
                          "write_general_stats": True}},
            {"bcftools": {"name": "Bcftools (germline)",
                          "info": "Bcftools stats for germline variant calls only.",
                          "path_filters": ["*_bcftools_stats_germline.txt"],
                          "write_general_stats": False}},
        ]
    else:
        bcftools_entries = ["bcftools"]
    conf["module_order"] = (["bcbio", "samtools", "goleft_indexcov"]
                            + bcftools_entries
                            + ["picard", "qualimap", "snpeff", "fastqc", "preseq"])
    preseq_samples = [s for s in samples
                      if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        conf["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(conf, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run_peddy(samples, out_dir=None):
    """Run peddy correspondence checking (ancestry/sex/relatedness) for a batch.

    Locates a called VCF containing the sample, runs peddy against it and the
    generated PED file, and attaches the expected peddy output files to each
    sample's summary QC. Returns the (possibly updated) list of samples.
    Skips quietly when peddy is not installed, the sample is not human, or no
    matching VCF is available.
    """
    # Find an active VCF with calls that actually includes this sample.
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    # A previous run already failed in an expected way; do not retry.
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)
    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            # Keep only the tail of the potentially huge stderr log.
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            # peddy raises this IndexError when no variants overlap its 1kg
            # check sites; record the failure so later runs skip instead of retrying.
            if any(l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0
                   for l in to_show):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def run(bam_file, data, out_dir):
    """Summarize DKFZ bias (damage) filtering for the sample's active variant calls.

    Writes a <sample>-damage.yaml summary into out_dir when the active VCF has
    variants. Returns {"base": out_file} on success, otherwise an empty dict.
    """
    out = {}
    vcinfo = variant.get_active_vcinfo(data)
    # Use .get to avoid a KeyError when vcinfo exists but lacks "vrn_file"
    # (previously indexed vcinfo["vrn_file"] directly).
    if vcinfo and vcinfo.get("vrn_file") and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ["dkfzbiasfilter_summarize.py",
                       "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file,
                       vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
def run(bam_file, data, out_dir):
    """Summarize DKFZ bias (damage) filtering for the sample's active variant calls.

    Resolves the dkfzbiasfilter_summarize.py helper via the run configuration
    and writes a <sample>-damage.yaml summary into out_dir when the active
    (non-ensemble) VCF has variants. Returns {"base": out_file} on success,
    otherwise an empty dict.
    """
    out = {}
    vcinfo = variant.get_active_vcinfo(data, use_ensemble=False)
    dkfzbiasfilter = config_utils.get_program("dkfzbiasfilter_summarize.py", data)
    # Use .get to avoid a KeyError when vcinfo exists but lacks "vrn_file"
    # (previously indexed vcinfo["vrn_file"] directly).
    if vcinfo and vcinfo.get("vrn_file") and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [dkfzbiasfilter,
                       "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file,
                       vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.

    :param out_dir: directory where multiqc_config.yaml is written
    :param samples: list of bcbio sample dictionaries for this run
    :returns: path to the written MultiQC configuration file
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples]
        avg_depths = [x for x in avg_depths if x]  # drop samples without a computed average depth
        # Picking all thresholds up to the highest sample average depth
        thresholds = [t for t in coverage.DEPTH_THRESHOLDS if not avg_depths or t <= max(avg_depths)]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Showing only thresholds surrounding any of average depths:
        # threshold t stays visible only when some sample's average depth
        # falls into the interval directly below or directly above it.
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {"family_id": False, "sex_het_ratio": False,
                                             "error_sex_check": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {}) or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline" or           # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)                # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon",
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order
    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def _create_config_file(out_dir, samples):
    """Provide configuration file for multiqc report.

    :param out_dir: directory where multiqc_config.yaml is written
    :param samples: list of bcbio sample dictionaries for this run
    :returns: path to the written MultiQC configuration file
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}
    # Strip mosdepth coverage suffixes when cleaning sample names in the report.
    extra_fn_clean_trim = []
    extra_fn_clean_trim.extend(
        ["coverage.mosdepth.region.dist", "coverage.mosdepth.global.dist"])
    out["extra_fn_clean_trim"] = extra_fn_clean_trim
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [
            tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples
        ]
        avg_depths = [x for x in avg_depths if x]  # drop samples without a computed average depth
        # Picking all thresholds up to the highest sample average depth
        thresholds = [
            t for t in coverage.DEPTH_THRESHOLDS if not avg_depths or t <= max(avg_depths)
        ]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Showing only thresholds surrounding any of average depths:
        # threshold t stays visible only when some sample's average depth
        # falls into the interval directly below or directly above it.
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]
        }

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {
        "family_id": False,
        "sex_het_ratio": False,
        "error_sex_check": False
    }

    # Setting the module order
    module_order = []
    module_order.extend(["bcbio", "samtools", "goleft_indexcov", "peddy"])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {}) or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline" or           # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)                # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([
            {
                'bcftools': {
                    'name': 'Bcftools (somatic)',
                    'info': 'Bcftools stats for somatic variant calls only.',
                    'path_filters': ['*_bcftools_stats.txt'],
                    'custom_config': {
                        'write_general_stats': True
                    },
                }
            },
            {
                'bcftools': {
                    'name': 'Bcftools (germline)',
                    'info': 'Bcftools stats for germline variant calls only.',
                    'path_filters': ['*_bcftools_stats_germline.txt'],
                    'custom_config': {
                        'write_general_stats': False
                    },
                }
            },
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon", "star", "picard", "qualimap", "snpeff", "bismark", "fastqc", "preseq"
    ])
    out["module_order"] = module_order
    preseq_samples = [
        s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)
    ]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run_peddy(samples, out_dir=None):
    """Run peddy correspondence checking (ancestry/sex/relatedness) for a batch.

    Finds a germline-usable VCF containing the sample with non-filtered
    variants, runs peddy against it and the generated PED file, and attaches
    the expected peddy outputs to each sample's summary QC. Returns the
    (possibly updated) samples; writes a "-failed.log" and skips when peddy is
    unavailable, turned off, the sample is not human, or no suitable VCF exists.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    # Find a germline VCF that includes this sample with non-filtered variants.
    # NOTE(review): the break only exits the inner key loop, so a later sample's
    # VCF can replace an earlier match -- confirm this is intended.
    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    config_skips = any("peddy" in dd.get_tools_off(d) for d in samples)
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    # A previous run already failed in an expected way; do not retry.
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                # Keep only the tail of the potentially huge stderr log.
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    # Known benign failure modes: no overlap with the 1kg check
                    # sites, or too few features for the PCA components.
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))

                def all_line_errors(l):
                    return l.find("no intervals found for") >= 0

                # Require a non-empty log for the all() branch: all() of an
                # empty sequence is True, which previously treated an empty
                # stderr log as an expected failure and masked real errors.
                if (any(allowed_errors(l) for l in to_show) or
                        (to_show and all(all_line_errors(l) for l in to_show))):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)