def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    The average coverage is estimated as
    ``num_reads * mean_read_size / callable_size`` using the first input
    file in ``data["files"]``, and the returned cutoff is that average
    multiplied by the configured ``maxcov_downsample`` multiplier.

    Returns None if we're not doing downsampling, which is the case when:

    - the multiplier from ``dd.get_maxcov_downsample`` is unset or not positive,
    - no reads are reported for the first input file,
    - the reference or callable regions have zero size, or no read sizes
      could be sampled (the estimate would divide by zero),
    - the callable fraction of the genome is not above
      ``coverage.GENOME_COV_THRESH`` or the coverage interval is not one of
      ``"genome"``, ``None`` or ``False``,
    - the estimated average coverage is below the minimum of 10.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage

    params = {"min_coverage_for_downsampling": 10,
              "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)}
    fastq_file = data["files"][0]

    # Check the cheap configuration value first so the read count is only
    # computed when downsampling is actually enabled.
    multiplier = params["maxcov_downsample_multiplier"]
    if not (multiplier and multiplier > 0):
        return None
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if not num_reads:
        return None

    vrs = dd.get_variant_regions_merged(data)
    total_size = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    # Without variant regions the whole reference is treated as callable.
    callable_size = pybedtools.BedTool(vrs).total_coverage() if vrs else total_size
    # A zero-sized reference or callable region makes the coverage estimate
    # undefined; skip downsampling instead of raising ZeroDivisionError.
    if not total_size or not callable_size:
        return None
    genome_cov_pct = callable_size / float(total_size) if vrs else 1.0

    if not (genome_cov_pct > coverage.GENOME_COV_THRESH
            and dd.get_coverage_interval(data) in ["genome", None, False]):
        return None

    # Mean read size weighted by how often each size was observed.
    # NOTE(review): 5000 is presumably the number of reads sampled -- confirm
    # against bwa.fastq_size_output.
    total_counts, total_sizes = 0, 0
    for count, size in bwa.fastq_size_output(fastq_file, 5000):
        total_counts += int(count)
        total_sizes += int(size) * int(count)
    if not total_counts:
        # No read sizes sampled, so the average read size cannot be estimated.
        return None
    read_size = float(total_sizes) / float(total_counts)

    avg_cov = float(num_reads * read_size) / callable_size
    if avg_cov >= params["min_coverage_for_downsampling"]:
        return int(avg_cov * multiplier)
    return None
def get_downsample_params(data):
    """Build downsampling parameters for a sample, if downsampling is enabled.

    Reads the multiplier via ``dd.get_maxcov_downsample(data)``. When it is
    set and greater than zero, returns a dictionary with a fixed
    ``min_coverage_for_downsampling`` of 10 and the multiplier under
    ``maxcov_downsample_multiplier``. Otherwise returns None.
    """
    multiplier = dd.get_maxcov_downsample(data)
    enabled = bool(multiplier) and multiplier > 0
    if not enabled:
        return None
    return dict(min_coverage_for_downsampling=10,
                maxcov_downsample_multiplier=multiplier)
def get_downsample_params(data):
    """Return the downsampling settings for ``data``, or None when disabled.

    The multiplier comes from ``dd.get_maxcov_downsample(data)``; a missing,
    falsy or non-positive value means no downsampling and yields None.
    Otherwise the result maps ``min_coverage_for_downsampling`` to 10 and
    ``maxcov_downsample_multiplier`` to the multiplier.
    """
    maxcov_mult = dd.get_maxcov_downsample(data)
    if not maxcov_mult:
        return None
    if maxcov_mult > 0:
        settings = {}
        settings["min_coverage_for_downsampling"] = 10
        settings["maxcov_downsample_multiplier"] = maxcov_mult
        return settings
    return None