def test_antitarget(self):
    """The 'antitarget' command."""
    baits = tabio.read_auto('formats/nv2_baits.interval_list')
    access = tabio.read_auto('../data/access-5k-mappable.hg19.bed')
    self.assertLess(0, len(commands.do_antitarget(baits)))
    self.assertLess(0, len(commands.do_antitarget(baits, access)))
    self.assertLess(0, len(commands.do_antitarget(baits, access, 200000)))
    self.assertLess(
        0, len(commands.do_antitarget(baits, access, 10000, 5000)))
def test_read_auto(self):
    for fname, nrows in (("formats/empty", 0),
                         ("formats/amplicon.bed", 1433),
                         ("formats/amplicon.text", 1433),
                         ("formats/nv2_baits.interval_list", 6809),
                         ("formats/refflat-mini.txt", 100),
                         ("formats/example.gff", 6)):
        self.assertEqual(len(tabio.read_auto(fname)), nrows)
        with open(fname) as handle:
            self.assertEqual(len(tabio.read_auto(handle)), nrows)
def test_target(self):
    """The 'target' command."""
    annot_fname = "formats/refflat-mini.txt"
    for bait_fname in ("formats/nv2_baits.interval_list",
                       "formats/amplicon.bed",
                       "formats/baits-funky.bed"):
        baits = tabio.read_auto(bait_fname)
        bait_len = len(baits)
        # No splitting: w/o and w/ re-annotation
        r1 = commands.do_target(baits)
        self.assertEqual(len(r1), bait_len)
        r1a = commands.do_target(baits, do_short_names=True,
                                 annotate=annot_fname)
        self.assertEqual(len(r1a), len(r1))
        # Splitting, w/o and w/ re-annotation
        r2 = commands.do_target(baits, do_short_names=True, do_split=True,
                                avg_size=100)
        self.assertGreater(len(r2), len(r1))
        for _c, subarr in r2.by_chromosome():
            self.assertTrue(subarr.start.is_monotonic_increasing, bait_fname)
            self.assertTrue(subarr.end.is_monotonic_increasing, bait_fname)
            # Bins are non-overlapping; next start >= previous end
            self.assertTrue(
                ((subarr.start.values[1:] - subarr.end.values[:-1])
                 >= 0).all())
        r2a = commands.do_target(baits, do_short_names=True, do_split=True,
                                 avg_size=100, annotate=annot_fname)
        self.assertEqual(len(r2a), len(r2))
        # The original regions object should be unmodified
        self.assertEqual(len(baits), bait_len)
def bed2probes(bed_fname):
    """Create a neutral-coverage CopyNumArray from a file of regions."""
    regions = tabio.read_auto(bed_fname)
    table = regions.data.loc[:, ("chromosome", "start", "end")]
    table["gene"] = (regions.data["gene"] if "gene" in regions.data else '-')
    table["log2"] = 0.0
    table["spread"] = 0.0
    return CNA(table, {"sample_id": core.fbase(bed_fname)})
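# Usage sketch (hypothetical filenames), assuming tabio and CNA are imported
# as above: build a neutral-coverage array from a BED file, then write it in
# CNVkit's tabular format.
#
#   cna = bed2probes("regions.bed")
#   tabio.write(cna, "regions.cnn")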
def filter_targets(target_bed, sample_bams, procs):
    """Check if each potential target has significant coverage."""
    baits = tabio.read_auto(target_bed)
    # Sum each bin's coverage depth across all BAMs, then average per bin
    total_depths = np.zeros(len(baits), dtype=np.float64)
    for bam_fname in sample_bams:
        logging.info("Evaluating targets in %s", bam_fname)
        sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs)
        total_depths += sample['depth'].values
    baits['depth'] = total_depths / len(sample_bams)
    return baits
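# Usage sketch (hypothetical filenames and depth cutoff): average per-bin
# depth across two normal samples, then keep only adequately covered bins.
# Boolean indexing on the region array mirrors its use in do_target below.
#
#   baits = filter_targets("targets.bed", ["normal1.bam", "normal2.bam"],
#                          procs=4)
#   covered = baits[baits['depth'] >= 10]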
def test_total_range_size(self):
    """Test total region coverage calculation."""
    for fname, area in (('formats/empty', 0),
                        ('formats/my-targets.bed', 103),
                        ('formats/dac-my.bed', 148),
                        ('formats/example.gff', 7951),
                        ('formats/refflat-mini.txt', 719715)):
        regions = tabio.read_auto(fname)
        self.assertEqual(regions.total_range_size(), area)
def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
    """Calculate log2 coverages in the BAM file at each interval."""
    regions = tabio.read_auto(bed_fname)
    if procs == 1:
        bamfile = pysam.AlignmentFile(bam_fname, 'rb')
        for chrom, subregions in regions.by_chromosome():
            logging.info("Processing chromosome %s of %s",
                         chrom, os.path.basename(bam_fname))
            for count, row in _rdc_chunk(bamfile, subregions, min_mapq):
                yield [count, row]
    else:
        with futures.ProcessPoolExecutor(procs) as pool:
            args_iter = ((bam_fname, subr, min_mapq)
                         for _c, subr in regions.by_chromosome())
            for chunk in pool.map(_rdc, args_iter):
                for count, row in chunk:
                    yield [count, row]
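# Usage sketch (hypothetical filenames and mapping-quality cutoff): this is a
# generator yielding [count, row] pairs, one per interval, so results can be
# consumed lazily.
#
#   for count, row in interval_coverages_count("targets.bed", "sample.bam",
#                                              min_mapq=30, procs=4):
#       print(count, row)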
def do_target(bait_arr, annotate=None, do_short_names=False, do_split=False,
              avg_size=200 / .75):
    """Transform bait intervals into targets more suitable for CNVkit."""
    tgt_arr = bait_arr.copy()
    # Drop zero-width regions
    tgt_arr = tgt_arr[tgt_arr.start != tgt_arr.end]
    if do_split:
        logging.info("Splitting large targets")
        tgt_arr = tgt_arr.subdivide(avg_size, 0)
    if annotate:
        logging.info("Applying annotations as target names")
        annotation = tabio.read_auto(annotate)
        antitarget.compare_chrom_names(tgt_arr, annotation)
        tgt_arr['gene'] = annotation.into_ranges(tgt_arr, 'gene', '-')
    if do_short_names:
        logging.info("Shortening target interval labels")
        tgt_arr['gene'] = list(shorten_labels(tgt_arr['gene']))
    return tgt_arr
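# Usage sketch (hypothetical filenames), mirroring the calls exercised in the
# tests above: split large baits into bins near the default average size
# (200 / .75, roughly 267 bp) and label them from a refFlat annotation.
#
#   baits = tabio.read_auto("baits.interval_list")
#   tgts = do_target(baits, annotate="refFlat.txt", do_short_names=True,
#                    do_split=True)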
def batch_make_reference(normal_bams, target_bed, antitarget_bed,
                         male_reference, fasta, annotate, short_names,
                         target_avg_size, access_bed, antitarget_avg_size,
                         antitarget_min_size, output_reference, output_dir,
                         processes, by_count, method):
    """Build the CN reference from normal samples, targets and antitargets."""
    if method in ("wgs", "amplicon"):
        if antitarget_bed:
            raise ValueError("%r protocol: antitargets should not be "
                             "given/specified." % method)
        if access_bed and target_bed and access_bed != target_bed:
            raise ValueError("%r protocol: targets and access should not be "
                             "different." % method)

    bait_arr = None
    if method == "wgs":
        if not annotate:
            # TODO check if target_bed has gene names
            logging.warning("WGS protocol: recommend '--annotate' option "
                            "(e.g. refFlat.txt) to help locate genes "
                            "in output files.")
        access_arr = None
        if not target_bed:
            # TODO - drop weird contigs before writing, see antitargets.py
            if access_bed:
                target_bed = access_bed
            elif fasta:
                # Run 'access' on the fly
                access_arr = access.do_access(fasta)
                # Take filename base from FASTA, lacking any other clue
                target_bed = os.path.splitext(
                    os.path.basename(fasta))[0] + ".bed"
                tabio.write(access_arr, target_bed, "bed3")
            else:
                raise ValueError("WGS protocol: need to provide --targets, "
                                 "--access, or --fasta options.")

        # Tweak default parameters
        if not target_avg_size:
            if normal_bams:
                # Calculate bin size from .bai & access
                if fasta and not access_arr:
                    # Calculate wgs depth from all sequencing-accessible
                    # area (it doesn't take that long compared to WGS
                    # coverage); user-provided access might be something
                    # else that excludes a significant number of mapped
                    # reads.
                    access_arr = access.do_access(fasta)
                if access_arr:
                    autobin_args = ['wgs', None, access_arr]
                else:
                    # Don't assume the given targets/access covers the
                    # whole genome; use autobin sampling to estimate bin
                    # size, as we do for amplicon
                    bait_arr = tabio.read_auto(target_bed)
                    autobin_args = ['amplicon', bait_arr]
                # Choose median-size normal bam or tumor bam
                bam_fname = autobin.midsize_file(normal_bams)
                (wgs_depth, target_avg_size), _ = autobin.do_autobin(
                    bam_fname, *autobin_args, bp_per_bin=50000.)
                logging.info("WGS average depth %.2f --> using bin size %d",
                             wgs_depth, target_avg_size)
            else:
                # This bin size is OK down to 10x
                target_avg_size = 5000

    # To make temporary filenames for processed targets or antitargets
    tgt_name_base, _tgt_ext = os.path.splitext(os.path.basename(target_bed))
    if output_dir:
        tgt_name_base = os.path.join(output_dir, tgt_name_base)

    # Pre-process baits/targets
    new_target_fname = tgt_name_base + '.target.bed'
    if bait_arr is None:
        bait_arr = tabio.read_auto(target_bed)
    target_arr = target.do_target(bait_arr, annotate, short_names, True,
                                  **({'avg_size': target_avg_size}
                                     if target_avg_size
                                     else {}))
    tabio.write(target_arr, new_target_fname, 'bed4')
    target_bed = new_target_fname

    if not antitarget_bed:
        # Devise a temporary antitarget filename
        antitarget_bed = tgt_name_base + '.antitarget.bed'
        if method == "hybrid":
            # Build antitarget BED from the given targets
            anti_kwargs = {}
            if access_bed:
                anti_kwargs['access'] = tabio.read_auto(access_bed)
            if antitarget_avg_size:
                anti_kwargs['avg_bin_size'] = antitarget_avg_size
            if antitarget_min_size:
                anti_kwargs['min_bin_size'] = antitarget_min_size
            anti_arr = antitarget.do_antitarget(target_arr, **anti_kwargs)
        else:
            # No antitargets for wgs, amplicon
            anti_arr = GA([])
        tabio.write(anti_arr, antitarget_bed, "bed4")

    if len(normal_bams) == 0:
        logging.info("Building a flat reference...")
        ref_arr = reference.do_reference_flat(target_bed, antitarget_bed,
                                              fasta, male_reference)
    else:
        logging.info("Building a copy number reference from normal samples...")
        # Run coverage on all normals
        with parallel.pick_pool(processes) as pool:
            tgt_futures = []
            anti_futures = []
            procs_per_cnn = max(1, processes // (2 * len(normal_bams)))
            for nbam in normal_bams:
                sample_id = core.fbase(nbam)
                sample_pfx = os.path.join(output_dir, sample_id)
                tgt_futures.append(
                    pool.submit(batch_write_coverage, target_bed, nbam,
                                sample_pfx + '.targetcoverage.cnn',
                                by_count, procs_per_cnn))
                anti_futures.append(
                    pool.submit(batch_write_coverage, antitarget_bed, nbam,
                                sample_pfx + '.antitargetcoverage.cnn',
                                by_count, procs_per_cnn))

        target_fnames = [tf.result() for tf in tgt_futures]
        antitarget_fnames = [af.result() for af in anti_futures]
        # Build reference from *.cnn
        ref_arr = reference.do_reference(target_fnames, antitarget_fnames,
                                         fasta, male_reference, None,
                                         do_gc=True,
                                         do_edge=(method == "hybrid"),
                                         do_rmask=True)
    if not output_reference:
        output_reference = os.path.join(output_dir, "reference.cnn")
    core.ensure_path(output_reference)
    tabio.write(ref_arr, output_reference)
    return output_reference, target_bed, antitarget_bed
def main(args):
    annot = tabio.read_auto(args.annotate)
    cnarr = read_cna(args.cnv_file)
    cnarr['gene'] = annot.into_ranges(cnarr, 'gene', '-')
    tabio.write(cnarr, args.output or sys.stdout)
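# Minimal CLI wrapper sketch, assuming this script runs standalone. The
# argument names match the attributes main() reads (cnv_file, annotate,
# output); the exact option spellings here are an assumption.
if __name__ == '__main__':
    import argparse
    AP = argparse.ArgumentParser(description=__doc__)
    AP.add_argument('cnv_file', help="CNVkit .cnn or .cnr file to re-annotate")
    AP.add_argument('annotate', help="Gene annotation file, e.g. refFlat.txt")
    AP.add_argument('-o', '--output', help="Output filename (default: stdout)")
    main(AP.parse_args())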