def read_paired_genes(cbs1, cbs2, interval):
    """Yield paired copy-number values for each usable segment of `cbs1`.

    Both segment files are read and sorted. Every segment of the first file
    that has at least MIN_ACGH_PROBES probes and is not on a skipped
    chromosome is matched against the trimmed overlapping portion of the
    second file, which is summarized by `segment_cn`.

    `interval` is accepted but not used by this implementation.

    Yields tuples of (chromosome, value1, value2, start, end, label), where
    label is the "chrom:start-end" string of the first file's segment.

    Raises ValueError if the two files disagree on their (non-skipped)
    chromosomes. Being a generator, this is raised on first iteration.
    """
    first_segs = cnvlib.read(cbs1)
    second_segs = cnvlib.read(cbs2)

    # Chromosomes present in exactly one of the two inputs
    unshared = sorted(
        chrom
        for chrom in set(first_segs.chromosome) ^ set(second_segs.chromosome)
        if not is_skipped_chromosome(chrom))
    if unshared:
        raise ValueError("Mismatched chromosomes: " + ' '.join(unshared))

    first_segs.sort()
    second_segs.sort()
    for chrom, start, end, _name, value1, n_probes in first_segs:
        if is_skipped_chromosome(chrom) if n_probes >= MIN_ACGH_PROBES else True:
            continue
        label = "{}:{}-{}".format(chrom, start, end)
        overlap = second_segs.in_range(chrom, start, end, trim=True)
        if len(overlap) != 0:
            yield (chrom, value1, segment_cn(overlap), start, end, label)
        else:
            print("Skipping", label, "-- covers no CNVkit segments")
def main(args):
    """Plot two genome-wide scatters, stacked vertically, sharing both axes.

    The top panel shows the bins/segments read from `args.cnr_fname` and
    `args.cns_fname`; the bottom panel shows those read from
    `args.cghr_fname` and `args.cghs_fname`. The figure is saved as PDF to
    `args.output` if given, otherwise displayed interactively.
    """
    # Inputs for the top panel, then the bottom panel
    top_bins = cnvlib.read(args.cnr_fname)
    top_segs = cnvlib.read(args.cns_fname)
    bottom_bins = cnvlib.read(args.cghr_fname)
    bottom_segs = cnvlib.read(args.cghs_fname)

    # Two stacked axes; x tick labels are shown on the top panel only
    _fig = pyplot.figure(figsize=(10, 3.5))
    grid = pyplot.GridSpec(2, 1, hspace=.37)
    top_ax = pyplot.subplot(grid[0])
    bottom_ax = pyplot.subplot(grid[1], sharex=top_ax, sharey=top_ax)
    bottom_ax.tick_params(labelbottom=False)
    top_ax.tick_params(labelbottom=True)

    # Shared y-range: padded extent of both sets of autosomal segment values
    seg_values = numpy.concatenate((top_segs.autosomes().log2,
                                    bottom_segs.autosomes().log2))
    y_low = limit(min(seg_values) - .2, -5.0, -.5)
    y_high = limit(max(seg_values) + .2, .5, 5.0)
    top_ax.set_ylim(y_low, y_high)

    # Draw the bottom panel first, then the top one
    plots.cnv_on_genome(bottom_ax, bottom_bins, bottom_segs, PAD)
    plots.cnv_on_genome(top_ax, top_bins, top_segs, PAD)

    if not args.output:
        pyplot.show()
        return
    pyplot.savefig(args.output, format='pdf', bbox_inches=0)
    print("Wrote", args.output, file=sys.stderr)
def test_by_segment(self):
    """Check that by_segment() yields exactly one item per segment."""
    cnarr = cnvlib.read("formats/amplicon.cnr")
    segments = cnvlib.read("formats/amplicon.cns")
    # Count the yielded (segment, bins) pairs directly. The previous
    # "last enumerate index + 1" idiom reported 1 when nothing was yielded.
    n_yielded = sum(1 for _seg, _bins in cnarr.by_segment(segments))
    self.assertEqual(len(segments), n_yielded)
def read_paired_genes(cbs1, cbs2, interval):
    """Get the segment CN values for each targeted region.

    Both segment files are restricted to autosomes and sorted. For each gene
    produced by `interval2genes(interval)`, the overlapping (trimmed) segments
    in each file are summarized with `segment_cn`.

    Yields tuples of (chromosome, value1, value2, start, end, gene_name).
    Genes on skipped chromosomes, or not covered by a segment in either
    file, are omitted.

    Raises ValueError if the two files do not cover the same autosomes.
    Being a generator, this is raised on first iteration.
    """
    segments1 = cnvlib.read(cbs1).autosomes()
    segments2 = cnvlib.read(cbs2).autosomes()
    non_overlapping = set(segments1.chromosome).symmetric_difference(
        set(segments2.chromosome))
    if non_overlapping:
        raise ValueError("Mismatched chromosomes: "
                         + ' '.join(sorted(non_overlapping)))

    segments1.sort()
    segments2.sort()
    genes = list(interval2genes(interval))
    print("#Genes tiled:", len(genes), file=sys.stderr)
    has_chr = segments1.chromosome[0].startswith('chr')
    for chrom, start, end, name in genes:
        if is_skipped_chromosome(chrom):
            continue
        if not has_chr and chrom.startswith('chr'):
            # Segment files use bare chromosome names: drop the 'chr' prefix
            # from the gene's chromosome, but only if it is actually there.
            # Slicing unconditionally would mangle names without the prefix.
            chrom = chrom[3:]
        sel1 = segments1.in_range(chrom, start, end, mode='trim')
        sel2 = segments2.in_range(chrom, start, end, mode='trim')
        if len(sel1) == 0 or len(sel2) == 0:
            print("Skipping", name, "-- not covered by a segment")
            continue
        val1 = segment_cn(sel1)
        val2 = segment_cn(sel2)
        yield (chrom, val1, val2, start, end, name)
def test_ranges(self):
    """Test range methods: by_ranges, in_range, in_ranges."""
    cnarr = cnvlib.read("formats/amplicon.cnr")
    segarr = cnvlib.read("formats/amplicon.cns")
    segs_by_chrom = dict(segarr.by_chromosome())
    modes = ("outer", "trim")
    for chrom, chrom_bins in cnarr.by_chromosome():
        chrom_segs = segs_by_chrom[chrom]
        last_idx = 0
        total_bins = 0
        for last_idx, (seg, bins) in enumerate(
                chrom_bins.by_ranges(chrom_segs)):
            n_bins = len(bins)
            total_bins += n_bins
            # The segment's probe count matches the bins grouped under it...
            self.assertEqual(seg['probes'], n_bins)
            # ...and a direct range query returns the same number of bins
            for mode in modes:
                hits = cnarr.in_range(seg['chromosome'], seg['start'],
                                      seg['end'], mode=mode)
                self.assertEqual(n_bins, len(hits))
        # One group per segment; the groups account for every bin
        self.assertEqual(len(chrom_segs), last_idx + 1)
        self.assertEqual(len(chrom_bins), total_bins)
        # Multi-range queries over all of the chromosome's segments
        for mode in modes:
            whole = cnarr.in_ranges(chrom, chrom_segs['start'],
                                    chrom_segs['end'], mode=mode)
            self.assertEqual(len(chrom_bins), len(whole))
            local = chrom_bins.in_ranges(starts=chrom_segs['start'],
                                         ends=chrom_segs['end'], mode=mode)
            self.assertEqual(len(chrom_bins), len(local))
def test_batch(self):
    """The 'batch' command."""
    targets_in = "formats/my-targets.bed"
    genome_fa = "formats/chrM-Y-trunc.hg19.fa"
    sample_bam = "formats/na12878-chrM-Y-trunc.bam"
    refflat = "formats/my-refflat.bed"

    # Single-sample reference built with the "wgs" method
    ref_path, tgt_path, _ = batch.batch_make_reference(
        [sample_bam], None, None, True, genome_fa, refflat, True, 500,
        None, None, None, None, 'build', 1, False, "wgs")
    self.assertEqual(ref_path, 'build/reference.cnn')
    ref_rows = cnvlib.read(ref_path, 'bed')
    tgt_rows = tabio.read(tgt_path, 'bed')
    self.assertEqual(len(ref_rows), len(tgt_rows))

    # Single-sample reference built with the "hybrid" method
    ref_path, tgt_path, anti_path = batch.batch_make_reference(
        [sample_bam], targets_in, None, True, genome_fa, None, True, 10,
        None, 1000, 100, None, 'build', 1, False, "hybrid")
    self.assertEqual(ref_path, 'build/reference.cnn')
    ref_rows = cnvlib.read(ref_path, 'bed')
    tgt_rows = tabio.read(tgt_path, 'bed')
    anti_rows = tabio.read(anti_path, 'bed')
    self.assertEqual(len(ref_rows), len(tgt_rows) + len(anti_rows))

    # Process the same sample against the reference just built
    batch.batch_run_sample(
        sample_bam, tgt_path, anti_path, ref_path, 'build', True, True,
        True, None, False, False, "hybrid", 1)
    sample_segs = cnvlib.read("build/na12878-chrM-Y-trunc.cns")
    self.assertGreater(len(sample_segs), 0)
def test_export(self):
    """Run the 'export' command with each format."""
    # SEG: adding a second input file must add rows
    single = export.export_seg(["formats/tr95t.cns"])
    self.assertGreater(len(single), 0)
    double = export.export_seg(["formats/tr95t.cns", "formats/cl_seq.cns"])
    self.assertGreater(len(double), len(single))

    # THetA2: without, then with, a reference
    theta_in = cnvlib.read("formats/tr95t.cns")
    self.assertGreater(len(export.export_theta(theta_in, None)), 0)
    ref = cnvlib.read("formats/reference-tr.cnn")
    self.assertGreater(len(export.export_theta(theta_in, ref)), 0)

    # Formats that calculate absolute copy number
    samples = (("tr95t.cns", 2, True),
               ("cl_seq.cns", 6, True),
               ("amplicon.cns", 2, False))
    # BED: "ploidy" and "variant" drop rows; "all" keeps every segment
    bed_checks = (("ploidy", self.assertLess),
                  ("variant", self.assertLess),
                  ("all", self.assertEqual))
    for fname, ploidy, is_f in samples:
        cns = cnvlib.read("formats/" + fname)
        for show, check in bed_checks:
            bed_rows = export.export_bed(cns, ploidy, True, is_f,
                                         cns.sample_id, show)
            check(len(bed_rows), len(cns))
        # VCF
        _vheader, vcf_body = export.export_vcf(cns, ploidy, True, is_f)
        self.assertTrue(0 < len(vcf_body.splitlines()) < len(cns))
def test_gainloss(self):
    """The 'gainloss' command.

    Runs once on bins alone and once with segments plus explicit positional
    settings; each run must report at least one row.
    """
    probes = cnvlib.read("formats/amplicon.cnr")
    rows = commands.do_gainloss(probes, male_reference=True)
    # assertGreater reports both operands on failure, unlike assertTrue
    self.assertGreater(len(rows), 0)
    segs = cnvlib.read("formats/amplicon.cns")
    rows = commands.do_gainloss(probes, segs, True, 0.3, 4)
    self.assertGreater(len(rows), 0)
def test_genemetrics(self):
    """The 'genemetrics' command: bins alone, then bins with segments."""
    bins = cnvlib.read("formats/amplicon.cnr")
    bin_level = commands.do_genemetrics(bins, male_reference=True)
    self.assertGreater(len(bin_level), 0)

    seg_level = commands.do_genemetrics(
        bins, cnvlib.read("formats/amplicon.cns"), 0.3, 4,
        male_reference=True)
    self.assertGreater(len(seg_level), 0)
def test_metrics(self):
    """The 'metrics' command.

    There can be no more residuals than bins, and every estimate of scale
    computed from the residuals must be positive.
    """
    cnarr = cnvlib.read("formats/amplicon.cnr")
    segments = cnvlib.read("formats/amplicon.cns")
    resids = metrics.probe_deviations_from_segments(cnarr, segments)
    # Dedicated comparison asserts show the offending values on failure
    self.assertLessEqual(len(resids), len(cnarr))
    values = metrics.ests_of_scale(resids)
    for val in values:
        self.assertGreater(val, 0)
def test_residuals(self):
    """Check summary statistics of residuals for each grouping argument.

    Residuals are computed with no grouping, with the segments, and with the
    segments reduced to a plain GenomicArray without extra columns.
    """
    cnarr = cnvlib.read("formats/amplicon.cnr")
    segments = cnvlib.read("formats/amplicon.cns")
    regions = GenomicArray(segments.data).drop_extra_columns()
    # (expected value, statistic, tolerance), evaluated in this order
    expectations = (
        (0, lambda r: r.mean(), .3),
        (1, lambda r: np.percentile(r, 80), .2),
        (2, lambda r: r.std(), .5),
    )
    for grouping in (None, segments, regions):
        resid = cnarr.residuals(grouping)
        for expected, statistic, tolerance in expectations:
            self.assertAlmostEqual(expected, statistic(resid),
                                   delta=tolerance)
def test_metrics(self):
    """The 'metrics' command.

    Expects a 1-row, 6-column table in which every column after the first
    holds a positive value.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    table = metrics.do_metrics(bins, segs, skip_low=True)
    self.assertEqual(table.shape, (1, 6))
    stat_columns = table.columns[1:]
    for stat in table.loc[0, stat_columns]:
        self.assertGreater(stat, 0)
def test_metrics(self):
    """The 'metrics' command.

    There can be no more residuals than bins, and each estimate of scale
    derived from them must be positive.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    deviations = bins.residuals(segs)
    self.assertLessEqual(len(deviations), len(bins))
    for estimate in metrics.ests_of_scale(deviations):
        self.assertGreater(estimate, 0)
def test_bintest(self):
    """The 'bintest' command."""
    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    n_bins = len(bins)

    # Bins only: some, but not all, bins are reported
    hits = commands.do_bintest(bins, alpha=.05)
    self.assertGreater(len(hits), 0)
    self.assertLess(len(hits), n_bins)

    # Against segments: at least as many rows as segments, fewer than bins
    hits = commands.do_bintest(bins, segs, target_only=True)
    self.assertGreaterEqual(len(hits), len(segs))
    self.assertLess(len(hits), n_bins)
def _cmd_ztest(args):
    """Run `do_ztest` from parsed command-line arguments.

    When a segment file is given it is loaded and the sample sex is left
    undetermined (None); otherwise the sex is verified from the bins. A
    non-empty result is written to `args.output`, or to standard output
    when no output is specified.
    """
    cnarr = cnvlib.read(args.cnarr)
    segments = None
    is_sample_female = None
    if args.segment:
        segments = cnvlib.read(args.segment)
    else:
        is_sample_female = verify_sample_sex(cnarr, args.sample_sex,
                                             args.male_reference)
    sig = do_ztest(cnarr, segments, args.male_reference, is_sample_female,
                   args.alpha, args.target)
    if len(sig) > 0:
        destination = args.output or sys.stdout
        tabio.write(sig, destination)
def test_segmetrics(self):
    """The 'segmetrics' command.

    For segments with more than 3 probes, each interval must strictly
    bracket its matching location statistic.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    stats = segmetrics.do_segmetrics(bins, segs,
                                     location_stats=['mean', 'median'],
                                     spread_stats=['stdev'],
                                     interval_stats=['pi', 'ci'])
    # Restrict to segments with enough supporting probes for sane stats
    stats = stats[stats['probes'] > 3]
    brackets = (('pi_lo', 'pi_hi', 'median'),
                ('ci_lo', 'ci_hi', 'mean'))
    for lower, upper, center in brackets:
        self.assertTrue((stats[lower] < stats[center]).all())
        self.assertTrue((stats[upper] > stats[center]).all())
def test_segmetrics(self):
    """The 'segmetrics' command.

    Each interval function must return one lower and one upper bound per
    segment; for segments with more than 3 probes the bounds must strictly
    bracket the segment's log2 value.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    n_segs = len(segs)
    interval_funcs = (commands._confidence_interval,
                      commands._prediction_interval)
    for interval_func in interval_funcs:
        lower, upper = commands._segmetric_interval(segs, bins,
                                                    interval_func)
        self.assertEqual(len(lower), n_segs)
        self.assertEqual(len(upper), n_segs)
        enough_probes = (np.asarray(segs['probes']) > 3)
        centers = segs[enough_probes, 'log2']
        self.assertTrue((lower[enough_probes] < centers).all())
        self.assertTrue((centers < upper[enough_probes]).all())
def test_ranges_into(self):
    """Test into_ranges(): one output value per target range."""
    bins = read("formats/amplicon.cnr")
    segs = read("formats/amplicon.cns")
    genes_per_seg = bins.into_ranges(segs, 'gene', '-')
    self.assertEqual(len(genes_per_seg), len(segs))

    # With a VCF
    variants = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
    baf_per_seg = variants.into_ranges(segs, 'alt_freq', np.nan,
                                       np.nanmedian)
    self.assertEqual(len(baf_per_seg), len(segs))
    baf_per_bin = variants.into_ranges(bins, 'alt_freq', 0.0, np.max)
    self.assertEqual(len(baf_per_bin), len(bins))

    # Edge cases: an empty array on either side must not raise
    empty = tabio.read("formats/empty")
    segs.into_ranges(empty, 'start', 0, int)
    empty.into_ranges(segs, 'end', 0, 0)
def test_drop_extra_columns(self):
    """Test removal of optional 'gc' column.

    The 'log2' values must be unchanged by the removal.
    """
    original = cnvlib.read('formats/reference-tr.cnn')
    self.assertIn('gc', original)
    stripped = original.drop_extra_columns()
    self.assertNotIn('gc', stripped)
    log2_unchanged = (stripped['log2'] == original['log2'])
    self.assertTrue(log2_unchanged.all())
def test_export_theta(self):
    """The 'export theta' command."""
    segs = cnvlib.read("formats/tr95t.cns")
    n_autosomal = len(segs.autosomes())

    # One output row per autosomal segment, without and with a reference
    self.assertEqual(len(export.export_theta(segs, None)), n_autosomal)
    ref = cnvlib.read("formats/reference-tr.cnn")
    self.assertEqual(len(export.export_theta(segs, ref)), n_autosomal)

    # SNP tables: each is a non-empty strict subset of the loaded variants
    variants = commands.load_het_snps("formats/na12878_na12882_mix.vcf",
                                      "NA12882", "NA12878", 15, None)
    tumor_snps, normal_snps = export.export_theta_snps(variants)
    n_variants = len(variants)
    self.assertLess(len(tumor_snps), n_variants)
    self.assertGreater(len(tumor_snps), 0)
    self.assertLess(len(normal_snps), n_variants)
    self.assertGreater(len(normal_snps), 0)
def test_basic(self):
    """Test basic container functionality and magic methods."""
    path = 'formats/reference-tr.cnn'
    arr = cnvlib.read(path)

    # __len__: one row per line of the file, minus one line
    self.assertEqual(len(arr), linecount(path) - 1)

    # __eq__: a second load of the same file compares equal
    twin = cnvlib.read(path)
    self.assertEqual(arr, twin)

    # __getitem__/__setitem__: writing rows back onto themselves is a no-op
    first_row = arr[0]
    arr[0] = first_row
    arr[3:4] = arr[3:4]
    arr[6:10] = arr[6:10]
    self.assertEqual(tuple(arr[0]), tuple(twin[0]))
    self.assertEqual(arr[3:6], twin[3:6])
def test_by_chromosome(self):
    """Check that by_chromosome() groups account for every row."""
    for path in ("formats/amplicon.cnr", "formats/cl_seq.cns"):
        arr = cnvlib.read(path)
        grouped_rows = sum(len(rows) for _chrom, rows in arr.by_chromosome())
        self.assertEqual(grouped_rows, len(arr))
def main(args):
    """Run the script.

    Splits the reference read from `args.reference` into target and
    antitarget regions and writes each as a BED file. Output names start
    with `args.output` if given, else the reference's sample ID.
    """
    ref = cnvlib.read(args.reference)
    targets, antitargets = reference.reference2regions(ref)
    base_name = args.output or ref.sample_id
    outputs = ((targets, '.target.bed'),
               (antitargets, '.antitarget.bed'))
    for regions, suffix in outputs:
        write_bed(regions, base_name + suffix)
def test_segment_parallel(self):
    """The 'segment' command, in parallel.

    Results from 2 processes and from 1 process must have the same data
    shape and the same number of metadata entries.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    parallel, serial = (
        segmentation.do_segmentation(bins, "haar", processes=n_procs)
        for n_procs in (2, 1))
    self.assertEqual(parallel.data.shape, serial.data.shape)
    self.assertEqual(len(parallel.meta), len(serial.meta))
def test_segmetrics(self):
    """The 'segmetrics' command.

    Each interval function must return one lower and one upper bound per
    segment; for segments with more than 3 probes the bounds must strictly
    bracket the segment's log2 value.
    """
    def bootstrap_ci(values):
        return segmetrics.confidence_interval_bootstrap(values, 0.05, 100)

    def pred_interval(values):
        return segmetrics.prediction_interval(values, 0.05)

    bins = cnvlib.read("formats/amplicon.cnr")
    segs = cnvlib.read("formats/amplicon.cns")
    n_segs = len(segs)
    for interval_func in (bootstrap_ci, pred_interval):
        lower, upper = segmetrics.segmetric_interval(segs, bins,
                                                     interval_func)
        self.assertEqual(len(lower), n_segs)
        self.assertEqual(len(upper), n_segs)
        enough_probes = (segs['probes'] > 3).values
        centers = segs[enough_probes, 'log2']
        self.assertTrue((lower[enough_probes] < centers).all())
        self.assertTrue((centers < upper[enough_probes]).all())
def main(args):
    """Plot bias trends for the given sample files.

    The reference is read from `args.reference` or, failing that, from
    `args.no_reference`; ratio mode is enabled only in the former case. The
    bias function is derived from the first sample file. Samples are drawn
    overlaid when `args.batch` is set, otherwise separately. The figure is
    saved as PDF to `args.output` if given, otherwise displayed.
    """
    do_ratio = bool(args.reference)
    ref_pset = read(args.reference or args.no_reference)
    first_sample = read(args.filenames[0])
    bias_func = get_bias_func(args.mode, ref_pset, first_sample)

    # Column header for the per-sample statistics
    print("Sample \tRaw probes \tTrend line \tReduction (%)")
    if not args.batch:
        plot_separate(args.filenames, ref_pset, bias_func, args.mode,
                      do_ratio)
    else:
        plot_overlaid(args.filenames, ref_pset, bias_func, args.mode,
                      do_ratio, args.color)

    if not args.output:
        pyplot.show()
        return
    pyplot.savefig(args.output, format='pdf', bbox_inches=0)
    echo("Wrote", args.output)
def test_gainloss(self):
    """The 'gainloss' command.

    Runs once on bins alone and once with freshly computed "haar" segments;
    each run must report at least one row.
    """
    probes = cnvlib.read("formats/amplicon.cnr")
    rows = commands.do_gainloss(probes, male_reference=True)
    # assertGreater reports both operands on failure, unlike assertTrue
    self.assertGreater(len(rows), 0)
    segs = segmentation.do_segmentation("formats/amplicon.cnr", False, "haar")
    rows = commands.do_gainloss(probes, segs, True, 0.3, 4)
    self.assertGreater(len(rows), 0)
def test_segment(self):
    """The 'segment' command.

    Runs the "haar" method with default options and again with an explicit
    threshold and skip_low; each run must yield at least one segment.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    # R methods are in another script
    option_sets = ({}, {'threshold': .001, 'skip_low': True})
    for options in option_sets:
        segs = segmentation.do_segmentation(bins, "haar", **options)
        self.assertGreater(len(segs), 0)
def test_export_nexus(self):
    """The 'export nexus-basic' and 'nexus-ogt' commands.

    Both exports must produce exactly one row per input bin.
    """
    bins = cnvlib.read("formats/amplicon.cnr")
    n_bins = len(bins)

    basic = export.export_nexus_basic(bins)
    self.assertEqual(len(basic), n_bins)

    snps = commands.load_het_snps("formats/na12878_na12882_mix.vcf",
                                  None, None, 15, None)
    ogt = export.export_nexus_ogt(bins, snps, 0.05)
    self.assertEqual(len(ogt), n_bins)
def test_export(self):
    """Run the 'export' command with the SEG, THetA2 and VCF formats."""
    # SEG: adding a second input file must add rows
    seg_rows = export.export_seg(["formats/tr95t.cns"])
    self.assertGreater(len(seg_rows), 0)
    seg2_rows = export.export_seg(["formats/tr95t.cns",
                                   "formats/cl_seq.cns"])
    self.assertGreater(len(seg2_rows), len(seg_rows))

    # THetA2
    _header, theta_rows = export.export_theta("formats/tr95t.cns",
                                              "formats/reference-tr.cnn")
    self.assertGreater(len(theta_rows), 0)

    # VCF: a non-empty body with fewer lines than there are segments.
    # Separate asserts show which bound was violated, and by how much.
    tr_cns = cnvlib.read("formats/tr95t.cns")
    _header, tr_vcf_body = export.export_vcf(tr_cns, 2, True, True)
    n_tr_lines = len(tr_vcf_body.splitlines())
    self.assertGreater(n_tr_lines, 0)
    self.assertLess(n_tr_lines, len(tr_cns))

    cl_cns = cnvlib.read("formats/cl_seq.cns")
    _header, cl_vcf_body = export.export_vcf(cl_cns, 6, True, True)
    n_cl_lines = len(cl_vcf_body.splitlines())
    self.assertGreater(n_cl_lines, 0)
    self.assertLess(n_cl_lines, len(cl_cns))
def test_center_all(self):
    """Test recentering."""
    ref = cnvlib.read('formats/reference-tr.cnn')
    chr1 = ref.in_range('chr1')

    # chr1 starts out roughly median-centered; centering makes it exact
    self.assertAlmostEqual(0, np.median(chr1['log2']), places=1)
    chr1.center_all()
    centered_median = np.median(chr1['log2'])
    self.assertAlmostEqual(0, centered_median)

    # Median-centering resets a shift away from the median
    shifted = chr1.copy()
    shifted['log2'] += 2.0
    shifted.center_all()
    self.assertAlmostEqual(np.median(shifted['log2']), centered_median)

    # Other methods for centering are similar for a CN-neutral chromosome
    for method in ("mean", "mode", "biweight"):
        recentered = chr1.copy()
        recentered.center_all(method)
        drift = abs(recentered['log2'].median() - centered_median)
        self.assertLess(drift, 0.1)
def test_call_sex(self):
    """Test each 'call' method on allosomes."""
    # Columns: file, sample is female, reference is male,
    #          expected mean log2 on chr1/chrX/chrY,
    #          expected mean copy number on chr1/chrX/chrY
    scenarios = (
        ("formats/f-on-f.cns", True, False, 0, 0, None, 2, 2, None),
        ("formats/f-on-m.cns", True, True, 0.585, 1, None, 3, 2, None),
        ("formats/m-on-f.cns", False, False, 0, -1, 0, 2, 1, 1),
        ("formats/m-on-m.cns", False, True, 0, 0, 0, 2, 1, 1),
    )
    # (method, extra keyword arguments): threshold, clonal pure,
    # clonal barely-mixed
    call_variants = (
        ("threshold", {}),
        ("clonal", {}),
        ("clonal", {"purity": 0.99}),
    )
    for (fname, sample_is_f, ref_is_m,
         chr1_expect, chrx_expect, chry_expect,
         chr1_cn, chrx_cn, chry_cn) in scenarios:
        cns = cnvlib.read(fname)
        per_chrom = [
            ((cns.chromosome == 'chr1'), chr1_cn, chr1_expect),
            ((cns.chromosome == 'chrX'), chrx_cn, chrx_expect),
        ]
        chry_rows = (cns.chromosome == 'chrY')
        # chrY is only checked for male samples
        if not sample_is_f:
            per_chrom.append((chry_rows, chry_cn, chry_expect))

        for method, extra in call_variants:
            called = commands.do_call(cns, None, method,
                                      is_reference_male=ref_is_m,
                                      is_sample_female=sample_is_f,
                                      **extra)
            for rows, want_cn, want_log2 in per_chrom:
                self.assertEqual(want_cn, called['cn'][rows].mean())
                self.assertAlmostEqual(want_log2,
                                       called['log2'][rows].mean(), 0)
def test_call_filter(self):
    """Test the 'call' command with single and combined filters."""
    segs = cnvlib.read("formats/tr95t.segmetrics.cns")
    snvs = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
    expected_columns = ('baf', 'cn', 'cn1', 'cn2')
    # Each filter individually, then several filters together
    filter_sets = (['ampdel'], ['cn'], ['ci'], ['sem'],
                   ['sem', 'cn', 'ampdel'], ['ci', 'cn'])
    for filters in filter_sets:
        called = commands.do_call(segs, snvs, method="threshold",
                                  purity=.9, is_reference_male=True,
                                  is_sample_female=True, filters=filters)
        # Filtering never adds segments
        self.assertLessEqual(len(called), len(segs))
        if 'ampdel' not in filters:
            # At least 1 segment per chromosome remains
            n_chroms = len(segs.chromosome.unique())
            self.assertLessEqual(n_chroms, len(called))
        for column in expected_columns:
            self.assertIn(column, called)
def test_reference(self):
    """The 'reference' command."""
    # Empty/unspecified antitargets
    n_bins = linecount("formats/amplicon.cnr") - 1
    for call_args in ((["formats/amplicon.cnr"], ["formats/empty"]),
                      (["formats/amplicon.cnr"],)):
        built = commands.do_reference(*call_args)
        self.assertEqual(len(built), n_bins)

    # Empty/unspecified antitargets, flat reference
    n_regions = linecount("formats/amplicon.bed")
    for call_args in (("formats/amplicon.bed", "formats/empty"),
                      ("formats/amplicon.bed",)):
        flat = commands.do_reference_flat(*call_args)
        self.assertEqual(len(flat), n_regions)

    # Misc: splitting a reference back into target/antitarget regions
    loaded = cnvlib.read('formats/reference-tr.cnn')
    targets, antitargets = reference.reference2regions(loaded)
    n_anti = len(antitargets)
    self.assertLess(0, n_anti)
    self.assertEqual(n_anti, (loaded['gene'] == 'Background').sum())
    self.assertEqual(len(targets), len(loaded) - n_anti)
def get_sort_and_smoother(cna_fname, ref_arr, mode):
    """Make a sort_and_smooth func from example CNA and reference.

    Parameters
    ----------
    cna_fname :
        Path of an example sample file; the reference is matched to its
        probes.
    ref_arr :
        Reference array providing the bias values.
    mode : str
        'gc' or 'rmask' to use that column of the matched reference, or
        'edge' to compute a per-row value with an edge sorter.

    Returns
    -------
    A function taking a sample array and returning a 3-tuple
    (biases sorted ascending, coverages in the same order as a NumPy array,
    rolling-median-smoothed coverages). It also prints a line of variance
    statistics for the sample.

    Raises
    ------
    ValueError
        If `mode` is not one of 'gc', 'rmask' or 'edge'.
    """
    ref_matched = fix.match_ref_to_probes(ref_arr, read(cna_fname))
    if mode in ('gc', 'rmask'):
        biases = ref_matched[mode]
    elif mode == 'edge':
        # Materialize the values: on Python 3 a bare map() is a one-shot
        # iterator with no len(), but it is measured and reused per sample.
        biases = list(map(
            fix.make_edge_sorter(ref_matched, params.INSERT_SIZE), ref_arr))
    else:
        raise ValueError("Unknown mode: %s" % mode)

    def wrapped_sort_and_smooth(this_arr):
        """Sort coverages by bias, smooth them, and print variance stats."""
        assert len(this_arr) == len(biases)
        # Use a distinct name for the sorted result. Assigning to `biases`
        # here would make it local to this function, so the assert above
        # would raise UnboundLocalError instead of reading the closure.
        sorted_biases, coverages = zip(
            *sorted(zip(biases, this_arr['coverage']),
                    key=lambda bc: bc[0]))
        # Smooth the bias-ordered coverages
        cvg_fitted = rolling_median(coverages, .2)
        # Print some stats
        coverages = np.asarray(coverages)
        orig_var = np.var(coverages)
        # Percentage of the raw variance accounted for by the trend line
        reduction = 100 * (1 - (np.var(coverages - cvg_fitted) / orig_var))
        print(this_arr.sample_id,
              "\t %.5f \t %.5f \t %.4f"
              % (orig_var, np.var(cvg_fitted), reduction))
        return sorted_biases, coverages, cvg_fitted

    return wrapped_sort_and_smooth
def test_segment_hmm(self):
    """The 'segment' command with HMM methods."""
    # NB: R methods are in another script; haar is pure-Python
    hmm_runs = (
        ("hmm", {}),
        ("hmm-tumor", {"skip_low": True}),
        ("hmm-germline", {}),
    )
    for path in ("formats/amplicon.cnr", "formats/p2-20_1.cnr"):
        bins = cnvlib.read(path)
        n_chroms = bins.chromosome.nunique()
        for method, options in hmm_runs:
            segs = segmentation.do_segmentation(bins, method, **options)
            # More segments than chromosomes, each with positive length
            self.assertGreater(len(segs), n_chroms)
            self.assertTrue((segs.start < segs.end).all())
        # With SNVs supplied as well
        snvs = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
        segs = segmentation.do_segmentation(bins, "hmm", variants=snvs)
        self.assertGreater(len(segs), n_chroms)
def test_empty(self):
    """Instantiate from an empty file; the result has zero rows."""
    empty_arr = cnvlib.read("formats/empty")
    n_rows = len(empty_arr)
    self.assertEqual(n_rows, 0)
def setUp(self):
    """Load the two example .cnr files used by the tests."""
    fixtures = (('tas_cnr', 'formats/amplicon.cnr'),
                ('wgs_cnr', 'formats/wgs-chr17.cnr'))
    for attr_name, path in fixtures:
        setattr(self, attr_name, cnvlib.read(path))
def load_cnx(fname, gene_info, min_weight=0, is_segment=False):
    """Load .cnr or .cns file, extract per-gene 'log2' and 'size' values.

    Only autosomal rows of the file are used. Each gene in `gene_info` is
    assigned the 'log2' value and the size (end - start) of the bin or
    segment that contains the gene's midpoint; genes whose midpoint falls in
    a gap between rows are skipped. Gene names that end up duplicated are
    dropped entirely (all copies).

    Parameters
    ----------
    fname :
        Path of the .cnr or .cns file to read.
    gene_info :
        Table with at least 'gene' and 'midpoint' columns.
        NOTE(review): presumably a pandas DataFrame that also carries a
        chromosome column for `by_shared_chroms` -- confirm against caller.
    min_weight :
        If nonzero, rows with 'weight' below this value are dropped first.
    is_segment :
        Not read by this function. Segment input is detected by the
        presence of a 'probes' column, in which case each row's 'gene'
        field is split on commas.

    Returns
    -------
    tuple
        (basename(fname), DataFrame indexed by gene name and sorted by that
        index, with columns 'log2' and 'size').

    Example
    -------
    ::

        idx  Segments:      | Midpoints:
        0    0     90       | 0, 50
        X                   | 90, 99
        1    100   200      | 100, 150, 199
        2    200   2000     | 200, 201, 1000
        X                   | 3000
        3    5000  5050     | 5000, 5020
        4    5050  6000     | 5050, 5500
        X                   | 6600

        >>> starts.searchsorted(gene_mids, 'right')
        array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5])
        >>> ends.searchsorted(gene_mids, 'right')
        array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5])
                     X  X                    X              X   <-gaps

    """
    d = cnvlib.read(fname).autosomes().data
    if min_weight:
        ok_wt = d['weight'] >= min_weight
        d = d[ok_wt]
        print("Dropped", (~ok_wt).sum(), "rows with weight below", min_weight)
    # Drop genes that aren't also listed in the .cnr/.cns file?
    print("Filtering out bad gene names from gene_info")
    if 'probes' in d.columns:
        # It's segments -- multiple genes
        ok_gene_names = set()
        for x in d['gene'].str.split(','):
            ok_gene_names.update(x)
    else:
        ok_gene_names = d['gene']
    mask_to_keep = gene_info['gene'].isin(ok_gene_names)
    print("Keeping", mask_to_keep.sum(), "/", len(mask_to_keep),
          "gene names in gene_info")
    gene_info = gene_info[mask_to_keep]
    chunks = []
    for _chrom, info_rows, cnx_rows in by_shared_chroms(gene_info, d, False):
        info_midpoints = info_rows['midpoint'].values
        info_genes = info_rows['gene'].values
        # Locate which segments/bins each gene midpoint falls within
        # - Compare both start and end to ensure (start <= midpoint < end)
        # - If not, then skip that gene
        # NOTE(review): searchsorted assumes the start/end columns are
        # sorted within each chromosome -- confirm against the input.
        cnx_starts = cnx_rows['start'].values
        starts_idx = cnx_starts.searchsorted(info_midpoints, 'right')
        cnx_ends = cnx_rows['end'].values
        ends_idx = cnx_ends.searchsorted(info_midpoints, 'right')
        # Inside row i: (i + 1) starts are <= midpoint but only i ends are
        ok_genes_mask = (starts_idx == ends_idx + 1)
        # Convert the "number of starts passed" count to a 0-based row index
        genes_in_cnx_idx = starts_idx.take(ok_genes_mask.nonzero()[0]) - 1
        gene_log2 = cnx_rows['log2'].values[genes_in_cnx_idx]
        gene_sizes = (cnx_ends -
                      cnx_starts)[genes_in_cnx_idx]
        # Stash 'em, including gene name
        chunk_df = pd.DataFrame({
            'gene': info_genes[ok_genes_mask],
            'log2': gene_log2,
            'size': gene_sizes
        })
        chunks.append(chunk_df)
    # NOTE(review): `chunks` is empty if the loop above never runs; pandas
    # documents that concat() rejects an empty list -- confirm whether that
    # case can occur.
    df = pd.concat(chunks)
    # Drop all rows whose gene name appears more than once
    if not df['gene'].is_unique:
        dup_idx = df['gene'].duplicated(keep=False)
        print("Found", dup_idx.sum(), "duplicated gene names in", fname,
              file=sys.stderr)
        df = df[~dup_idx]
    df = df.set_index('gene').sort_index()
    return basename(fname), df
"""Extract target and antitarget BED files from a CNVkit reference file. Once you have a stable CNVkit reference for your platform, you can use this script to drop the "bad" bins from your target and antitarget BED files and avoid unnecessarily calculating coverage in those bins during future runs. This script is also useful to recover the target and antitarget BED files that match the reference if those BED files are missing or you're not sure which ones are correct. """ import argparse import logging import cnvlib from cnvlib import reference from skgenome import tabio logging.basicConfig(level=logging.INFO, format="%(message)s") AP = argparse.ArgumentParser(description=__doc__) AP.add_argument("reference", help="Reference file.") AP.add_argument("-o", "--output", help="Output base name (extensions added automatically).") args = AP.parse_args() ref = cnvlib.read(args.reference) targets, antitargets = reference.reference2regions(ref) name = args.output or ref.sample_id tabio.write(targets, name + '.target.bed', 'bed4') tabio.write(antitargets, name + '.antitarget.bed', 'bed4')
#!/usr/bin/env python """Sum of arm-level absolute log2 values. Input: *.cns, with arm-level segmentation (segment -m none) Output: table """ from __future__ import division, print_function import sys import cnvlib for fname in sys.argv[1:]: cna = cnvlib.read(fname) stat = (cna.autosomes()['log2'].abs()**2).sum() print("%.2f" % stat, cna.sample_id, sep='\t')
def test_import_theta(self):
    """The 'import-theta' command.

    Every imported segment set must be non-empty and no longer than the
    original segments.
    """
    original = cnvlib.read("formats/nv3.cns")
    n_original = len(original)
    results_path = "formats/nv3.n3.results"
    for imported in commands.do_import_theta(original, results_path):
        self.assertTrue(0 < len(imported) <= n_original)
def setUp(self):
    """Load the example file named by EX_CNR before each test."""
    example = cnvlib.read(EX_CNR)
    self.ex_cnr = example
def setUp(self):
    """Load the example reference file before each test."""
    example = cnvlib.read('formats/reference-tr.cnn')
    self.ex_cnr = example
This lets us skip recalculating GC and RepeatMasker values from the reference genome sequence when creating another CNVkit reference. Both CNVkit references must have the same number of rows (corresponding to the same positions). """ import argparse import sys import cnvlib AP = argparse.ArgumentParser(description=__doc__) AP.add_argument("orig") AP.add_argument("other") AP.add_argument("-o", "--output", type=argparse.FileType('w'), default=sys.stdout) args = AP.parse_args() orig_arr = cnvlib.read(args.orig) other_arr = cnvlib.read(args.other) assert len(other_arr) == len(orig_arr) other_arr["gc"] = orig_arr["gc"] other_arr["rmask"] = orig_arr["rmask"] other_arr.sort() other_arr.sort_columns() other_arr.write(args.output)
def clipped_rolling_mean(values, window):
    """Return the centered rolling mean of `values` clipped to [-3, 3].

    The window shrinks at the edges (min_periods=1), so the output has one
    value per input value. The result is returned via `.values`.
    """
    return (values
            .clip(-3, 3)
            .rolling(window, min_periods=1, center=True)
            .mean()
            .values)


def smooth_by_arm(cnarr, window):
    """Return a copy of `cnarr` with 'log2' smoothed within each arm."""
    smoothed_arms = []
    for _chrom, arm in cnarr.by_arm():
        smoothed_arms.append(clipped_rolling_mean(arm['log2'], window))
    table = cnarr.data.assign(log2=np.concatenate(smoothed_arms))
    return cnarr.as_dataframe(table)


AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('cnr_fnames', nargs='+')
AP.add_argument('-w', '--window', type=int, default=100,
                help="Window size for smoothing.")
AP.add_argument('-d', '--output-dir', default='.')
args = AP.parse_args()

# Smooth each input and write it under the output directory, inserting
# "tsmooth<window>" before the original file extension
for fname in args.cnr_fnames:
    cnr = smooth_by_arm(cnvlib.read(fname), args.window)
    base, ext = os.path.basename(fname).rsplit(".", 1)
    outfname = "{}/{}.tsmooth{}.{}".format(
        args.output_dir, base, args.window, ext)
    tabio.write(cnr, outfname)
    print("Wrote", outfname, file=sys.stderr)
def main(args):
    """Plot two datasets side by side, zoomed in on the selected gene(s).

    The left panel shows the bins/segments read from `args.cnr_fname` and
    `args.cns_fname`; the right panel shows those read from
    `args.cghr_fname` and `args.cghs_fname`. Both panels share their x and
    y axes. `args.gene_name` is a comma-separated list of gene names, and
    the display window extends `args.window_width` beyond the first and
    last gene. The figure is saved as PDF to `args.output` if given,
    otherwise displayed interactively.

    Raises ValueError unless the requested genes map to exactly one
    chromosome.
    """
    # Load data
    cnarr = cnvlib.read(args.cnr_fname)
    # cnarr['weight'] = numpy.repeat(.78, len(cnarr))
    segarr = cnvlib.read(args.cns_fname)
    acgharr = cnvlib.read(args.cghr_fname)
    asegarr = cnvlib.read(args.cghs_fname)
    # Find the genomic location matching the specified gene(s)
    gene_names = args.gene_name.split(',')
    gene_coords = plots.gene_coords_by_name(cnarr, gene_names)
    # NOTE(review): this also fires when gene_coords is empty, in which case
    # the "split across chromosomes" wording is misleading.
    if not len(gene_coords) == 1:
        raise ValueError("Genes %s are split across chromosomes %s"
                         % (args.gene_name, gene_coords.keys()))
    chrom, genes = gene_coords.popitem()
    genes.sort()
    # Set the display window to the selected genes +/- a margin
    # (presumably each gene entry begins with (start, end, ...) -- confirm)
    window_coords = (genes[0][0] - args.window_width,
                     genes[-1][1] + args.window_width)
    # Select the bins and segments to draw for each dataset
    cnv_sel_probes, cnv_sel_segs = get_plot_args(cnarr, segarr, chrom,
                                                 window_coords)
    acgh_sel_probes, acgh_sel_segs = get_plot_args(acgharr, asegarr, chrom,
                                                   window_coords)
    # Create a figure grid w/ 2 side-by-side axes; width scales with the
    # number of genes
    _fig = pyplot.figure(figsize=(3.5 * len(genes), 3.5))
    axgrid = pyplot.GridSpec(1, 2, wspace=0)
    leftax = pyplot.subplot(axgrid[0])
    rightax = pyplot.subplot(axgrid[1], sharex=leftax, sharey=leftax)
    plots.cnv_on_chromosome(leftax, cnv_sel_probes, cnv_sel_segs, genes)
    plots.cnv_on_chromosome(rightax, acgh_sel_probes, acgh_sel_segs, genes)
    # Tweak aesthetics: the panels abut (wspace=0), so the right panel
    # drops its y tick labels, y label and title
    rightax.tick_params(labelleft=False, left=False)
    leftax.tick_params(labelleft=True)
    leftax.set_xlabel("Position (Mb)")
    rightax.set_ylabel('')
    rightax.set_title('')
    # Rotate & cull x-axis (position) labels
    if len(genes) == 1:
        xlabels = get_xtick_values(acgh_sel_probes)
        leftax.set_xticks(xlabels)
        leftax.set_xticklabels(map(str, xlabels), rotation=60)
        rightax.set_xticks(xlabels)
        rightax.set_xticklabels(map(str, xlabels), rotation=60)
    # Set sensible y-axis limits
    # (disabled alternative: limits derived from the bin-level values)
    # all_y = numpy.concatenate((cnv_sel_probes.coverage,
    #                            acgh_sel_probes.coverage))
    # leftax.set_ylim(plots.limit(min(all_y) - .1, -5.0, -.3),
    #                 plots.limit(max(all_y) + .25, .3, 5.0))
    if args.gene_name == 'CDKN2A':
        # Special case: fit the y-range to the segment values of both panels
        all_y = numpy.concatenate((cnv_sel_segs.log2, acgh_sel_segs.log2))
        print("all_y:", tuple(all_y))
        leftax.set_ylim(limit(min(all_y) - .3, -5.0, -.5),
                        limit(max(all_y) + .3, .5, 5.0))
    else:
        # Fixed y-range for every other gene
        leftax.set_ylim(-2.1, 1.1)
    # Save it.
    if args.output:
        pyplot.savefig(args.output, format='pdf', bbox_inches='tight')
        print("Wrote", args.output, file=sys.stderr)
    else:
        pyplot.show()
def test_breaks(self):
    """The 'breaks' command must report at least one row."""
    bins = cnvlib.read("formats/amplicon.cnr")
    segments = cnvlib.read("formats/amplicon.cns")
    found = commands.do_breaks(bins, segments, 4)
    self.assertGreater(len(found), 0)
help="""CNVkit coverage files to update (*.targetcoverage.cnn, *.antitargetcoverage.cnn).""") AP.add_argument("-d", "--output-dir", default=".", help="""Directory to write output .cnn files.""") AP.add_argument( "-s", "--suffix", default=".updated", help="""Filename suffix to add before the '.cnn' extension in output files. [Default: %(default)s]""") args = AP.parse_args() for fname in args.cnn_files: cnarr = cnvlib.read(fname) # Convert coverage depths from log2 scale to absolute scale. # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11; # earlier than that, the average 'depth' will be about 1.0. cnarr['depth'] = np.exp2(cnarr['log2']) # Rename "Background" bins to "Antitarget" # NB: The default off-target bin name was changed in CNVkit v0.9.0 cnarr['gene'] = cnarr['gene'].replace("Background", cnvlib.params.ANTITARGET_NAME) cnarr.sort_columns() # Construct the output filename base, ext = os.path.basename(fname).rsplit('.', 1) if '.' in base: base, zone = base.rsplit('.', 1) out_fname = '.'.join((base + args.suffix, zone, ext)) else: