コード例 #1
0
ファイル: cut_segments.py プロジェクト: etal/cnvkit-examples
def read_paired_genes(cbs1, cbs2, interval):
    """Pair each segment of the first file with a value from the second.

    Both segment files are loaded with ``cnvlib.read`` and sorted. Apart from
    chromosomes rejected by ``is_skipped_chromosome``, the two files must
    list the same set of chromosomes.

    Segments of `cbs1` with fewer than ``MIN_ACGH_PROBES`` probes, or on a
    skipped chromosome, are ignored. For every other segment, the part of
    `cbs2` within the same span is summarized by ``segment_cn``.

    The `interval` argument is accepted but not used by this function.

    Yields tuples of (chromosome, value from `cbs1`, value from `cbs2`,
    start, end, "chrom:start-end" label).

    Raises ValueError if the files disagree on non-skipped chromosomes.
    """
    first_segs = cnvlib.read(cbs1)
    second_segs = cnvlib.read(cbs2)

    # Chromosomes that appear in only one of the two inputs
    unshared = set(first_segs.chromosome) ^ set(second_segs.chromosome)
    mismatched = sorted(chrom for chrom in unshared
                        if not is_skipped_chromosome(chrom))
    if mismatched:
        raise ValueError("Mismatched chromosomes: " + ' '.join(mismatched))

    first_segs.sort()
    second_segs.sort()

    for chrom, start, end, _name, value1, n_probes in first_segs:
        if n_probes < MIN_ACGH_PROBES or is_skipped_chromosome(chrom):
            continue
        # The segment's own name is replaced by a coordinate label
        label = "{}:{}-{}".format(chrom, start, end)
        overlapped = second_segs.in_range(chrom, start, end, trim=True)
        if len(overlapped) == 0:
            print("Skipping", label, "-- covers no CNVkit segments")
        else:
            value2 = segment_cn(overlapped)
            yield (chrom, value1, value2, start, end, label)
コード例 #2
0
def main(args):
    """Plot two bin/segment data sets on vertically stacked, shared axes.

    Reads the tables named by ``args.cnr_fname``, ``args.cns_fname``,
    ``args.cghr_fname`` and ``args.cghs_fname``. The .cnr/.cns pair is drawn
    on the top axes and the cghr/cghs pair on the bottom axes. The figure is
    saved as PDF to ``args.output`` when that is set; otherwise it is shown
    interactively.
    """
    # Load the four inputs in a fixed order
    seq_bins, seq_segs, acgh_bins, acgh_segs = (
        cnvlib.read(fname)
        for fname in (args.cnr_fname, args.cns_fname,
                      args.cghr_fname, args.cghs_fname))

    # Two rows, one column; the vertical gap leaves room for tick labels
    # between the axes
    pyplot.figure(figsize=(10, 3.5))
    grid = pyplot.GridSpec(2, 1, hspace=.37)
    upper = pyplot.subplot(grid[0])
    lower = pyplot.subplot(grid[1], sharex=upper, sharey=upper)
    lower.tick_params(labelbottom=False)
    upper.tick_params(labelbottom=True)

    # The y-range comes from the autosomal segment log2 values of both data
    # sets, padded by 0.2 and passed through `limit` with the bounds shown
    seg_log2 = numpy.concatenate((seq_segs.autosomes().log2,
                                  acgh_segs.autosomes().log2))
    y_low = limit(min(seg_log2) - .2, -5.0, -.5)
    y_high = limit(max(seg_log2) + .2, .5, 5.0)
    upper.set_ylim(y_low, y_high)

    # Bottom panel first, then the top panel
    plots.cnv_on_genome(lower, acgh_bins, acgh_segs, PAD)
    plots.cnv_on_genome(upper, seq_bins, seq_segs, PAD)

    if not args.output:
        pyplot.show()
    else:
        pyplot.savefig(args.output, format='pdf', bbox_inches=0)
        print("Wrote", args.output, file=sys.stderr)
コード例 #3
0
ファイル: test_cnvlib.py プロジェクト: nordhuang/cnvkit
 def test_by_segment(self):
     """Check that by_segment yields one (segment, bins) pair per segment."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     pairs = [(seg, seg_bins) for seg, seg_bins in bins.by_segment(segs)]
     # Index of the final pair; stays 0 if nothing was yielded
     last_index = len(pairs) - 1 if pairs else 0
     self.assertEqual(len(segs), last_index + 1)
コード例 #4
0
ファイル: pair_segments.py プロジェクト: etal/cnvkit-examples
def read_paired_genes(cbs1, cbs2, interval):
    """Get the paired segment CN values for each targeted gene.

    Both segment files are loaded with ``cnvlib.read``, restricted to
    autosomes and sorted; they must list the same set of chromosomes. Gene
    regions come from ``interval2genes(interval)``. For each gene that is not
    on a skipped chromosome, the segments within the gene's span are selected
    from both files and each selection is summarized by ``segment_cn``.

    Genes not covered by a segment in either file are reported on stdout and
    skipped.

    Yields tuples of (chromosome, value from `cbs1`, value from `cbs2`,
    start, end, gene name).

    Raises ValueError if the two files list different chromosomes.
    """
    segments1 = cnvlib.read(cbs1).autosomes()
    segments2 = cnvlib.read(cbs2).autosomes()
    non_overlapping = set(segments1.chromosome).symmetric_difference(
        set(segments2.chromosome))
    if non_overlapping:
        raise ValueError("Mismatched chromosomes: " +
                         ' '.join(sorted(non_overlapping)))
    segments1.sort()
    segments2.sort()

    genes = list(interval2genes(interval))
    print("#Genes tiled:", len(genes), file=sys.stderr)

    has_chr = segments1.chromosome[0].startswith('chr')
    for chrom, start, end, name in genes:
        if is_skipped_chromosome(chrom):
            continue
        if not has_chr and chrom.startswith('chr'):
            # Segments use bare chromosome names: drop the gene's 'chr'
            # prefix. Names that carry no prefix are left untouched instead
            # of blindly losing their first three characters.
            chrom = chrom[3:]
        sel1 = segments1.in_range(chrom, start, end, mode='trim')
        sel2 = segments2.in_range(chrom, start, end, mode='trim')
        if len(sel1) == 0 or len(sel2) == 0:
            print("Skipping", name, "-- not covered by a segment")
            continue
        val1 = segment_cn(sel1)
        val2 = segment_cn(sel2)
        yield (chrom, val1, val2, start, end, name)
コード例 #5
0
ファイル: test_cnvlib.py プロジェクト: kyleabeauchamp/cnvkit
 def test_ranges(self):
     """Test range methods: by_ranges, in_range, in_ranges."""
     modes = ('outer', 'trim')
     all_bins = cnvlib.read("formats/amplicon.cnr")
     all_segs = cnvlib.read("formats/amplicon.cns")
     segs_by_chrom = dict(all_segs.by_chromosome())
     for chrom, chrom_bins in all_bins.by_chromosome():
         last_seg_idx = 0
         n_bins_seen = 0
         chrom_segs = segs_by_chrom[chrom]
         for last_seg_idx, (seg, seg_bins) in enumerate(
                 chrom_bins.by_ranges(chrom_segs)):
             n_bins_seen += len(seg_bins)
             # The segment's probe count is the number of bins it spans
             self.assertEqual(seg['probes'], len(seg_bins))
             # Selecting by the segment's coordinates gives the same bins
             for mode in modes:
                 self.assertEqual(len(seg_bins), len(
                     all_bins.in_range(seg['chromosome'], seg['start'],
                                       seg['end'], mode=mode)))
         # Every segment was visited, and together they cover all bins
         self.assertEqual(len(chrom_segs), last_seg_idx + 1)
         self.assertEqual(len(chrom_bins), n_bins_seen)
         # The multi-range selection agrees, from the full array and from
         # the per-chromosome subset
         for mode in modes:
             self.assertEqual(len(chrom_bins), len(
                 all_bins.in_ranges(chrom, chrom_segs['start'],
                                    chrom_segs['end'], mode=mode)))
             self.assertEqual(len(chrom_bins), len(
                 chrom_bins.in_ranges(starts=chrom_segs['start'],
                                      ends=chrom_segs['end'], mode=mode)))
コード例 #6
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_batch(self):
     """The 'batch' command."""
     bam_fname = "formats/na12878-chrM-Y-trunc.bam"
     fasta_fname = "formats/chrM-Y-trunc.hg19.fa"
     targets_fname = "formats/my-targets.bed"
     annot_fname = "formats/my-refflat.bed"

     # Single-sample WGS reference: one reference row per target region
     wgs_ref, wgs_targets, _unused = batch.batch_make_reference(
         [bam_fname], None, None, True, fasta_fname, annot_fname, True, 500,
         None, None, None, None, 'build', 1, False, "wgs")
     self.assertEqual(wgs_ref, 'build/reference.cnn')
     wgs_ref_arr = cnvlib.read(wgs_ref, 'bed')
     wgs_regions = tabio.read(wgs_targets, 'bed')
     self.assertEqual(len(wgs_ref_arr), len(wgs_regions))

     # Single-sample hybrid-capture reference: targets plus antitargets
     hyb_ref, hyb_targets, hyb_antitargets = batch.batch_make_reference(
         [bam_fname], targets_fname, None, True, fasta_fname, None, True, 10,
         None, 1000, 100, None, 'build', 1, False, "hybrid")
     self.assertEqual(hyb_ref, 'build/reference.cnn')
     hyb_ref_arr = cnvlib.read(hyb_ref, 'bed')
     on_target = tabio.read(hyb_targets, 'bed')
     off_target = tabio.read(hyb_antitargets, 'bed')
     self.assertEqual(len(hyb_ref_arr), len(on_target) + len(off_target))

     # Process the same sample against the hybrid-capture reference
     batch.batch_run_sample(
         bam_fname, hyb_targets, hyb_antitargets, hyb_ref, 'build', True,
         True, True, None, False, False, "hybrid", 1)
     sample_segs = cnvlib.read("build/na12878-chrM-Y-trunc.cns")
     self.assertGreater(len(sample_segs), 0)
コード例 #7
0
ファイル: test_cnvlib.py プロジェクト: mpschr/cnvkit
 def test_export(self):
     """Run the 'export' command with each format."""
     # SEG: two input files give more rows than one
     one_sample = export.export_seg(["formats/tr95t.cns"])
     self.assertGreater(len(one_sample), 0)
     two_samples = export.export_seg(["formats/tr95t.cns",
                                      "formats/cl_seq.cns"])
     self.assertGreater(len(two_samples), len(one_sample))

     # THetA2: first without, then with a reference
     theta_input = cnvlib.read("formats/tr95t.cns")
     without_ref = export.export_theta(theta_input, None)
     self.assertGreater(len(without_ref), 0)
     reference_arr = cnvlib.read("formats/reference-tr.cnn")
     with_ref = export.export_theta(theta_input, reference_arr)
     self.assertGreater(len(with_ref), 0)

     # Formats that calculate absolute copy number
     cases = (("tr95t.cns", 2, True),
              ("cl_seq.cns", 6, True),
              ("amplicon.cns", 2, False))
     for basename, ploidy, is_f in cases:
         segs = cnvlib.read("formats/" + basename)
         # BED: the "ploidy" and "variant" selections drop some segments
         for show in ("ploidy", "variant"):
             bed_rows = export.export_bed(segs, ploidy, True, is_f,
                                          segs.sample_id, show)
             self.assertLess(len(bed_rows), len(segs))
         # BED: "all" keeps every segment
         every_row = export.export_bed(segs, ploidy, True, is_f,
                                       segs.sample_id, "all")
         self.assertEqual(len(every_row), len(segs))
         # VCF: non-empty body with fewer lines than segments
         _vcf_header, vcf_text = export.export_vcf(segs, ploidy, True, is_f)
         self.assertTrue(0 < len(vcf_text.splitlines()) < len(segs))
コード例 #8
0
ファイル: test_cnvlib.py プロジェクト: nordhuang/cnvkit
 def test_gainloss(self):
     """The 'gainloss' command."""
     probes = cnvlib.read("formats/amplicon.cnr")
     # Bins only
     rows = commands.do_gainloss(probes, male_reference=True)
     # assertGreater reports both operands on failure, unlike assertTrue
     self.assertGreater(len(rows), 0)
     # Bins together with segments
     segs = cnvlib.read("formats/amplicon.cns")
     rows = commands.do_gainloss(probes, segs, True, 0.3, 4)
     self.assertGreater(len(rows), 0)
コード例 #9
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_genemetrics(self):
     """The 'genemetrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     # From bins alone
     from_bins = commands.do_genemetrics(bins, male_reference=True)
     self.assertGreater(len(from_bins), 0)
     # From bins plus segments
     segments = cnvlib.read("formats/amplicon.cns")
     from_segments = commands.do_genemetrics(bins, segments, 0.3, 4,
                                             male_reference=True)
     self.assertGreater(len(from_segments), 0)
コード例 #10
0
ファイル: test_cnvlib.py プロジェクト: nordhuang/cnvkit
 def test_metrics(self):
     """The 'metrics' command."""
     cnarr = cnvlib.read("formats/amplicon.cnr")
     segments = cnvlib.read("formats/amplicon.cns")
     resids = metrics.probe_deviations_from_segments(cnarr, segments)
     # There can be no more residuals than bins
     self.assertLessEqual(len(resids), len(cnarr))
     values = metrics.ests_of_scale(resids)
     # Each estimate of scale must be strictly positive
     for val in values:
         self.assertGreater(val, 0)
コード例 #11
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_residuals(self):
     """Check summary statistics of residuals for each grouping argument."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     # The same segments as a plain GenomicArray without extra columns
     bare_regions = GenomicArray(segs.data).drop_extra_columns()
     groupings = (None, segs, bare_regions)
     for grouping in groupings:
         residuals = bins.residuals(grouping)
         self.assertAlmostEqual(0, residuals.mean(), delta=.3)
         self.assertAlmostEqual(1, np.percentile(residuals, 80), delta=.2)
         self.assertAlmostEqual(2, residuals.std(), delta=.5)
コード例 #12
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_metrics(self):
     """The 'metrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     table = metrics.do_metrics(bins, segs, skip_low=True)
     # One row with six columns
     self.assertEqual(table.shape, (1, 6))
     # Every column after the first must hold a positive value in row 0
     stat_columns = table.columns[1:]
     for stat in table.loc[0, stat_columns]:
         self.assertGreater(stat, 0)
コード例 #13
0
ファイル: test_cnvlib.py プロジェクト: kyleabeauchamp/cnvkit
 def test_metrics(self):
     """The 'metrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     residuals = bins.residuals(segs)
     # There can be no more residuals than bins
     self.assertLessEqual(len(residuals), len(bins))
     # Each estimate of scale must be strictly positive
     for estimate in metrics.ests_of_scale(residuals):
         self.assertGreater(estimate, 0)
コード例 #14
0
ファイル: test_commands.py プロジェクト: etal/cnvkit
 def test_bintest(self):
     """The 'bintest' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     n_bins = len(bins)
     # Bins alone: some, but not all, rows are returned
     hits = commands.do_bintest(bins, alpha=.05)
     self.assertGreater(len(hits), 0)
     self.assertLess(len(hits), n_bins)
     # Against segments: at least one row per segment, fewer than all bins
     hits = commands.do_bintest(bins, segs, target_only=True)
     self.assertGreaterEqual(len(hits), len(segs))
     self.assertLess(len(hits), len(bins))
コード例 #15
0
ファイル: cnv_ztest.py プロジェクト: chapmanb/cnvkit
def _cmd_ztest(args):
    """Run ``do_ztest`` on the input bins and write any resulting rows.

    When ``args.segment`` is given, the segments are loaded and no sample sex
    is passed on. Otherwise the sample sex is determined by
    ``verify_sample_sex``. A non-empty result is written to ``args.output``,
    or to standard output if no output is named.
    """
    cnarr = cnvlib.read(args.cnarr)
    if not args.segment:
        segments = None
        is_sample_female = verify_sample_sex(cnarr, args.sample_sex,
                                             args.male_reference)
    else:
        segments = cnvlib.read(args.segment)
        is_sample_female = None

    result = do_ztest(cnarr, segments, args.male_reference, is_sample_female,
                      args.alpha, args.target)
    if len(result):
        destination = args.output or sys.stdout
        tabio.write(result, destination)
コード例 #16
0
ファイル: test_cnvlib.py プロジェクト: chapmanb/cnvkit
 def test_segmetrics(self):
     """The 'segmetrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     stats = segmetrics.do_segmetrics(bins, segs,
                                      location_stats=['mean', 'median'],
                                      spread_stats=['stdev'],
                                      interval_stats=['pi', 'ci'])
     # Restrict to segments with enough supporting probes for sane stats
     stats = stats[stats['probes'] > 3]
     # Each interval must strictly bracket its matching location statistic
     checks = (('pi_lo', 'pi_hi', 'median'),
               ('ci_lo', 'ci_hi', 'mean'))
     for low_col, high_col, center_col in checks:
         self.assertTrue((stats[low_col] < stats[center_col]).all())
         self.assertTrue((stats[high_col] > stats[center_col]).all())
コード例 #17
0
ファイル: test_cnvlib.py プロジェクト: nordhuang/cnvkit
 def test_segmetrics(self):
     """The 'segmetrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")
     interval_funcs = (commands._confidence_interval,
                       commands._prediction_interval)
     for interval_func in interval_funcs:
         lower, upper = commands._segmetric_interval(segs, bins,
                                                     interval_func)
         # One lower and one upper bound per segment
         self.assertEqual(len(lower), len(segs))
         self.assertEqual(len(upper), len(segs))
         # Only segments with more than 3 probes are checked
         enough_probes = (np.asarray(segs['probes']) > 3)
         seg_means = segs[enough_probes, 'log2']
         lower_kept = lower[enough_probes]
         upper_kept = upper[enough_probes]
         self.assertTrue((lower_kept < seg_means).all())
         self.assertTrue((seg_means < upper_kept).all())
コード例 #18
0
ファイル: test_genome.py プロジェクト: JimmyLiJing/cnvkit
 def test_ranges_into(self):
     """Check into_ranges output lengths for bins, segments and variants."""
     bins = read("formats/amplicon.cnr")
     segs = read("formats/amplicon.cns")
     # Bin 'gene' values into segments: one result per segment
     genes_per_seg = bins.into_ranges(segs, 'gene', '-')
     self.assertEqual(len(genes_per_seg), len(segs))
     # With a VCF: into segments, then into bins
     variants = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
     baf_per_seg = variants.into_ranges(segs, 'alt_freq', np.nan,
                                        np.nanmedian)
     self.assertEqual(len(baf_per_seg), len(segs))
     baf_per_bin = variants.into_ranges(bins, 'alt_freq', 0.0, np.max)
     self.assertEqual(len(baf_per_bin), len(bins))
     # Edge cases: an empty array on either side must not raise
     empty_arr = tabio.read("formats/empty")
     segs.into_ranges(empty_arr, 'start', 0, int)
     empty_arr.into_ranges(segs, 'end', 0, 0)
コード例 #19
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_drop_extra_columns(self):
     """Test removal of optional 'gc' column."""
     original = cnvlib.read('formats/reference-tr.cnn')
     self.assertIn('gc', original)
     stripped = original.drop_extra_columns()
     self.assertNotIn('gc', stripped)
     # The log2 values are unchanged
     log2_same = (stripped['log2'] == original['log2'])
     self.assertTrue(log2_same.all())
コード例 #20
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_export_theta(self):
     """The 'export theta' command."""
     segs = cnvlib.read("formats/tr95t.cns")
     n_autosomal = len(segs.autosomes())
     # Without a reference: one row per autosomal segment
     rows = export.export_theta(segs, None)
     self.assertEqual(len(rows), n_autosomal)
     # With a reference: the same row count
     reference_arr = cnvlib.read("formats/reference-tr.cnn")
     rows = export.export_theta(segs, reference_arr)
     self.assertEqual(len(rows), n_autosomal)
     # SNP tables: each is non-empty and shorter than the variant array
     variants = commands.load_het_snps("formats/na12878_na12882_mix.vcf",
                                       "NA12882", "NA12878", 15, None)
     tumor_snps, normal_snps = export.export_theta_snps(variants)
     for snp_table in (tumor_snps, normal_snps):
         self.assertLess(len(snp_table), len(variants))
         self.assertGreater(len(snp_table), 0)
コード例 #21
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_basic(self):
     """Test basic container functionality and magic methods."""
     fname = 'formats/reference-tr.cnn'
     arr = cnvlib.read(fname)
     # Length: every line of the file except one
     self.assertEqual(len(arr), linecount(fname) - 1)
     # Equality: a second read of the same file compares equal
     twin = cnvlib.read(fname)
     self.assertEqual(arr, twin)
     # Item access: writing back what was just read changes nothing
     first_row = arr[0]
     arr[0] = first_row
     arr[3:4] = arr[3:4]
     arr[6:10] = arr[6:10]
     self.assertEqual(tuple(arr[0]), tuple(twin[0]))
     self.assertEqual(arr[3:6], twin[3:6])
コード例 #22
0
ファイル: test_cnvlib.py プロジェクト: zengfengbo/cnvkit
 def test_by_chromosome(self):
     """Check that the per-chromosome groups together hold every row."""
     for path in ("formats/amplicon.cnr", "formats/cl_seq.cns"):
         arr = cnvlib.read(path)
         n_grouped = sum(len(chrom_rows)
                         for _chrom, chrom_rows in arr.by_chromosome())
         self.assertEqual(n_grouped, len(arr))
コード例 #23
0
def main(args):
    """Write the regions of a reference file to two BED files.

    The reference named by ``args.reference`` is divided by
    ``reference.reference2regions``. The two parts are written to
    ``<name>.target.bed`` and ``<name>.antitarget.bed``, where ``<name>`` is
    ``args.output`` or, if that is empty, the reference's sample ID.
    """
    ref_arr = cnvlib.read(args.reference)
    targets, antitargets = reference.reference2regions(ref_arr)
    prefix = args.output or ref_arr.sample_id
    outputs = ((targets, '.target.bed'),
               (antitargets, '.antitarget.bed'))
    for regions, suffix in outputs:
        write_bed(regions, prefix + suffix)
コード例 #24
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_segment_parallel(self):
     """The 'segment' command, in parallel."""
     bins = cnvlib.read("formats/amplicon.cnr")
     # Segment with two processes first, then with one
     parallel, serial = (
         segmentation.do_segmentation(bins, "haar", processes=n_procs)
         for n_procs in (2, 1))
     # Both runs give tables of the same shape and equally sized metadata
     self.assertEqual(parallel.data.shape, serial.data.shape)
     self.assertEqual(len(parallel.meta), len(serial.meta))
コード例 #25
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_segmetrics(self):
     """The 'segmetrics' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     segs = cnvlib.read("formats/amplicon.cns")

     def bootstrap_ci(values):
         return segmetrics.confidence_interval_bootstrap(values, 0.05, 100)

     def pred_interval(values):
         return segmetrics.prediction_interval(values, 0.05)

     for interval_func in (bootstrap_ci, pred_interval):
         lower, upper = segmetrics.segmetric_interval(segs, bins,
                                                      interval_func)
         # One lower and one upper bound per segment
         self.assertEqual(len(lower), len(segs))
         self.assertEqual(len(upper), len(segs))
         # Only segments with more than 3 probes are checked
         enough_probes = (segs['probes'] > 3).values
         seg_means = segs[enough_probes, 'log2']
         lower_kept = lower[enough_probes]
         upper_kept = upper[enough_probes]
         self.assertTrue((lower_kept < seg_means).all())
         self.assertTrue((seg_means < upper_kept).all())
コード例 #26
0
ファイル: plot_cnv_bias.py プロジェクト: etal/cnvkit-examples
def main(args):
    """Plot the chosen bias for the given samples and save or show it.

    The reference is read from ``args.reference`` or, if that is empty, from
    ``args.no_reference``; ratios are used only in the former case. With
    ``args.batch`` all samples go through ``plot_overlaid``, otherwise
    through ``plot_separate``. The figure is saved as PDF to ``args.output``
    when that is set; otherwise it is shown interactively.
    """
    use_ratio = bool(args.reference)
    reference_arr = read(args.reference or args.no_reference)
    first_sample = read(args.filenames[0])
    bias_func = get_bias_func(args.mode, reference_arr, first_sample)

    # Header for the per-sample statistics table
    print("Sample \tRaw probes \tTrend line \tReduction (%)")
    if not args.batch:
        plot_separate(args.filenames, reference_arr, bias_func, args.mode,
                      use_ratio)
    else:
        plot_overlaid(args.filenames, reference_arr, bias_func, args.mode,
                      use_ratio, args.color)

    if not args.output:
        pyplot.show()
    else:
        pyplot.savefig(args.output, format='pdf', bbox_inches=0)
        echo("Wrote", args.output)
コード例 #27
0
ファイル: test_cnvlib.py プロジェクト: zengfengbo/cnvkit
 def test_gainloss(self):
     """The 'gainloss' command, with bins only and with fresh segments."""
     probes = cnvlib.read("formats/amplicon.cnr")
     # Bins only
     rows = commands.do_gainloss(probes, male_reference=True)
     # assertGreater reports both operands on failure, unlike assertTrue
     self.assertGreater(len(rows), 0)
     # Bins together with segments computed from the same file
     segs = segmentation.do_segmentation("formats/amplicon.cnr", False,
                                         "haar")
     rows = commands.do_gainloss(probes, segs, True, 0.3, 4)
     self.assertGreater(len(rows), 0)
コード例 #28
0
ファイル: test_cnvlib.py プロジェクト: kyleabeauchamp/cnvkit
 def test_segment(self):
     """The 'segment' command."""
     bins = cnvlib.read("formats/amplicon.cnr")
     # R methods are in another script
     # Default options first, then an explicit threshold with skip_low
     option_sets = ({}, {"threshold": .001, "skip_low": True})
     for options in option_sets:
         segs = segmentation.do_segmentation(bins, "haar", **options)
         self.assertGreater(len(segs), 0)
コード例 #29
0
ファイル: test_cnvlib.py プロジェクト: JimmyLiJing/cnvkit
 def test_export_nexus(self):
     """The 'export nexus-basic' and 'nexus-ogt' commands."""
     bins = cnvlib.read("formats/amplicon.cnr")
     # nexus-basic: one output row per bin
     basic_table = export.export_nexus_basic(bins)
     self.assertEqual(len(basic_table), len(bins))
     # nexus-ogt: also one output row per bin, with variants supplied
     variants = commands.load_het_snps("formats/na12878_na12882_mix.vcf",
                                       None, None, 15, None)
     ogt_table = export.export_nexus_ogt(bins, variants, 0.05)
     self.assertEqual(len(ogt_table), len(bins))
コード例 #30
0
ファイル: test_cnvlib.py プロジェクト: zengfengbo/cnvkit
 def test_export(self):
     """Run the 'export' command with the SEG, THetA2 and VCF formats."""
     # SEG: two input files give more rows than one
     seg_rows = export.export_seg(["formats/tr95t.cns"])
     self.assertGreater(len(seg_rows), 0)
     seg2_rows = export.export_seg(["formats/tr95t.cns",
                                    "formats/cl_seq.cns"])
     self.assertGreater(len(seg2_rows), len(seg_rows))
     # THetA2
     _header, theta_rows = export.export_theta("formats/tr95t.cns",
                                               "formats/reference-tr.cnn")
     self.assertGreater(len(theta_rows), 0)
     # VCF: non-empty body with fewer lines than input segments
     for fname, ploidy in (("formats/tr95t.cns", 2),
                           ("formats/cl_seq.cns", 6)):
         cns = cnvlib.read(fname)
         _header, vcf_body = export.export_vcf(cns, ploidy, True, True)
         self.assertTrue(0 < len(vcf_body.splitlines()) < len(cns))
コード例 #31
0
 def test_center_all(self):
     """Test recentering."""
     whole = cnvlib.read('formats/reference-tr.cnn')
     # chr1 starts out with a median near zero, so median-centering it
     # leaves the median at zero
     chr1_arr = whole.in_range('chr1')
     self.assertAlmostEqual(0, np.median(chr1_arr['log2']), places=1)
     chr1_arr.center_all()
     baseline = np.median(chr1_arr['log2'])
     self.assertAlmostEqual(0, baseline)
     # A uniform shift of +2 is undone by median-centering
     shifted = chr1_arr.copy()
     shifted['log2'] += 2.0
     shifted.center_all()
     self.assertAlmostEqual(np.median(shifted['log2']), baseline)
     # The other centering methods land within 0.1 of the same median
     for how in ("mean", "mode", "biweight"):
         recentered = chr1_arr.copy()
         recentered.center_all(how)
         self.assertLess(abs(recentered['log2'].median() - baseline), 0.1)
コード例 #32
0
ファイル: test_cnvlib.py プロジェクト: weizhiting/cnvkit
    def test_call_sex(self):
        """Test each 'call' method on allosomes."""
        # Columns: file, sample_is_f, ref_is_m, expected log2 on
        # chr1/chrX/chrY, then expected copy number on chr1/chrX/chrY
        cases = (
            ("formats/f-on-f.cns", True, False, 0, 0, None, 2, 2, None),
            ("formats/f-on-m.cns", True, True, 0.585, 1, None, 3, 2, None),
            ("formats/m-on-f.cns", False, False, 0, -1, 0, 2, 1, 1),
            ("formats/m-on-m.cns", False, True, 0, 0, 0, 2, 1, 1),
        )
        # Threshold, clonal pure, then clonal barely-mixed
        call_variants = (
            ("threshold", {}),
            ("clonal", {}),
            ("clonal", {"purity": 0.99}),
        )
        for (fname, sample_is_f, ref_is_m,
             chr1_expect, chrx_expect, chry_expect,
             chr1_cn, chrx_cn, chry_cn) in cases:
            cns = cnvlib.read(fname)
            on_chr1 = (cns.chromosome == 'chr1')
            on_chrx = (cns.chromosome == 'chrX')
            on_chry = (cns.chromosome == 'chrY')

            def check_chrom_means(called):
                per_chrom = [(on_chr1, chr1_cn, chr1_expect),
                             (on_chrx, chrx_cn, chrx_expect)]
                # chrY is only checked for samples not marked female
                if not sample_is_f:
                    per_chrom.append((on_chry, chry_cn, chry_expect))
                for mask, want_cn, want_log2 in per_chrom:
                    self.assertEqual(want_cn, called['cn'][mask].mean())
                    self.assertAlmostEqual(want_log2,
                                           called['log2'][mask].mean(), 0)

            for method, extra in call_variants:
                called_segs = commands.do_call(cns, None, method,
                                               is_reference_male=ref_is_m,
                                               is_sample_female=sample_is_f,
                                               **extra)
                check_chrom_means(called_segs)
0
 def test_call_filter(self):
     """Run 'call' with several filter combinations and check the output."""
     segs = cnvlib.read("formats/tr95t.segmetrics.cns")
     variants = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
     # Each filter individually, then combinations of filters
     filter_sets = (['ampdel'], ['cn'], ['ci'], ['sem'],
                    ['sem', 'cn', 'ampdel'], ['ci', 'cn'])
     call_opts = dict(method="threshold",
                      purity=.9,
                      is_reference_male=True,
                      is_sample_female=True)
     for filter_names in filter_sets:
         called = commands.do_call(segs, variants, filters=filter_names,
                                   **call_opts)
         # Filtering never adds segments
         self.assertLessEqual(len(called), len(segs))
         if 'ampdel' not in filter_names:
             # At least 1 segment per chromosome remains
             n_chroms = len(segs.chromosome.unique())
             self.assertLessEqual(n_chroms, len(called))
         for colname in ('baf', 'cn', 'cn1', 'cn2'):
             self.assertIn(colname, called)
コード例 #34
0
ファイル: test_cnvlib.py プロジェクト: weizhiting/cnvkit
 def test_reference(self):
     """The 'reference' command."""
     # Empty, then unspecified, antitargets
     n_expected = linecount("formats/amplicon.cnr") - 1
     for call_args in ((["formats/amplicon.cnr"], ["formats/empty"]),
                       (["formats/amplicon.cnr"],)):
         built = commands.do_reference(*call_args)
         self.assertEqual(len(built), n_expected)
     # Empty, then unspecified, antitargets for a flat reference
     n_expected = linecount("formats/amplicon.bed")
     for call_args in (("formats/amplicon.bed", "formats/empty"),
                       ("formats/amplicon.bed",)):
         built = commands.do_reference_flat(*call_args)
         self.assertEqual(len(built), n_expected)
     # Misc: split an existing reference into targets and antitargets
     existing = cnvlib.read('formats/reference-tr.cnn')
     on_target, off_target = reference.reference2regions(existing)
     self.assertLess(0, len(off_target))
     n_background = (existing['gene'] == 'Background').sum()
     self.assertEqual(len(off_target), n_background)
     self.assertEqual(len(on_target), len(existing) - len(off_target))
コード例 #35
0
def get_sort_and_smoother(cna_fname, ref_arr, mode):
    """Make a sort_and_smooth func from example CNA and reference.

    Parameters
    ----------
    cna_fname : path of an example sample file; the reference is matched to
        its probes with ``fix.match_ref_to_probes``.
    ref_arr : the reference array.
    mode : 'gc' or 'rmask' to take the bias values from that column of the
        matched reference, or 'edge' to compute them with
        ``fix.make_edge_sorter``.

    Returns a function that takes a sample array of the same length as the
    bias values and returns ``(biases, coverages, fitted)``: the bias values
    in ascending order, the sample's 'coverage' values in that same order (as
    a NumPy array), and the rolling median of those coverages. The function
    also prints one line of variance statistics for the sample.

    Raises ValueError for an unknown `mode`.
    """
    ref_matched = fix.match_ref_to_probes(ref_arr, read(cna_fname))

    if mode in ('gc', 'rmask'):
        biases = ref_matched[mode]
    elif mode == 'edge':
        # Materialize the result: on Python 3, map() returns a one-shot
        # iterator with no len(), but the closure below needs len() and may
        # be called once per sample.
        biases = list(map(fix.make_edge_sorter(ref_matched,
                                               params.INSERT_SIZE),
                          ref_arr))
    else:
        raise ValueError("Unknown mode: %s" % mode)

    def wrapped_sort_and_smooth(this_arr):
        """Sort and smooth."""
        assert len(this_arr) == len(biases)

        # The sorted values get their own name. Assigning to `biases` here
        # would make that name local to this function, and the assert above
        # would then raise UnboundLocalError on every call.
        # The builtin zip is used because izip exists only on Python 2.
        sorted_biases, coverages = zip(
            *sorted(zip(biases, this_arr['coverage']),
                    key=lambda bc: bc[0]))
        # Smooth the coverages, now ordered by bias
        cvg_fitted = rolling_median(coverages, .2)

        # Print some stats
        coverages = np.asarray(coverages)
        orig_var = np.var(coverages)

        def improvement(fitvals):
            """Percent of the original variance removed by subtracting the fit."""
            return 100 * (1 - (np.var(coverages - fitvals) / orig_var))

        print(
            this_arr.sample_id, "\t %.5f    \t %.5f    \t %.4f" %
            (orig_var, np.var(cvg_fitted), improvement(cvg_fitted)))
        return sorted_biases, coverages, cvg_fitted

    return wrapped_sort_and_smooth
コード例 #36
0
 def test_segment_hmm(self):
     """The 'segment' command with HMM methods."""
     # NB: R methods are in another script; haar is pure-Python
     hmm_runs = (("hmm", {}),
                 ("hmm-tumor", {"skip_low": True}),
                 ("hmm-germline", {}))
     for path in ("formats/amplicon.cnr", "formats/p2-20_1.cnr"):
         bins = cnvlib.read(path)
         n_chroms = bins.chromosome.nunique()
         for method, options in hmm_runs:
             segs = segmentation.do_segmentation(bins, method, **options)
             # More segments than chromosomes, each with start before end
             self.assertGreater(len(segs), n_chroms)
             self.assertTrue((segs.start < segs.end).all())
         # With variants supplied, only the segment count is checked
         variants = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
         segs = segmentation.do_segmentation(bins, "hmm", variants=variants)
         self.assertGreater(len(segs), n_chroms)
コード例 #37
0
 def test_empty(self):
     """Instantiate from an empty file."""
     empty_arr = cnvlib.read("formats/empty")
     n_rows = len(empty_arr)
     # Reading succeeds and gives an array with no rows
     self.assertEqual(n_rows, 0)
コード例 #38
0
ファイル: test_r.py プロジェクト: raonyguimaraes/cnvkit
 def setUp(self):
     """Load the two .cnr fixtures onto the test instance."""
     fixtures = (('tas_cnr', 'formats/amplicon.cnr'),
                 ('wgs_cnr', 'formats/wgs-chr17.cnr'))
     for attr_name, path in fixtures:
         setattr(self, attr_name, cnvlib.read(path))
コード例 #39
0
def load_cnx(fname, gene_info, min_weight=0, is_segment=False):
    """Load a .cnr or .cns file and look up the log2 value at each gene.

    The rows returned by ``cnvlib.read(fname).autosomes()`` are optionally
    filtered by weight. `gene_info` is reduced to the gene names that occur in
    the file, and each remaining gene is assigned the log2 value and the size
    of the bin/segment that contains its midpoint
    (``start <= midpoint < end``). Genes whose midpoint falls in a gap between
    rows are skipped, and gene names that occur more than once in the result
    are dropped entirely.

    Parameters
    ----------
    fname : str
        Path to the .cnr or .cns file.
    gene_info : DataFrame
        Must provide 'gene' and 'midpoint' columns, plus whatever
        `by_shared_chroms` uses to group rows by chromosome (not visible
        here -- TODO confirm).
    min_weight : number
        If nonzero, drop rows whose 'weight' is below this value.
    is_segment : bool
        Not referenced in the function body; segment input is recognized by
        the presence of a 'probes' column instead.

    Returns
    -------
    tuple
        ``(basename(fname), df)`` where `df` is a DataFrame indexed by gene
        name (sorted) with columns 'log2' and 'size'.

    Example
    -------

    Rows marked X are gene midpoints that fall outside every segment::

    idx     Segments:     | Midpoints:
    0       0       90      0, 50
    X                       90, 99
    1       100     200     100, 150, 199
    2       200     2000    200, 201, 1000
    X                       3000
    3       5000    5050    5000, 5020
    4       5050    6000    5050, 5500
    X                       6600

    >>> starts.searchsorted(gene_mids, 'right')
    array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 5])
    >>> ends.searchsorted(gene_mids, 'right')
    array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5])
                 X  X                    X              X   <-gaps

    A midpoint lies inside a segment exactly where the first array equals the
    second array plus one.
    """
    d = cnvlib.read(fname).autosomes().data
    if min_weight:
        ok_wt = d['weight'] >= min_weight
        d = d[ok_wt]
        print("Dropped", (~ok_wt).sum(), "rows with weight below", min_weight)

    # Keep only the gene_info rows whose gene name also appears in the
    # .cnr/.cns file
    print("Filtering out bad gene names from gene_info")
    if 'probes' in d.columns:
        # A 'probes' column marks segments: each 'gene' field is a
        # comma-separated list of names, so unpack them into one set
        ok_gene_names = set()
        for x in d['gene'].str.split(','):
            ok_gene_names.update(x)
    else:
        # No 'probes' column: use the 'gene' column values as-is
        ok_gene_names = d['gene']
    mask_to_keep = gene_info['gene'].isin(ok_gene_names)
    print("Keeping", mask_to_keep.sum(), "/", len(mask_to_keep),
          "gene names in gene_info")
    gene_info = gene_info[mask_to_keep]

    chunks = []
    # NOTE(review): by_shared_chroms is defined elsewhere; it is used here as
    # yielding (chromosome, gene_info rows, file rows) triples -- confirm.
    for _chrom, info_rows, cnx_rows in by_shared_chroms(gene_info, d, False):
        info_midpoints = info_rows['midpoint'].values
        info_genes = info_rows['gene'].values
        # Locate which segments/bins each gene midpoint falls within
        # - Compare both start and end to ensure (start <= midpoint < end)
        # - If not, then skip that gene
        # searchsorted assumes the start/end arrays are already sorted.
        cnx_starts = cnx_rows['start'].values
        # Number of rows starting at or before each midpoint
        starts_idx = cnx_starts.searchsorted(info_midpoints, 'right')
        cnx_ends = cnx_rows['end'].values
        # Number of rows ending at or before each midpoint
        ends_idx = cnx_ends.searchsorted(info_midpoints, 'right')
        # Exactly one more row has started than has ended: the midpoint is
        # inside that row
        ok_genes_mask = (starts_idx == ends_idx + 1)
        # Row position of the containing bin/segment for each accepted gene
        genes_in_cnx_idx = starts_idx.take(ok_genes_mask.nonzero()[0]) - 1
        gene_log2 = cnx_rows['log2'].values[genes_in_cnx_idx]
        # 'size' is the length of the containing bin/segment, not of the gene
        gene_sizes = (cnx_ends - cnx_starts)[genes_in_cnx_idx]
        # Stash 'em, including gene name
        chunk_df = pd.DataFrame({
            'gene': info_genes[ok_genes_mask],
            'log2': gene_log2,
            'size': gene_sizes
        })
        chunks.append(chunk_df)

    df = pd.concat(chunks)
    # Drop every row whose gene name occurs more than once
    # (keep=False flags all copies, not just the repeats)
    if not df['gene'].is_unique:
        dup_idx = df['gene'].duplicated(keep=False)
        print("Found",
              dup_idx.sum(),
              "duplicated gene names in",
              fname,
              file=sys.stderr)
        df = df[~dup_idx]
    df = df.set_index('gene').sort_index()
    return basename(fname), df
コード例 #40
0
"""Extract target and antitarget BED files from a CNVkit reference file.

Once you have a stable CNVkit reference for your platform, you can use this
script to drop the "bad" bins from your target and antitarget BED files and
avoid unnecessarily calculating coverage in those bins during future runs.

This script is also useful to recover the target and antitarget BED files that
match the reference if those BED files are missing or you're not sure which ones
are correct.
"""
import argparse
import logging

import cnvlib
from cnvlib import reference
from skgenome import tabio

logging.basicConfig(format="%(message)s", level=logging.INFO)


# Command line: one positional reference file, optional output base name
AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument("reference", help="Reference file.")
AP.add_argument(
    "-o", "--output",
    help="Output base name (extensions added automatically).",
)
args = AP.parse_args()

ref = cnvlib.read(args.reference)
targets, antitargets = reference.reference2regions(ref)
# Use the reference's sample ID when no output base name was given
name = args.output if args.output else ref.sample_id
for regions, suffix in ((targets, '.target.bed'),
                        (antitargets, '.antitarget.bed')):
    tabio.write(regions, name + suffix, 'bed4')
コード例 #41
0
#!/usr/bin/env python
"""Sum of arm-level absolute log2 values.

Input: *.cns, with arm-level segmentation (segment -m none)
Output: table
"""
from __future__ import division, print_function
import sys
import cnvlib

# For each input file, print the sum over autosomes of the squared absolute
# log2 values, followed by the sample ID (tab-separated).
for cns_path in sys.argv[1:]:
    segments = cnvlib.read(cns_path)
    abs_log2 = segments.autosomes()['log2'].abs()
    score = (abs_log2 ** 2).sum()
    print("%.2f" % score, segments.sample_id, sep='\t')
コード例 #42
0
 def test_import_theta(self):
     """Each result of 'import-theta' is non-empty and no longer than the input."""
     segments = cnvlib.read("formats/nv3.cns")
     imported_arrays = commands.do_import_theta(segments,
                                                "formats/nv3.n3.results")
     for imported in imported_arrays:
         n_rows = len(imported)
         self.assertTrue(n_rows > 0 and n_rows <= len(segments))
コード例 #43
0
 def setUp(self):
     """Read the file named by EX_CNR and keep it as `self.ex_cnr`."""
     example = cnvlib.read(EX_CNR)
     self.ex_cnr = example
コード例 #44
0
 def setUp(self):
     """Read the example reference file and keep it as `self.ex_cnr`."""
     reference_path = 'formats/reference-tr.cnn'
     self.ex_cnr = cnvlib.read(reference_path)
コード例 #45
0
This lets us skip recalculating GC and RepeatMasker values from the reference
genome sequence when creating another CNVkit reference.
Both CNVkit references must have the same number of rows (corresponding to the
same positions).
"""

import argparse
import sys

import cnvlib

AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument("orig",
                help="Reference file to copy the 'gc' and 'rmask' columns "
                "from.")
AP.add_argument("other",
                help="Reference file that receives the copied columns.")
AP.add_argument("-o",
                "--output",
                type=argparse.FileType('w'),
                default=sys.stdout)
args = AP.parse_args()

orig_arr = cnvlib.read(args.orig)
other_arr = cnvlib.read(args.other)
# Validate explicitly rather than with `assert`, which is skipped entirely
# when Python runs with -O and would let mismatched inputs through.
if len(other_arr) != len(orig_arr):
    raise ValueError(
        "References must have the same number of rows: %s has %d, %s has %d"
        % (args.orig, len(orig_arr), args.other, len(other_arr)))

# Copy the two columns across, then normalize row and column order before
# writing the result.
other_arr["gc"] = orig_arr["gc"]
other_arr["rmask"] = orig_arr["rmask"]
other_arr.sort()
other_arr.sort_columns()
other_arr.write(args.output)
コード例 #46
0

def clipped_rolling_mean(values, window, lower=-3, upper=3):
    """Smooth a series with a centered rolling mean after clipping extremes.

    Parameters
    ----------
    values : pandas.Series
        Values to smooth; not modified.
    window : int
        Rolling window size. The window is centered, and edge positions use
        however many values are available (``min_periods=1``).
    lower, upper : number
        Bounds the values are clipped to before averaging. The defaults
        (-3 and 3) are the bounds that were previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Smoothed values, one per input element.
    """
    clipped = values.clip(lower, upper)
    smoothed = clipped.rolling(window, min_periods=1, center=True).mean()
    return smoothed.values


def smooth_by_arm(cnarr, window):
    """Return a copy of `cnarr` with log2 smoothed separately per arm.

    Each group yielded by ``cnarr.by_arm()`` is smoothed with
    `clipped_rolling_mean`; the pieces are joined back in iteration order and
    replace the 'log2' column.
    """
    smoothed_parts = []
    for _chrom, arm in cnarr.by_arm():
        smoothed_parts.append(clipped_rolling_mean(arm['log2'], window))
    smoothed_log2 = np.concatenate(smoothed_parts)
    table = cnarr.data.assign(log2=smoothed_log2)
    return cnarr.as_dataframe(table)


# Command line: one or more .cnr files, a window size and an output directory
AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument('cnr_fnames', nargs='+')
AP.add_argument(
    '-w', '--window',
    type=int,
    default=100,
    help="Window size for smoothing.",
)
AP.add_argument('-d', '--output-dir', default='.')
args = AP.parse_args()

for cnr_path in args.cnr_fnames:
    smoothed = smooth_by_arm(cnvlib.read(cnr_path), args.window)
    # Split the file name at its last dot so the smoothing tag is inserted
    # just before the extension
    stem, extension = os.path.basename(cnr_path).rsplit(".", 1)
    out_path = "{}/{}.tsmooth{}.{}".format(
        args.output_dir, stem, args.window, extension)
    tabio.write(smoothed, out_path)
    print("Wrote", out_path, file=sys.stderr)
コード例 #47
0
def main(args):
    """Plot CNVkit and aCGH data around the selected gene(s), side by side.

    Reads the four files named by `args.cnr_fname`, `args.cns_fname`,
    `args.cghr_fname` and `args.cghs_fname`, looks up the genes listed in the
    comma-separated `args.gene_name`, and draws the CNVkit data on the left
    axes and the aCGH data on the right axes for a window extending
    `args.window_width` beyond the genes on each side. The figure is saved as
    a PDF to `args.output` when that is set; otherwise it is shown
    interactively.

    Raises
    ------
    ValueError
        If the gene lookup does not resolve to exactly one chromosome.
    """
    # Load data
    cnarr = cnvlib.read(args.cnr_fname)
    # cnarr['weight'] = numpy.repeat(.78, len(cnarr))
    segarr = cnvlib.read(args.cns_fname)
    acgharr = cnvlib.read(args.cghr_fname)
    asegarr = cnvlib.read(args.cghs_fname)

    # Find the genomic location matching the specified gene(s)
    gene_names = args.gene_name.split(',')
    # Used below as a dict keyed by chromosome (len, keys, popitem)
    gene_coords = plots.gene_coords_by_name(cnarr, gene_names)
    if not len(gene_coords) == 1:
        raise ValueError("Genes %s are split across chromosomes %s"
                         % (args.gene_name, gene_coords.keys()))
    chrom, genes = gene_coords.popitem()
    genes.sort()
    # Set the display window to the selected genes +/- a margin
    # NOTE(review): assumes each gene entry begins with (start, end) -- confirm
    window_coords = (genes[0][0] - args.window_width,
                     genes[-1][1] + args.window_width)

    # Use plot_chromosome to draw CNVkit and aCGH scatters
    cnv_sel_probes, cnv_sel_segs = get_plot_args(cnarr, segarr,
                                                 chrom, window_coords)
    acgh_sel_probes, acgh_sel_segs = get_plot_args(acgharr, asegarr,
                                                   chrom, window_coords)

    # Create a figure grid w/ 2 side-by-side axes
    # Figure width scales with the number of genes found
    _fig = pyplot.figure(figsize=(3.5 * len(genes), 3.5))
    axgrid = pyplot.GridSpec(1, 2, wspace=0)
    leftax = pyplot.subplot(axgrid[0])
    # The right axes is created sharing both x and y with the left one
    rightax = pyplot.subplot(axgrid[1], sharex=leftax, sharey=leftax)

    plots.cnv_on_chromosome(leftax, cnv_sel_probes, cnv_sel_segs, genes)
    plots.cnv_on_chromosome(rightax, acgh_sel_probes, acgh_sel_segs, genes)

    # Tweak aesthetics
    # Only the left panel keeps y tick labels; the right panel's y label and
    # title are blanked
    rightax.tick_params(labelleft=False, left=False)
    leftax.tick_params(labelleft=True)
    leftax.set_xlabel("Position (Mb)")
    rightax.set_ylabel('')
    rightax.set_title('')
    # Rotate & cull x-axis (position) labels
    # Done only for a single gene; tick positions come from the aCGH probes
    # and are applied to both panels
    if len(genes) == 1:
        xlabels = get_xtick_values(acgh_sel_probes)
        leftax.set_xticks(xlabels)
        leftax.set_xticklabels(map(str, xlabels), rotation=60)
        rightax.set_xticks(xlabels)
        rightax.set_xticklabels(map(str, xlabels), rotation=60)

    # Set sensible y-axis limits
    # all_y = numpy.concatenate((cnv_sel_probes.coverage,
    #                            acgh_sel_probes.coverage))
    # leftax.set_ylim(plots.limit(min(all_y) - .1, -5.0, -.3),
    #                 plots.limit(max(all_y) + .25, .3, 5.0))
    # Special case: for CDKN2A the limits are derived from the segment log2
    # values of both data sets; any other gene gets a fixed range
    if args.gene_name == 'CDKN2A':
        all_y = numpy.concatenate((cnv_sel_segs.log2,
                                   acgh_sel_segs.log2))
        print("all_y:", tuple(all_y))
        leftax.set_ylim(limit(min(all_y) - .3, -5.0, -.5),
                        limit(max(all_y) + .3, .5, 5.0))
    else:
        leftax.set_ylim(-2.1, 1.1)

    # Save it.
    if args.output:
        pyplot.savefig(args.output, format='pdf', bbox_inches='tight')
        print("Wrote", args.output, file=sys.stderr)
    else:
        pyplot.show()
コード例 #48
0
 def test_breaks(self):
     """'breaks' on the amplicon example data returns at least one row."""
     cnr_path = "formats/amplicon.cnr"
     cns_path = "formats/amplicon.cns"
     breakpoints = commands.do_breaks(cnvlib.read(cnr_path),
                                      cnvlib.read(cns_path), 4)
     self.assertGreater(len(breakpoints), 0)
コード例 #49
0
ファイル: cnv_updater.py プロジェクト: zitsen/cnvkit
                help="""CNVkit coverage files to update (*.targetcoverage.cnn,
                *.antitargetcoverage.cnn).""")
AP.add_argument("-d",
                "--output-dir",
                default=".",
                help="""Directory to write output .cnn files.""")
AP.add_argument(
    "-s",
    "--suffix",
    default=".updated",
    help="""Filename suffix to add before the '.cnn' extension in output
                files. [Default: %(default)s]""")
args = AP.parse_args()

for fname in args.cnn_files:
    cnarr = cnvlib.read(fname)
    # Convert coverage depths from log2 scale to absolute scale.
    # NB: The log2 values are un-centered in CNVkit v0.7.0(?) through v0.7.11;
    # earlier than that, the average 'depth' will be about 1.0.
    cnarr['depth'] = np.exp2(cnarr['log2'])
    # Rename "Background" bins to "Antitarget"
    # NB: The default off-target bin name was changed in CNVkit v0.9.0
    cnarr['gene'] = cnarr['gene'].replace("Background",
                                          cnvlib.params.ANTITARGET_NAME)
    cnarr.sort_columns()
    # Construct the output filename
    base, ext = os.path.basename(fname).rsplit('.', 1)
    if '.' in base:
        base, zone = base.rsplit('.', 1)
        out_fname = '.'.join((base + args.suffix, zone, ext))
    else: