def test_batch(self):
    """Exercise the 'batch' command: build references, then run a sample."""
    bam = "formats/na12878-chrM-Y-trunc.bam"
    fasta = "formats/chrM-Y-trunc.hg19.fa"
    annot = "formats/my-refflat.bed"
    target_bed = "formats/my-targets.bed"

    # Single-sample WGS reference: the reference should have exactly one
    # row per target region.
    wgs_ref_fname, wgs_tgt_fname, _ = batch.batch_make_reference(
        [bam], None, None, True, fasta, annot, True, 500, None, None,
        None, None, 'build', 1, False, "wgs")
    self.assertEqual(wgs_ref_fname, 'build/reference.cnn')
    wgs_ref = cnvlib.read(wgs_ref_fname, 'bed')
    wgs_targets = tabio.read(wgs_tgt_fname, 'bed')
    self.assertEqual(len(wgs_ref), len(wgs_targets))

    # Single-sample hybrid-capture reference: the reference should cover
    # the targets and the antitargets together.
    ref_fname, tgt_bed_fname, anti_bed_fname = batch.batch_make_reference(
        [bam], target_bed, None, True, fasta, None, True, 10, None, 1000,
        100, None, 'build', 1, False, "hybrid")
    self.assertEqual(ref_fname, 'build/reference.cnn')
    hybrid_ref = cnvlib.read(ref_fname, 'bed')
    hybrid_targets = tabio.read(tgt_bed_fname, 'bed')
    hybrid_antitargets = tabio.read(anti_bed_fname, 'bed')
    self.assertEqual(len(hybrid_ref),
                     len(hybrid_targets) + len(hybrid_antitargets))

    # Process the same BAM against the hybrid reference just built and
    # check that a non-empty segment file comes out.
    batch.batch_run_sample(
        bam, tgt_bed_fname, anti_bed_fname, ref_fname, 'build', True,
        True, True, None, False, False, "hybrid", 1)
    segments = cnvlib.read("build/na12878-chrM-Y-trunc.cns")
    self.assertGreater(len(segments), 0)
def test_read_vcf(self):
    """Read VCF files: paired, filtered, and record-free variants."""
    # Paired VCF with full info
    paired_fname = "formats/na12878_na12882_mix.vcf"
    unfiltered = tabio.read(paired_fname, "vcf")
    # Fewer records than file lines, but not zero
    self.assertLess(len(unfiltered), linecount(paired_fname))
    self.assertLess(0, len(unfiltered))

    # Selecting either sample keeps every record and sets the sample ID
    for sid in ("NA12882", "NA12878"):
        by_sample = tabio.read(paired_fname, "vcf", sample_id=sid)
        self.assertEqual(by_sample.sample_id, sid)
        self.assertEqual(len(unfiltered), len(by_sample))

    # Each filter must drop some records, but not all of them
    filter_options = (
        {'min_depth': 100},
        {'skip_somatic': True},
        {'skip_reject': True},
    )
    for kwarg in filter_options:
        filtered = tabio.read(paired_fname, "vcf", **kwarg)
        self.assertLess(len(filtered), len(unfiltered))
        self.assertLess(0, len(filtered),
                        "%d variants left after filter %r"
                        % (len(filtered), next(iter(kwarg))))

    # VCF header, no samples, no records
    no_sample = tabio.read('formats/nosample.vcf', 'vcf')
    self.assertEqual(len(no_sample), 0)
    self.assertEqual(no_sample.sample_id, 'nosample')

    # VCF with 1 sample, no records
    blank = tabio.read('formats/blank.vcf', 'vcf', sample_id='Blank')
    self.assertEqual(len(blank), 0)
    self.assertEqual(blank.sample_id, 'Blank')
def test_autobin(self):
    """Run the 'autobin' command with each sequencing method."""
    bam_fname = "formats/na12878-chrM-Y-trunc.bam"
    target_bed = "formats/my-targets.bed"
    access_bed = "../data/access-5k-mappable.hg19.bed"

    targets = tabio.read(target_bed, 'bed')
    # Only the chrY portion of the accessible regions is used
    all_accessible = tabio.read(access_bed, 'bed')
    accessible = all_accessible.filter(chromosome='chrY')

    for method in ('amplicon', 'wgs', 'hybrid'):
        first_fit, _ = autobin.do_autobin(bam_fname, method,
                                          targets=targets,
                                          access=accessible)
        # The first returned pair must hold two positive values
        coverage, bin_size = first_fit
        self.assertGreater(coverage, 0)
        self.assertGreater(bin_size, 0)
def test_resize_ranges(self):
    """Resized bins must stay within [0, chromosome size]."""
    baits_fname = 'formats/nv2_baits.interval_list'
    chrom_sizes = {
        'chr1': 249250621,
        'chr2': 243199373,
        'chr3': 198022430,
        'chr4': 191154276,
        'chr5': 180915260,
        'chr6': 171115067,
        'chr7': 159138663,
        'chr8': 146364022,
        'chr9': 141213431,
        'chr10': 135534747,
        'chr11': 135006516,
        'chr12': 133851895,
        'chr13': 115169878,
        'chr14': 107349540,
        'chr15': 102531392,
        'chr16': 90354753,
        'chr17': 81195210,
        'chr18': 78077248,
        'chr19': 59128983,
        'chr20': 63025520,
        'chr21': 48129895,
        'chr22': 51304566,
        'chrX': 155270560,
        'chrY': 59373566,
    }
    bins = tabio.read(baits_fname, 'interval')
    resized = bins.resize_ranges(1e7, chrom_sizes)
    for chrom, arr in resized.by_chromosome():
        # No start below zero, no end beyond the chromosome's length
        self.assertLessEqual(0, arr.start.min())
        self.assertLessEqual(arr.end.max(), chrom_sizes[chrom])
def test_read_vcf(self):
    """Read a paired VCF, with and without sample selection and filters."""
    fname = "formats/na12878_na12882_mix.vcf"
    unfiltered = tabio.read(fname, "vcf")
    # Fewer records than file lines, but not zero
    self.assertLess(len(unfiltered), linecount(fname))
    self.assertLess(0, len(unfiltered))

    # Selecting either sample keeps every record and sets the sample ID
    for sid in ("NA12882", "NA12878"):
        by_sample = tabio.read(fname, "vcf", sample_id=sid)
        self.assertEqual(by_sample.sample_id, sid)
        self.assertEqual(len(unfiltered), len(by_sample))

    # Each filter must drop some records, but not all of them
    filter_options = (
        {'min_depth': 100},
        {'skip_somatic': True},
        {'skip_reject': True},
    )
    for kwarg in filter_options:
        filtered = tabio.read(fname, "vcf", **kwarg)
        self.assertLess(len(filtered), len(unfiltered))
        self.assertLess(0, len(filtered))
def test_segment(self):
    """Run the 'segment' command with the 'haar' method."""
    cnarr = cnvlib.read("formats/amplicon.cnr")

    # NB: R methods are in another script; haar is pure-Python
    default_segs = segmentation.do_segmentation(cnarr, "haar")
    self.assertGreater(len(default_segs), 0)

    # Explicit threshold, with the skip_low option enabled
    tuned_segs = segmentation.do_segmentation(cnarr, "haar",
                                              threshold=.0001,
                                              skip_low=True)
    self.assertGreater(len(tuned_segs), 0)

    # Segmentation with variants from a VCF passed in
    varr = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")
    variant_segs = segmentation.do_segmentation(cnarr, "haar",
                                                variants=varr)
    self.assertGreater(len(variant_segs), 0)
def test_call_filter(self):
    """Run 'call' with each segment filter alone and in combination."""
    segments = cnvlib.read("formats/tr95t.segmetrics.cns")
    variants = tabio.read("formats/na12878_na12882_mix.vcf", "vcf")

    # Each filter individually, then all filters together
    filter_sets = (
        ['ampdel'],
        ['cn'],
        ['ci'],
        ['sem'],
        ['sem', 'cn', 'ampdel'],
        ['ci', 'cn', 'ampdel'],
    )
    for filters in filter_sets:
        result = commands.do_call(segments, variants,
                                  method="threshold",
                                  purity=.9,
                                  is_reference_male=True,
                                  is_sample_female=True,
                                  filters=filters)
        # The result may not outgrow the input, and must have at least
        # as many rows as the input has distinct chromosomes.
        self.assertLessEqual(len(result), len(segments))
        self.assertLessEqual(len(segments.chromosome.unique()),
                             len(result))
        for colname in ('baf', 'cn', 'cn1', 'cn2'):
            self.assertIn(colname, result)
def test_empty(self):
    """Reading an empty file yields a zero-length array."""
    empty_arr = tabio.read("formats/empty")
    self.assertEqual(len(empty_arr), 0)
# Command-line interface: read a refFlat file and write it out as BED4.
AP = argparse.ArgumentParser(description=__doc__)
AP.add_argument(
    'refflat',
    help="UCSC refFlat.txt for the reference genome.")
AP.add_argument(
    '-e', '--exons',
    action='store_true',
    help="""Emit each exon instead of the whole gene regions.""")
AP.add_argument(
    '-f', '--flatten',
    action='store_true',
    help="""Flatten overlapping regions, keeping original boundaries. Not recommended with --exons.""")
AP.add_argument(
    '-m', '--merge',
    metavar='BASEPAIRS',
    nargs='?',
    type=int,
    const=1,
    help="""Merge overlapping regions with different names. Recommended with --exons. Optional argument value is the number of overlapping bases between two regions to trigger a merge. [Default: %(const)s]""")
AP.add_argument(
    '-o', '--output',
    help="Output filename.")
args = AP.parse_args()

regions = tabio.read(args.refflat, 'refflat', exons=args.exons)
# --flatten takes precedence: merging is only done when flattening was
# not requested and --merge has a truthy value.
if args.flatten:
    regions = regions.flatten()
elif args.merge:
    regions = regions.merge(bp=args.merge)
tabio.write(regions, args.output, 'bed4')
def test_read_text(self):
    """Read the text region format: one region per file line."""
    text_fname = "formats/amplicon.text"
    parsed = tabio.read(text_fname, "text")
    self.assertEqual(len(parsed), linecount(text_fname))
    self.assertEqual(parsed.sample_id, "amplicon")
def test_read_picardhs(self):
    """Read Picard CalculateHsMetrics PER_TARGET_COVERAGE format."""
    picard_fname = "picard/p2-5_5.antitargetcoverage.csv"
    coverage = tabio.read(picard_fname, "picardhs")
    # One line of the file is not a record (presumably the header row)
    expected_rows = linecount(picard_fname) - 1
    self.assertEqual(len(coverage), expected_rows)
    self.assertEqual(coverage.sample_id, "p2-5_5")
def test_read_ilist(self):
    """Read the interval list format."""
    ilist_fname = "formats/nv2_baits.interval_list"
    baits = tabio.read(ilist_fname, "interval")
    # Expected number of intervals in this fixture
    self.assertEqual(len(baits), 6809)
    self.assertEqual(baits.sample_id, "nv2_baits")
def test_read_bed(self):
    """Read the BED format: one region per file line."""
    bed_fname = "formats/amplicon.bed"
    parsed = tabio.read(bed_fname, "bed")
    self.assertEqual(len(parsed), linecount(bed_fname))
    self.assertEqual(parsed.sample_id, "amplicon")
def test_empty(self):
    """Reading an empty file yields no regions, whatever the format."""
    supported_formats = ("auto", "tab", "bed", "interval", "text")
    for fmt in supported_formats:
        parsed = tabio.read("formats/empty", fmt=fmt)
        self.assertEqual(len(parsed), 0)
def test_read_refflat(self):
    """Read the UCSC 'refFlat' format."""
    refflat_fname = "formats/refflat-mini.txt"
    genes = tabio.read(refflat_fname, 'refflat')
    # One region per file line, spread over 13 distinct chromosomes
    self.assertEqual(len(genes), linecount(refflat_fname))
    self.assertEqual(13, genes.chromosome.nunique())
def test_read_gff(self):
    """Read the GFF format."""
    gff_fname = 'formats/example.gff'
    features = tabio.read(gff_fname, 'gff')
    # Two lines of the file are not records (presumably header lines)
    expected_rows = linecount(gff_fname) - 2
    self.assertEqual(len(features), expected_rows)
    self.assertEqual(features.sample_id, 'example')