def test_targets(self):
    """Check the range of nearest-target distances from compute_basic_stats.

    Runs compute_basic_stats on the name-sorted test BAM together with the
    regions loaded from the Agilent kinome BED fixture, then asserts the
    smallest and largest keys found in the ``dict`` of the object returned
    by ``get_summarizer(60)``.
    """
    bam_bc_file = tk_test.in_path("namesort_test.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")

    targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
    # Context manager so the BED handle is closed even if parsing raises.
    # NOTE(review): assumes get_target_regions reads the file fully before
    # returning (it hands back a per-chromosome mapping) -- confirm.
    with open(targets_filename, 'r') as targets_file:
        target_regions = tk_io.get_target_regions(targets_file)

    bam_in = tk_bam.create_bam_infile(bam_bc_file)
    r = compute_basic_stats(bam_in, target_regions, 1000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    # Apparently an earlier return shape, kept for reference:
    # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
    misc_sm, bc_sms = r

    nearest_targ_dists = bc_sms.get('nearest_targ_dists')
    # Look the summarizer up once; both extremes come from the same key set.
    # NOTE(review): 60 is presumably a mapping-quality cutoff -- confirm.
    target_dists = nearest_targ_dists.get_summarizer(60).dict.keys()
    minTargetDist = min(target_dists)
    maxTargetDist = max(target_dists)

    self.assertEqual(minTargetDist, 130)
    self.assertEqual(maxTargetDist, 10000)
def test_barcode_counts(self):
    """Check that the read-info table has one row per read pair.

    Runs compute_basic_stats on the attach_bcs output BAM with no target
    regions, counts the reads in that BAM independently, and asserts that
    the HDF5 data frame written to ``read_h5_out`` has ``n_reads / 2`` rows.
    """
    bam_bc_file = tk_test.in_path("attach_bcs/attach_bcs_output.bam")
    read_info_out = tk_test.out_path("read_info.h5")
    barcode_whitelist = bc_utils.load_barcode_whitelist("737K-april-2014")

    bam_in = tk_bam.create_bam_infile(bam_bc_file)
    r = compute_basic_stats(bam_in, {}, 2000, bam_in.references,
                            barcode_whitelist=barcode_whitelist,
                            read_h5_out=read_info_out)
    # Apparently an earlier return shape, kept for reference:
    # insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = r
    # The unpack is kept even though the values are unused: it fails loudly
    # if the return value stops being a pair.
    misc_sm, bc_sms = r

    # Look at the barcode results -- there should be a raw bc count for each read pair
    # n_raw_bcs = bc_table["count"].sum()
    # Count reads by streaming a fresh handle instead of building a list of
    # every record just to take its length.
    n_reads = sum(1 for _ in tk_bam.create_bam_infile(bam_bc_file))
    # self.assertEqual(n_raw_bcs, n_reads / 2)

    # Load the per-cluster table -- there should be a row for each read pair
    read_info = tenkit.hdf5.read_data_frame(read_info_out)
    self.assertEqual(read_info.shape[0], n_reads / 2)
def test_target_finding(self):
    """Check get_read_regions_dist by hand against the first chr1 target.

    Loads the Agilent kinome BED fixture, takes the first region in the
    chr1 region list, and asserts the distance reported for four intervals
    placed relative to that region's start and end.
    """
    # Check a few targets by hand
    targets_filename = tk_test.in_path('agilent_kinome_targs.bed')
    # Context manager so the BED handle is closed even if parsing raises.
    # NOTE(review): assumes get_target_regions reads the file fully before
    # returning -- confirm.
    with open(targets_filename, 'r') as targets_file:
        target_regions = tk_io.get_target_regions(targets_file)

    chr1_regions = target_regions['chr1']
    chr1_list = chr1_regions.get_region_list()
    test_reg = chr1_list[0]

    # Interval ending 10 before the region start
    dist = get_read_regions_dist(0, test_reg.start - 10, chr1_regions)
    self.assertEqual(dist, 10)

    # Interval ending 10 past the region start
    dist = get_read_regions_dist(0, test_reg.start + 10, chr1_regions)
    self.assertEqual(dist, 0)

    # Interval starting 2 past the region end
    dist = get_read_regions_dist(test_reg.end + 2, test_reg.end + 3, chr1_regions)
    self.assertEqual(dist, 2)

    # Interval straddling the region end
    dist = get_read_regions_dist(test_reg.end - 2, test_reg.end + 3, chr1_regions)
    self.assertEqual(dist, 0)
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # # Code for testing analyze_run.py # from pyfasta import Fasta import tenkit.bam as tk_bam import tenkit.test as tk_test from tenkit.regions import Regions from .. import * ## Input files bam_in_file = tk_test.in_path('test_analyze_bias.bam') fasta_dir = tk_test.in_path('fasta/') class TestFunctions(tk_test.UnitTestBase): def setUp(self): self.bam_in = tk_bam.create_bam_infile(bam_in_file) def test_get_depth_info(self): ref_fasta = Fasta(fasta_dir + 'test/chr0.fa') chr0 = ref_fasta['chr0'] confident_regions = Regions([(0,10000000)]) reads = list(self.bam_in) r = get_depth_info(reads, "chr0", 0, len(chr0), None, confident_regions) (depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r reads_dd = filter(lambda x: not x.is_duplicate, reads) r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), None, confident_regions)
from .. import * import crdna.bio_io as crdna_io import pysam import itertools import numpy as np import tenkit.constants import crdna.read_filter import martian martian.test_initialize("") # Patrick Marks # Simple test of deduper IN_BAM = tk_test.in_path('test_analyze_bias.bam') OUT_BAM = tk_test.out_path('test_dedup_out.bam') OUT_JSON = tk_test.out_path( 'test_dedup_summary.json') IN_BAM_BIG = tk_test.in_path("test_mark_duplicates.bam") class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_dedup(self): tenkit.constants.DUPLICATE_SUBSAMPLE_COVERAGES = [0.00001, 0.0001] args = martian.Record({ 'input': IN_BAM, 'estimated_coverage': 100.0, 'perfect_read_count': 1000, 'chunk_start':None, 'chunk_end':None }) outs = martian.Record({ 'output': OUT_BAM, 'duplicate_summary': OUT_JSON }) main_mark_duplicates(args, outs) out_bam = pysam.Samfile(OUT_BAM)
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # # Test attach bcs import pysam from itertools import groupby import tenkit.test as tk_test import tenkit.seq as tk_seq from .. import * from tenkit.constants import PROCESSED_BARCODE_TAG, RAW_BARCODE_TAG, SAMPLE_INDEX_TAG import martian IN_BAM = tk_test.in_path('attach_bcs/alignment_with_secondary.bam') IN_I1 = tk_test.in_path('attach_bcs/phix_I1.fastq') IN_I2 = tk_test.in_path('attach_bcs/phix_I2.fastq') IN_WHITELIST = '737K-april-2014' OUT_BAM = tk_test.out_path('test_attach_bcs.bam') class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_attach_bcs(self): # --align_input alignment_output.bam --barcode_input phix_I2.fastq --output test2.out --complete ~/c --stats ~/s args = { 'barcode_whitelist' : IN_WHITELIST, 'align_chunk' : IN_BAM, 'barcode_chunk' : IN_I2,
def setUp(self):
    """Create the Phaser under test and store it on ``self.p``.

    The Phaser is built from the big-fragments HDF5 fixture and a
    VariantFileReader over the default phasing VCF, for 'chr21' with the
    numeric arguments 10000000 and 10500000 (presumably the start and end
    of the locus -- confirm against Phaser's signature).
    """
    fragments_h5 = tk_test.in_path("phasing/big_fragments.h5")
    variants_vcf = tk_test.in_path("phasing/default.vcf.gz")
    variant_reader = VariantFileReader(variants_vcf)
    self.p = Phaser(
        variant_reader,
        fragments_h5,
        'chr21',
        10000000,
        10500000,
    )
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # # Code for testing phaser.py # import scipy.sparse import math import tenkit.test as tk_test import tenkit.bio_io as tk_io from tenkit.bio_io import VariantFileReader, VariantFileWriter from ..phaser import * CONTIG_FILE = tk_test.in_path('phasing/small_fragments.h5') SNP_INPUT_VCF = tk_test.in_path('test_phasing_snps_sorted.vcf.gz') INDEL_INPUT_VCF = tk_test.in_path('test_phasing_indels_sorted.vcf.gz') OUTPUT_VCF = tk_test.out_path('test_phasing_output.vcf') OUTPUT_TSV = tk_test.out_path("test_bc_hap_out.tsv") import martian martian.test_initialize(tk_test.out_path("")) class TestPhaserBig(tk_test.UnitTestBase): def setUp(self): contig_file = tk_test.in_path("phasing/big_fragments.h5") snp_vfr = VariantFileReader(tk_test.in_path("phasing/default.vcf.gz")) self.p = Phaser(snp_vfr, contig_file, 'chr21', 10000000, 10500000) def test_calc_hap1_hap2_log_prob(self):
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # import tenkit.test as tk_test from .. import * import pysam import martian martian.test_initialize("") # Patrick Marks # Simple test of deduper IN_FASTQ = tk_test.in_path('test_bc_sorted_big_fastq.fastq.gz') OUT_BAM = tk_test.out_path('test_unaligned_out.bam') class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_make_unaligned(self): args = martian.Record({ 'sample_id': 1234, 'output_format': "bam", 'read_group': "RG", 'read_chunk': IN_FASTQ }) outs = martian.Record({'barcoded_unaligned': OUT_BAM}) main(args, outs)
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # # Test attach bcs import pysam import tenkit.test as tk_test import tenkit.bio_io as tk_io from .. import * import martian IN_BAM = tk_test.in_path("attach_phasing/bam_test.bam") IN_FRAGS = tk_test.in_path("attach_phasing/fragments_test.tsv.gz") OUT_BAM = tk_test.out_path("test_attach_phasing.bam") class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_attach_phasing(self): args = martian.Record({ 'input': IN_BAM, 'fragment_phasing': IN_FRAGS, 'chunk_start': 0, 'chunk_end': 1 << 32 }) outs = martian.Record({ 'phased_possorted_bam': OUT_BAM,
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # from .. import * import tenkit.test as tk_test import tenkit.seq as tk_seq import tenkit.hdf5 import barcodes.utils as bc_utils bam_in_file = tk_test.in_path('test_map_rate.bam') class TestFunctions(tk_test.UnitTestBase): def setUp(self): self.bam_in = tk_bam.create_bam_infile(bam_in_file) def _test_summary_metrics(self): read_info_out = tk_test.out_path("read_info.h5") insert_size_dists, nearest_targ_dists, summary_metrics, bc_table, mapq_counts, insert_size_hist = \ compute_basic_stats(self.bam_in, {}, read_h5_out=read_info_out) summary = summary_metrics self.assertEqual(summary["mapped_bases"], 6) self.assertEqual(summary["mean_dup_rate"], 1.0) self.assertEqual(summary["num_reads"], 6) self.assertEqual(summary["total_bases"], 12) p = tenkit.hdf5.read_data_frame(read_info_out)
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # import os.path import tenkit.test as tk_test from .. import * IN_BAM = tk_test.in_path('test_attach_bc_vars.bam') OUT_VCF = tk_test.out_path('test_call_variants.vcf') class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_call_variants(self): test_locus = "chr1:10000..20000" args = martian.Record({ 'input': IN_BAM, 'locus': test_locus, 'reference_path': 'hg19', 'targets_file': None, 'restrict_locus': None, 'coverage': None, 'max_coverage': None, 'variant_mode': 'freebayes' }) outs = martian.Record({'default': OUT_VCF}) main(args, outs) self.assertTrue(os.path.exists(OUT_VCF))
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # import scipy.sparse import math import tenkit.test as tk_test from .. import * import martian TEST_SPLIT_BAM = tk_test.in_path('test_split.bam') martian.test_initialize(tk_test.out_path("")) class TestGetReadpairLoci(tk_test.UnitTestBase): def setUp(test): pass def test_get_discordant_loci(self): # Reads 3,4,7,11 are neither secondary nor read1 so they should never be considered. # Reads 5, 8, 9, 10 are split # Reads 1, 2, 6 are rp and read1 loci = get_discordant_loci(TEST_SPLIT_BAM, min_insert=0, max_insert=300, min_sv_len=300) # Only reads 5,6,8,9,10 are included (1-based read indices) self.assertEqual(len(loci), 10) self.assertEqual(loci[0], ('chr20', 60173, 60773, (0, 1)))
def setUp(self):
    """Prepare input/output directories for the lane-count tests.

    Calls the parent setUp, records the "lane" input directory and the
    "prepare_samplesheet" output directory on the instance, and makes sure
    the output directory exists.

    Raises:
        OSError: if the output directory cannot be created and does not
            already exist as a directory.
    """
    super(TestLaneCount, self).setUp()
    self.input_dir = tk_test.in_path("lane")
    self.output_dir = tk_test.out_path("prepare_samplesheet")
    # os.makedirs raises when the leaf directory already exists (e.g. it was
    # left behind by a previous run); tolerate that case and re-raise
    # anything else.
    try:
        os.makedirs(self.output_dir)
    except OSError:
        if not os.path.isdir(self.output_dir):
            raise
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # # Test attach bcs import pysam import tenkit.test as tk_test import tenkit.fasta as tk_fasta from .. import * import martian IN_FASTQ = tk_test.in_path("test_bwa.fastq") OUT_BAM = tk_test.out_path("test_aligner.bam") class TestFunctions(tk_test.UnitTestBase): def setUp(self): pass def test_align(self): args = { 'chunk_input': IN_FASTQ, 'aligner': 'bwa', 'aligner_method': 'MEM', 'reference_path': 'hg19', '__threads': 1, 'reads_interleaved': True } outs = {'default': OUT_BAM}
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # import os import tenkit.test as tk_test import tenkit.fasta as tk_fasta import martian from .. import * martian.test_initialize(tk_test.out_path("")) IN_PREFIX = tk_test.in_path('combine_and_trim_reads') IN_RA_ALL = [ tk_test.in_path( 'combine_and_trim_reads/read-RA_si-AAAA_lane-1_chunk-1.fastq'), tk_test.in_path( 'combine_and_trim_reads/read-RA_si-AAAN_lane-1_chunk-1.fastq'), tk_test.in_path( 'combine_and_trim_reads/read-RA_si-CCCC_lane-1_chunk-1.fastq') ] class TestFunctions(tk_test.UnitTestBase): def test_setup_chunks(self): args = martian.Record({ 'input_mode': 'BCL_PROCESSOR', 'sample_def': [{ 'read_path': IN_PREFIX, 'sample_indices': ["AAAA", "CCCC"],
# # Copyright (c) 2014 10X Genomics, Inc. All rights reserved. # import scipy.sparse import math import tenkit.test as tk_test import tenkit.bio_io as tk_io from .. import * import martian TEST_BAM = tk_test.in_path('test_count_reads.bam') TEST_TARGETS = tk_test.in_path('test_targets_for_counting.txt') martian.test_initialize(tk_test.out_path("")) class TestCountReadsBcs(tk_test.UnitTestBase): def setUp(test): pass def test_get_non_overlapping_wins(self): starts = np.arange(0, 12, 2) stops = np.arange(2, 14, 2) sel = get_non_overlapping_wins(starts, stops) assert(np.all(sel == np.arange(6))) starts = np.arange(6) stops = np.arange(2, 8) sel = get_non_overlapping_wins(starts, stops) assert(list(sel) == [0, 2, 4])