def setUp(self):
    # Build the fixture used by the tests: a genotype matrix derived from a
    # simulated tree sequence, plus the population indices used to make it.
    n_rows = 32
    min_maf = 0.05
    counts_by_pop = tests.HashableDict(YRI=10, CHB=20, CEU=30, Papuan=40)
    ts, _model = tests.basic_sim(counts_by_pop)
    _, pop_indices = convert.ts_pop_counts_indices(ts)
    n_haplotypes = ts.num_samples
    generator = np.random.default_rng(seed=31415)

    matrix_options = dict(
        pop_indices=pop_indices,
        ref_pop=0,
        num_rows=n_rows,
        num_haplotypes=n_haplotypes,
        maf_thres=min_maf,
        rng=generator,
        phased=True,
        ploidy=2,
    )

    # ts_genotype_matrix() is handed a file path here, so the tree sequence
    # is first dumped into a throwaway directory.
    with tempfile.TemporaryDirectory() as tmpdir:
        trees_path = f"{tmpdir}/foo.trees"
        ts.dump(trees_path)
        matrix, _ = convert.ts_genotype_matrix(trees_path, **matrix_options)

    expected_shape = (n_rows, n_haplotypes)
    self.assertEqual(matrix.shape, expected_shape)

    # Wrap the 2-D matrix with a leading and a trailing axis of length one.
    self.A = matrix[np.newaxis, :, :, np.newaxis]
    self.pop_indices = pop_indices
def test_ts_genotype_matrix(self):
    # Check the shape of the matrices returned by convert.ts2mat(), and that
    # only sites passing the MAF threshold contribute non-zero rows.
    num_haplotypes = sum(self.sample_counts.values())
    ts, _ = tests.basic_sim(self.sample_counts)
    maf_thres = 0.05
    rng = np.random.default_rng(seed=31415)
    for num_rows in (32, 64, 128):
        A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
        self.assertEqual(A.shape, (num_rows, num_haplotypes))

    # Check that MAF filtering works. To make this easier, we
    # set num_rows to the sequence length so there's no resizing.
    num_rows = int(ts.sequence_length)
    for maf_thres in (0, 0.01, 0.1):
        ac_thres = maf_thres * num_haplotypes
        A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
        self.assertEqual(A.shape, (num_rows, num_haplotypes))

        # List of MAF filtered positions. The allele count is computed once
        # per variant and reused for both sides of the threshold test.
        positions = []
        for v in ts.variants():
            allele_count = sum(v.genotypes)
            if (
                allele_count >= ac_thres
                and num_haplotypes - allele_count >= ac_thres
            ):
                positions.append(int(v.site.position))

        # Sanity-check the fixture itself. These use unittest assertions
        # rather than bare ``assert`` so they are not stripped when Python
        # runs with -O, which would make the checks below vacuous.
        self.assertGreater(
            len(positions), 0, "simulation produced no sites passing the filter"
        )
        pset = set(positions)
        p_complement = [pos for pos in range(num_rows) if pos not in pset]
        self.assertGreater(
            len(p_complement), 0, "every row corresponds to a retained site"
        )

        # check allele counts for seg sites and non-seg sites
        ac_vec = np.sum(A, axis=1)
        self.assertTrue(all(ac_vec[positions] > 0))
        self.assertTrue(all(ac_vec[p_complement] == 0))

        # check MAF filtering worked
        af_vec = ac_vec[positions] / num_haplotypes
        self.assertTrue(all(af_vec >= maf_thres))
        self.assertTrue(all(af_vec <= 1 - maf_thres))
def test_reorder(self):
    # Check convert.reorder_and_sort(): after reordering populations, each
    # per-population block of columns must pass self.verify_sorted().
    rng = np.random.default_rng(seed=31415)
    maf_thres = 0.05
    num_rows = 32
    ts, model = tests.basic_sim(self.sample_counts)
    A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
    counts, indices = convert.ts_pop_counts_indices(ts)
    pop_id = {pop.id: j for j, pop in enumerate(model.populations)}
    ref_pop = pop_id["YRI"]

    # Column vector holding, for each row, the mean over the reference
    # population's columns of the unsorted matrix.
    j, k = indices[ref_pop], indices[ref_pop] + counts[ref_pop]
    c = np.empty((num_rows, 1))
    for i in range(num_rows):
        c[i, 0] = np.mean(A[i, j:k])

    new_indices = tests.reorder_indices(model, self.sample_counts)

    # Check that per-population submatrices are each sorted.
    # We check with the original population indices, then reorder
    # populations according to new_indices and check again.
    # The precondition is a unittest assertion (not a bare ``assert``) so it
    # still runs under -O; without it the second pass could silently repeat
    # the first.
    self.assertNotEqual(list(indices), list(new_indices))
    for dest_indices in (indices, new_indices):
        B = convert.reorder_and_sort(A, counts, indices, dest_indices, ref_pop)
        offset = 0
        # Walk the populations in destination order; each one occupies the
        # next counts[_id] columns of B.
        for _id in dest_indices:
            j, k = offset, offset + counts[_id]
            offset = k
            self.verify_sorted(B[:, j:k], c)
def test_compare_ts_vcf_genotype_matrixes(self):
    # Compare ts genotype matrix to vcf genotype matrix from ts.write_vcf().
    ts, _ = tests.basic_sim(self.sample_counts)
    num_haplotypes = sum(self.sample_counts.values())
    # write_vcf() is called with ploidy=2, so two haplotypes per individual.
    individual_names = [f"ind{j}" for j in range(num_haplotypes // 2)]

    for maf_thres in (0, 0.01, 0.1):
        ac_thres = maf_thres * num_haplotypes

        # List of MAF filtered positions. The allele count is computed once
        # per variant and reused for both sides of the threshold test.
        positions = []
        for v in ts.variants():
            allele_count = sum(v.genotypes)
            if (
                allele_count >= ac_thres
                and num_haplotypes - allele_count >= ac_thres
            ):
                positions.append(v.site.position)

        # Mock out random variables to ensure we get consistent behaviour
        # between ts and vcf versions.
        rng = mock.MagicMock()
        rng.random = mock.MagicMock(return_value=0.0)

        with tempfile.TemporaryDirectory() as tmpdir:
            vcf_file = tmpdir + "/ts.vcf"
            samples_file = tmpdir + "/samples.txt"
            with open(vcf_file, "w") as f:
                ts.write_vcf(
                    f,
                    ploidy=2,
                    individual_names=individual_names,
                    position_transform=np.round,
                )
            with open(samples_file, "w") as f:
                print(*individual_names, file=f, sep="\n")

            # Compress and index the VCF with the external tools. check=True
            # makes a non-zero exit status fail the test right here, instead
            # of surfacing later as an unrelated error when the file is read.
            subprocess.run(["bgzip", vcf_file], check=True)
            vcf_file += ".gz"
            subprocess.run(["bcftools", "index", vcf_file], check=True)

            # A window size and step equal to the sequence length are used,
            # and only the first yielded item is consumed.
            winsize = int(ts.sequence_length)
            matrices = vcf.accumulate_matrices(
                vcf_file,
                vcf_pop_intervals=[(0, num_haplotypes)],
                winsize=winsize,
                winstep=winsize,
                samples_file=samples_file,
                maf_thres=maf_thres,
                rng=rng,
            )
            _, _, _, V, vcf_pos = next(matrices)
            np.testing.assert_array_equal(vcf_pos, np.round(positions))

            for num_rows in (32, 64, 128):
                A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
                self.assertEqual(A.shape, (num_rows, num_haplotypes))
                # Use the float `positions` vector to resize here, not the
                # integer `vcf_pos` vector, to ensure resizing equivalence.
                B = vcf.resize(positions, V, winsize, num_rows)
                self.assertEqual(A.shape, B.shape)
                self.assertEqual(A.dtype, B.dtype)
                np.testing.assert_array_equal(A, B)
def test_verify_partition(self):
    # Check that convert.verify_partition() accepts well-formed
    # (indices, counts, total) triples and raises ValueError otherwise.

    # Well-formed inputs: these calls must not raise.
    convert.verify_partition([0], [100], 100)
    convert.verify_partition([0, 10, 20, 30], [10, 10, 10, 10], 40)

    # Malformed inputs. Each case needs its own assertRaises context: a
    # single shared ``with`` block exits at the first exception, so any
    # calls placed after the first one inside it would never be executed.
    malformed = (
        ([0], [100], 200),
        ([0], [200], 100),
        ([0, 10], [20, 10], 30),
        ([0, 100], [10, 10], 20),
    )
    for bad_indices, bad_counts, bad_total in malformed:
        with self.subTest(
            indices=bad_indices, counts=bad_counts, total=bad_total
        ):
            with self.assertRaises(ValueError):
                convert.verify_partition(bad_indices, bad_counts, bad_total)

    # The counts and indices derived from a simulated tree sequence must
    # themselves be accepted.
    ts, _ = tests.basic_sim(self.sample_counts)
    counts, indices = convert.ts_pop_counts_indices(ts)
    convert.verify_partition(
        indices.values(), counts.values(), sum(counts.values())
    )
def test_ts_pop_counts_indices(self):
    # Check the two mappings returned by convert.ts_pop_counts_indices()
    # against the sample counts the simulation was configured with.
    ts, model = tests.basic_sim(self.sample_counts)
    counts, indices = convert.ts_pop_counts_indices(ts)

    # Both mappings share the same keys, one per sampled population.
    self.assertEqual(len(indices), len(counts))
    self.assertEqual(indices.keys(), counts.keys())
    self.assertEqual(len(counts), len(self.sample_counts))

    # Map each population's id to its position in model.populations, which
    # is how `counts` is keyed in the lookups below.
    position_of = {}
    for position, population in enumerate(model.populations):
        position_of[population.id] = position
    for name, expected in self.sample_counts.items():
        key = position_of[name]
        self.assertEqual(counts[key], expected)

    # Each index must equal the running sum of the counts that precede it
    # in iteration order.
    running_total = 0
    for key, start in indices.items():
        self.assertEqual(start, running_total)
        running_total += counts[key]