Esempio n. 1
0
 def setUp(self):
     """Build the genotype-matrix fixture used by the tests.

     A tree sequence from ``tests.basic_sim`` is dumped to a temporary
     file and converted with ``convert.ts_genotype_matrix``. The result
     is stored on ``self.A`` with a new leading and a new trailing axis,
     and the population indices are stored on ``self.pop_indices``.
     """
     pop_sizes = tests.HashableDict(YRI=10, CHB=20, CEU=30, Papuan=40)
     ts, _model = tests.basic_sim(pop_sizes)
     _, pop_indices = convert.ts_pop_counts_indices(ts)
     # (rows, haplotypes) requested from the conversion.
     expected_shape = (32, ts.num_samples)
     matrix_options = dict(
         pop_indices=pop_indices,
         ref_pop=0,
         num_rows=expected_shape[0],
         num_haplotypes=expected_shape[1],
         maf_thres=0.05,
         rng=np.random.default_rng(seed=31415),
         phased=True,
         ploidy=2,
     )
     # The converter is given a file path, so write the tree sequence to
     # a scratch directory that is removed once the matrix is built.
     with tempfile.TemporaryDirectory() as workdir:
         trees_path = f"{workdir}/foo.trees"
         ts.dump(trees_path)
         matrix, _ = convert.ts_genotype_matrix(trees_path, **matrix_options)
     self.assertEqual(matrix.shape, expected_shape)
     # Add a leading and a trailing singleton axis around the 2-D matrix.
     self.A = matrix[np.newaxis, ..., np.newaxis]
     self.pop_indices = pop_indices
Esempio n. 2
0
    def test_ts_genotype_matrix(self):
        """Check the shape and MAF filtering of ``convert.ts2mat`` output.

        First the matrix shape is checked for several row counts. Then,
        with one row per sequence position, the row sums are checked to be
        non-zero exactly at the positions that pass the MAF filter.
        """
        num_haplotypes = sum(self.sample_counts.values())
        ts, _ = tests.basic_sim(self.sample_counts)
        maf_thres = 0.05
        rng = np.random.default_rng(seed=31415)
        for num_rows in (32, 64, 128):
            A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
            self.assertEqual(A.shape, (num_rows, num_haplotypes))

        # Check that MAF filtering works. To make this easier, we
        # set num_rows to the sequence length so there's no resizing.
        num_rows = int(ts.sequence_length)
        for maf_thres in (0, 0.01, 0.1):
            ac_thres = maf_thres * num_haplotypes
            A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
            self.assertEqual(A.shape, (num_rows, num_haplotypes))

            # List of MAF filtered positions.
            positions = []
            for v in ts.variants():
                # Sum of the genotype values, computed once per variant.
                ac = int(np.sum(v.genotypes))
                if ac >= ac_thres and num_haplotypes - ac >= ac_thres:
                    positions.append(int(v.site.position))
            # Sanity checks on the test data itself: both the filtered set
            # and its complement must be non-empty for the checks below to
            # be meaningful. unittest assertions are used so these are not
            # skipped when Python runs with -O.
            self.assertGreater(len(positions), 0)
            pset = set(positions)
            p_complement = [pos for pos in range(num_rows) if pos not in pset]
            self.assertGreater(len(p_complement), 0)

            # check allele counts for seg sites and non-seg sites
            ac_vec = np.sum(A, axis=1)
            self.assertTrue(np.all(ac_vec[positions] > 0))
            self.assertTrue(np.all(ac_vec[p_complement] == 0))
            # check MAF filtering worked
            af_vec = ac_vec[positions] / num_haplotypes
            self.assertTrue(np.all(af_vec >= maf_thres))
            self.assertTrue(np.all(af_vec <= 1 - maf_thres))
Esempio n. 3
0
    def test_reorder(self):
        """Check that ``convert.reorder_and_sort`` sorts each population.

        The matrix is reordered twice, once with the original population
        indices and once with the ordering from ``tests.reorder_indices``,
        and every per-population block of columns is passed to
        ``self.verify_sorted``.
        """
        rng = np.random.default_rng(seed=31415)
        maf_thres = 0.05
        num_rows = 32
        ts, model = tests.basic_sim(self.sample_counts)
        A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)

        counts, indices = convert.ts_pop_counts_indices(ts)
        pop_id = {pop.id: j for j, pop in enumerate(model.populations)}
        ref_pop = pop_id["YRI"]
        # Column span of the reference population in A.
        ref_start = indices[ref_pop]
        ref_end = ref_start + counts[ref_pop]
        # Per-row mean over the reference population's columns, kept as a
        # column vector; it is handed to verify_sorted below.
        c = np.empty((num_rows, 1))
        for i in range(num_rows):
            c[i, 0] = np.mean(A[i, ref_start:ref_end])

        new_indices = tests.reorder_indices(model, self.sample_counts)
        # Check that per-population submatrices are each sorted.
        # We check with the original population indices, then reorder
        # populations according to new_indices and check again.
        # The two orderings must differ, otherwise the second pass adds
        # nothing; a unittest assertion keeps this check active under -O.
        self.assertNotEqual(list(indices.keys()), list(new_indices.keys()))
        for dest_indices in (indices, new_indices):
            B = convert.reorder_and_sort(A, counts, indices, dest_indices,
                                         ref_pop)
            offset = 0
            for _id in dest_indices.keys():
                # Populations are laid out back to back in dest order.
                start, stop = offset, offset + counts[_id]
                offset = stop
                self.verify_sorted(B[:, start:stop], c)
Esempio n. 4
0
 def test_compare_ts_vcf_genotype_matrixes(self):
     """Compare the ts genotype matrix with one read back from a VCF.

     For each MAF threshold, the tree sequence is written with
     ``ts.write_vcf()``, compressed with ``bgzip`` and indexed with
     ``bcftools``. The matrix from ``vcf.accumulate_matrices`` is then
     resized with ``vcf.resize`` and must equal the output of
     ``convert.ts2mat`` for several row counts.

     Requires the ``bgzip`` and ``bcftools`` executables on the PATH.
     """
     # Compare ts genotype matrix to vcf genotype matrix from ts.write_vcf().
     ts, _ = tests.basic_sim(self.sample_counts)
     # These do not depend on the MAF threshold, so compute them once.
     num_haplotypes = sum(self.sample_counts.values())
     individual_names = [f"ind{j}" for j in range(num_haplotypes // 2)]
     for maf_thres in (0, 0.01, 0.1):
         ac_thres = maf_thres * num_haplotypes
         positions = [
             # List of MAF filtered positions.
             v.site.position for v in ts.variants()
             if sum(v.genotypes) >= ac_thres and num_haplotypes -
             sum(v.genotypes) >= ac_thres
         ]
         # Mock out random variables to ensure we get consistent behaviour
         # between ts and vcf versions.
         rng = mock.MagicMock()
         rng.random = mock.MagicMock(return_value=0.0)
         with tempfile.TemporaryDirectory() as tmpdir:
             vcf_file = tmpdir + "/ts.vcf"
             samples_file = tmpdir + "/samples.txt"
             with open(vcf_file, "w") as f:
                 ts.write_vcf(
                     f,
                     ploidy=2,
                     individual_names=individual_names,
                     position_transform=np.round,
                 )
             with open(samples_file, "w") as f:
                 print(*individual_names, file=f, sep="\n")
             # check=True: fail here if an external tool exits non-zero,
             # rather than later with a confusing error from the reader.
             subprocess.run(["bgzip", vcf_file], check=True)
             vcf_file += ".gz"
             subprocess.run(["bcftools", "index", vcf_file], check=True)
             # A single window spanning the whole sequence.
             winsize = int(ts.sequence_length)
             _, _, _, V, vcf_pos = next(
                 vcf.accumulate_matrices(
                     vcf_file,
                     vcf_pop_intervals=[(0, num_haplotypes)],
                     winsize=winsize,
                     winstep=winsize,
                     samples_file=samples_file,
                     maf_thres=maf_thres,
                     rng=rng,
                 ))
         np.testing.assert_array_equal(vcf_pos, np.round(positions))
         for num_rows in (32, 64, 128):
             A, _ = convert.ts2mat(ts, num_rows, maf_thres, rng)
             self.assertEqual(A.shape, (num_rows, num_haplotypes))
             # Use the float `positions` vector to resize here, not the integer
             # `vcf_pos` vector, to ensure resizing equivalence.
             B = vcf.resize(positions, V, winsize, num_rows)
             self.assertEqual(A.shape, B.shape)
             self.assertEqual(A.dtype, B.dtype)
             np.testing.assert_array_equal(A, B)
Esempio n. 5
0
    def test_verify_partition(self):
        """Check ``convert.verify_partition`` on valid and invalid input.

        Valid partitions must be accepted without raising, each invalid
        one must raise ``ValueError``, and the counts/indices reported for
        a simulated tree sequence must form a valid partition.
        """
        convert.verify_partition([0], [100], 100)
        convert.verify_partition([0, 10, 20, 30], [10, 10, 10, 10], 40)

        # Each invalid case gets its own assertRaises context. The first
        # exception leaves the ``with`` body, so further calls placed in
        # the same body would never be executed.
        invalid_cases = (
            ([0], [100], 200),
            ([0], [200], 100),
            ([0, 10], [20, 10], 30),
            ([0, 100], [10, 10], 20),
        )
        for args in invalid_cases:
            with self.subTest(args=args):
                with self.assertRaises(ValueError):
                    convert.verify_partition(*args)

        ts, _ = tests.basic_sim(self.sample_counts)
        counts, indices = convert.ts_pop_counts_indices(ts)
        convert.verify_partition(indices.values(), counts.values(),
                                 sum(counts.values()))
Esempio n. 6
0
 def test_ts_pop_counts_indices(self):
     """Check the counts and start indices from ``ts_pop_counts_indices``.

     Both mappings must share the same keys, the counts must match
     ``self.sample_counts``, and each start index must equal the running
     total of the counts that precede it.
     """
     ts, model = tests.basic_sim(self.sample_counts)
     counts, indices = convert.ts_pop_counts_indices(ts)

     # The two mappings must have the same keys, one per sampled
     # population.
     self.assertEqual(len(indices), len(counts))
     self.assertEqual(indices.keys(), counts.keys())
     self.assertEqual(len(counts), len(self.sample_counts))

     # Map population id -> position in the model's population list.
     index_of = {}
     for position, population in enumerate(model.populations):
         index_of[population.id] = position
     for name, expected in self.sample_counts.items():
         self.assertEqual(counts[index_of[name]], expected)

     # Start indices must be the cumulative sum of the counts.
     running_total = 0
     for key, start in indices.items():
         self.assertEqual(start, running_total)
         running_total = running_total + counts[key]