Exemple #1
0
    def test_empty_pop(self):
        """Check the Dest distance when one population has no called genotypes.

        Samples 1-5 form the first population and 6-11 the second. In the
        first scenario the genotype columns of the first population are
        blanked only in the last variation; in the second scenario they
        are blanked in every variation and the distance must be NaN.
        """
        no_call = (-1, -1)

        def second_pop_calls():
            # Genotypes of the last six columns; the same in every row used.
            return [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), no_call]

        def blank_first_pop_row():
            return [no_call] * 5 + second_pop_calls()

        def dest_distances(genotypes, depths):
            # Build dask-backed variations and compute the distance.
            snps = Variations()
            snps.samples = da.from_array(np.array(list(range(1, 12))))
            snps[GT_FIELD] = da.from_array(np.array(genotypes))
            snps[DP_FIELD] = da.from_array(np.array(depths))
            return calc_dset_pop_distance(
                snps,
                max_alleles=5,
                silence_runtime_warnings=True,
                populations=[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]],
                min_num_genotypes=0)

        full_depth_row = [20] * 10 + [0]
        blank_first_pop_depth_row = [0] * 5 + [20] * 5 + [0]

        # Only the last variation lacks calls for the first five columns.
        partly_blank_gts = [
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + second_pop_calls(),
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + second_pop_calls(),
            blank_first_pop_row(),
        ]
        partly_blank_dps = [
            list(full_depth_row),
            list(full_depth_row),
            list(blank_first_pop_depth_row),
        ]
        dists = dest_distances(partly_blank_gts, partly_blank_dps)
        assert np.allclose(dists, [0.65490196])

        # Every variation lacks calls for the first five columns.
        fully_blank_gts = [blank_first_pop_row() for _ in range(3)]
        fully_blank_dps = [list(blank_first_pop_depth_row) for _ in range(3)]
        dists = dest_distances(fully_blank_gts, fully_blank_dps)
        assert np.isnan(dists[0])
Exemple #2
0
    def test_calc_missing_memory(self):
        """Check per-variation missing genotype counts on in-memory arrays."""

        def count_missing(genotypes):
            # One sample name per genotype column: '0', '1', ...
            snps = Variations()
            snps.samples = np.array(
                [str(idx) for idx in range(genotypes.shape[1])])
            snps[GT_FIELD] = genotypes
            return calc_missing_gt(snps, rates=False)

        two_sample_gts = np.array([
            [[0, 0], [0, 0]],
            [[0, 0], [-1, -1]],
            [[0, 0], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ])
        # Expressed as called genotypes per variation, so the missing
        # count is the number of samples (2) minus it.
        num_called = np.array([2, 1, 1, 0])
        assert np.all(count_missing(two_sample_gts) == 2 - num_called)

        five_sample_gts = np.array([
            [[0, 0], [0, 0], [0, 0], [0, 0], [0, -1]],
            [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
            [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
            [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
        ])
        # The half-called genotype [0, -1] in the first variation is
        # expected to count as 0.5 missing.
        num_missing = np.array([0.5, 1, 2, 4])
        assert np.all(count_missing(five_sample_gts) == num_missing)
Exemple #3
0
    def test_kosman_missing_in_memory(self):
        """Compare Kosman results for two in-memory sample pairs.

        The second pair (c, d) equals the first (a, b) except that the
        first two variations are set to -1 in both samples; the per-pair
        results returned by `_kosman` must be equal.
        """

        def kosman_between(first_sample, second_sample):
            # Put both individuals side by side and compare '0' vs '1'.
            genotypes = np.stack((first_sample, second_sample), axis=1)
            snps = Variations()
            snps.samples = np.array(
                [str(idx) for idx in range(genotypes.shape[1])])
            snps[GT_FIELD] = genotypes
            only_first = keep_samples(snps, ['0'])[FLT_VARS]
            only_second = keep_samples(snps, ['1'])[FLT_VARS]
            return _kosman(only_first, only_second)

        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        distance_ab = kosman_between(a, b)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        distance_cd = kosman_between(c, d)

        assert np.all(distance_ab == distance_cd)
Exemple #4
0
    def test_nei_dist_in_memory(self):
        """Check the pairwise unbiased Nei distance on in-memory genotypes."""

        def nei_distances(genotypes, max_alleles):
            # Four samples split into two populations of two samples each.
            snps = Variations()
            snps.samples = np.array([1, 2, 3, 4])
            snps[GT_FIELD] = genotypes
            return calc_pop_pairwise_unbiased_nei_dists(
                snps,
                max_alleles=max_alleles,
                populations=[[1, 2], [3, 4]],
                silence_runtime_warnings=True,
                min_num_genotypes=1)

        def called_and_blank_gts():
            # Two called variations followed by one with every call at -1.
            return np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                             [[1, 1], [1, 2], [2, 2], [2, 1]],
                             [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])

        dists = nei_distances(called_and_blank_gts(), max_alleles=6)
        assert math.isclose(dists[0], 0.3726315908494797)

        # all missing
        blank_gts = np.array([[[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        dists = nei_distances(blank_gts, max_alleles=1)
        assert math.isnan(dists[0])

        # min_num_genotypes
        # NOTE(review): this scenario uses exactly the same data and the
        # same min_num_genotypes=1 as the first one -- confirm whether a
        # different threshold was intended here.
        dists = nei_distances(called_and_blank_gts(), max_alleles=6)
        assert math.isclose(dists[0], 0.3726315908494797)
Exemple #5
0
    def test_kosman_2_indis(self):
        """Check the mean Kosman result between two dask-backed individuals.

        Three pairs are compared: two partly different individuals, two
        identical individuals and a mixed pair. Non-zero expectations are
        compared with a tolerance because they are computed floats whose
        exact value depends on the summation order.
        """

        def mean_kosman(first_sample, second_sample):
            # Stack both individuals as samples '0' and '1', compare them
            # variation by variation and average the computed array.
            genotypes = np.stack((first_sample, second_sample), axis=1)
            snps = Variations()
            names = np.array([str(idx) for idx in range(genotypes.shape[1])])
            snps.samples = da.from_array(names)
            snps[GT_FIELD] = da.from_array(genotypes)

            only_first = keep_samples(snps, ['0'])[FLT_VARS]
            only_second = keep_samples(snps, ['1'])[FLT_VARS]
            per_snp = compute(_kosman(only_first, only_second),
                              silence_runtime_warnings=True)
            return per_snp.sum() / per_snp.shape[0]

        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        assert np.isclose(mean_kosman(a, b), 1 / 3)

        # Two identical individuals must not differ at all.
        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        assert mean_kosman(c, d) == 0

        assert np.isclose(mean_kosman(b, d), 0.45)
Exemple #6
0
    def test_dest_jost_distance_in_memory(self):
        """Check the Dest distance between two populations held in memory.

        The same variations are evaluated twice: with no minimum number of
        genotypes, and with a minimum of 6, for which every distance is
        expected to be NaN.
        """
        genotypes = np.array([
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3), (3, 2), (3, 4), (2, 2),
             (2, 4), (4, 4), (-1, -1)],
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3), (3, 2), (3, 4), (2, 2),
             (2, 4), (4, 4), (-1, -1)],
        ])
        depths = np.array([[20] * 11, [20] * 11])
        populations = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

        snps = Variations()
        snps.samples = np.array(list(range(1, 12)))
        snps[GT_FIELD] = genotypes
        snps[DP_FIELD] = depths

        def dest_with(min_num_genotypes):
            return calc_dset_pop_distance(
                snps,
                max_alleles=5,
                silence_runtime_warnings=True,
                populations=populations,
                min_num_genotypes=min_num_genotypes)

        assert np.allclose(dest_with(0), [0.65490196])
        assert np.all(np.isnan(dest_with(6)))
Exemple #7
0
    def test_kosman_2_indis_in_memory(self):
        """Check the mean Kosman result between two in-memory individuals.

        Three pairs are compared: two partly different individuals, two
        identical individuals and a mixed pair. Non-zero expectations are
        compared with a tolerance instead of exact float equality.
        """

        def per_snp_kosman(first_sample, second_sample):
            # Stack both individuals as samples '0' and '1' and compare
            # them variation by variation.
            genotypes = np.stack((first_sample, second_sample), axis=1)
            snps = Variations()
            snps.samples = np.array(
                [str(idx) for idx in range(genotypes.shape[1])])
            snps[GT_FIELD] = genotypes

            only_first = keep_samples(snps, ['0'])[FLT_VARS]
            only_second = keep_samples(snps, ['1'])[FLT_VARS]
            return _kosman(only_first, only_second)

        def mean_of(per_snp):
            return per_snp.sum() / per_snp.shape[0]

        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        distance_ab = per_snp_kosman(a, b)
        # In-memory inputs are expected to give an in-memory result.
        va.make_sure_array_is_in_memory(distance_ab)
        assert np.isclose(mean_of(distance_ab), 1 / 3)

        # Two identical individuals must not differ at all.
        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        assert mean_of(per_snp_kosman(c, d)) == 0

        assert np.isclose(mean_of(per_snp_kosman(b, d)), 0.45)
    def test_unavailable_shape(self):
        """Check when `num_variations` can and cannot be read.

        With an in-memory genotype array the count is available; for the
        variations returned by `_create_empty_dask_variations` reading it
        must raise NotMaterializedError.
        """
        in_memory = Variations()
        in_memory.samples = ['1', '2', '3']
        in_memory[GT_FIELD] = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        assert in_memory.num_variations == 3

        lazy = _create_empty_dask_variations()
        with self.assertRaises(NotMaterializedError):
            # Only the attribute access matters; the value is discarded.
            _ = lazy.num_variations
Exemple #9
0
    def xtest_do_pca(self):
        """Run `do_pca` on zarr-loaded data and on a tiny dask-backed set.

        The `x` prefix keeps test runners that collect `test*` methods
        from picking this one up.
        """
        # Only checks that the call completes on the zarr test data.
        do_pca(load_zarr(TEST_DATA_DIR / 'test.zarr'))

        # Samples 'a' and 'b' share every genotype; 'c' differs everywhere.
        genotypes = np.array([[[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]]])
        snps = Variations()
        snps.samples = da.from_array(np.array(['a', 'b', 'c']))
        snps[GT_FIELD] = da.from_array(genotypes)

        projections = do_pca(snps)['projections']
        num_samples = genotypes.shape[1]
        assert projections.shape[0] == num_samples
        assert np.allclose(projections[0], projections[1])
        assert not np.allclose(projections[0], projections[2])
Exemple #10
0
    def test_kosman_missing(self):
        """Compare Kosman results for two dask-backed sample pairs.

        The second pair (c, d) equals the first (a, b) except that the
        first two variations are set to -1 in both samples; the computed
        per-pair results must be equal.
        """

        def computed_kosman(first_sample, second_sample):
            # Stack both individuals as samples '0' and '1', compare them
            # and materialise the resulting array.
            genotypes = np.stack((first_sample, second_sample), axis=1)
            snps = Variations()
            names = np.array([str(idx) for idx in range(genotypes.shape[1])])
            snps.samples = da.from_array(names)
            snps[GT_FIELD] = da.from_array(genotypes)

            only_first = keep_samples(snps, ['0'])[FLT_VARS]
            only_second = keep_samples(snps, ['1'])[FLT_VARS]
            return compute(_kosman(only_first, only_second),
                           silence_runtime_warnings=True)

        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        distance_ab = computed_kosman(a, b)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        distance_cd = computed_kosman(c, d)

        assert np.all(distance_ab == distance_cd)
    def test_iterate_chunks(self):
        """Check chunk iteration over in-memory and zarr-backed variations."""
        # in memory
        in_memory = Variations()
        in_memory.samples = ['1', '2', '3']
        in_memory[GT_FIELD] = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        chunk_iter = in_memory.iterate_chunks(chunk_size=1)
        for row_idx, chunk in enumerate(chunk_iter):
            # With one variation per chunk, chunk N must hold row N.
            expected_row = in_memory[GT_FIELD][row_idx, :]
            assert np.all(chunk[GT_FIELD] == expected_row)
            assert np.all(chunk.samples == in_memory.samples)

        # in disk
        on_disk = load_zarr((TEST_DATA_DIR / 'test.zarr'),
                            num_vars_per_chunk=1)
        num_chunks = len(list(on_disk.iterate_chunks()))
        self.assertEqual(num_chunks, 7)
    def test_basic_operations(self):
        """Walk through the basic life cycle of a `Variations` object.

        Covers the empty state, setting samples once, rejecting call data
        with a wrong shape and finally storing a valid genotype array.
        """
        variations = Variations()
        self.assertEqual(variations.num_variations, 0)
        self.assertEqual(variations.num_samples, 0)

        valid_gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        # trying to add call data without samples fails
        with self.assertRaises(ValueError):
            variations[GT_FIELD] = valid_gts

        # set samples
        variations.samples = ['1', '2', '3']
        self.assertEqual(variations.num_samples, 3)

        # adding again samples fails
        with self.assertRaises(RuntimeError):
            variations.samples = ['1', '2', '3']

        # add variationData
        variations[CHROM_FIELD] = np.array(['chr1', 'chr2', 'chr3'])

        # add data with wrong shape: first a wrong number of rows, then
        # a wrong number of sample columns
        wrong_shapes = [
            (np.array([[1, 2, 3]]), 'Introduced matrix shape'),
            (np.array([[1, 2], [1, 2], [1, 2]]), 'not fit with num samples'),
        ]
        for bad_gts, expected_msg in wrong_shapes:
            with self.assertRaises(ValueError) as context:
                variations[GT_FIELD] = bad_gts
            self.assertIn(expected_msg, str(context.exception))

        # set gt array
        gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
        variations[GT_FIELD] = gts
        self.assertTrue(np.array_equal(gts, variations[GT_FIELD]))
        self.assertEqual(variations.num_variations, 3)
Exemple #13
0
    def test_do_pca_in_memory(self):
        """Run `do_pca` on variations materialised into memory."""
        # Load the zarr test data and pull the whole Variations object
        # into memory; this first call only checks that do_pca completes.
        loaded = load_zarr(TEST_DATA_DIR / 'test.zarr')
        materialised = compute({'vars': loaded},
                               store_variation_to_memory=True)
        do_pca(materialised['vars'])

        # NOTE(review): despite the test name, the fields below are dask
        # arrays and are never materialised -- confirm this is intended.
        # Samples 'a' and 'b' share every genotype; 'c' differs everywhere.
        genotypes = np.array([[[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]],
                              [[0, 0], [0, 0], [1, 1]]])
        snps = Variations()
        snps.samples = da.from_array(np.array(['a', 'b', 'c']))
        snps[GT_FIELD] = da.from_array(genotypes)

        projections = do_pca(snps)['projections']
        num_samples = genotypes.shape[1]
        assert projections.shape[0] == num_samples
        assert np.allclose(projections[0], projections[1])
        assert not np.allclose(projections[0], projections[2])
Exemple #14
0
    def test_kosman_pairwise_in_memory(self):
        """Check the pairwise Kosman distances among four individuals."""
        indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                           [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                           [1, 0], [1, 0], [1, 0], [0, 1], [1, 2]])
        indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)

        # Stack per individual, then move the variation axis to the front.
        by_individual = np.stack((indi_a, indi_b, indi_c, indi_d), axis=0)
        genotypes = np.transpose(by_individual,
                                 axes=(1, 0, 2)).astype(np.int16)

        snps = Variations()
        snps.samples = np.array(
            [str(idx) for idx in range(genotypes.shape[1])])
        snps[GT_FIELD] = genotypes

        # The second returned value is not checked by this test.
        distances, _ = calc_kosman_dist(snps)
        # One value per pair of individuals (six pairs for four of them).
        expected = [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.]
        assert np.allclose(distances, expected)
Exemple #15
0
    def test_filter_obs_het_in_mem(self):
        """Check the observed heterozygosity filter on in-memory variations."""
        gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                        [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        lazy = Variations()
        lazy.samples = da.from_array([1, 2, 3, 4, 5])
        lazy[GT_FIELD] = da.from_array(gts)
        # Materialise the dask-backed variations before filtering.
        in_memory = compute({'vars': lazy},
                            store_variation_to_memory=True)['vars']

        def filter_het(**thresholds):
            return filter_by_obs_heterocigosis(in_memory, **thresholds)

        # Without heterozygosity limits every variation is kept.
        result = filter_het(min_num_genotypes=0)
        assert np.all(result[FLT_VARS][GT_FIELD] == gts)
        assert result[FLT_STATS][N_KEPT] == 4
        assert result[FLT_STATS][N_FILTERED_OUT] == 0

        # A lower limit drops the variation without heterozygous calls.
        result = filter_het(min_allowable_het=0.2, min_num_genotypes=0)
        assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
        assert result[FLT_STATS][N_KEPT] == 3
        assert result[FLT_STATS][N_FILTERED_OUT] == 1

        # Requiring more genotypes than there are samples drops everything.
        result = filter_het(min_allowable_het=0.2, min_num_genotypes=10)
        assert result[FLT_STATS][N_KEPT] == 0
        assert result[FLT_STATS][N_FILTERED_OUT] == 4

        # An upper limit keeps only the variation without heterozygotes.
        result = filter_het(max_allowable_het=0.1, min_num_genotypes=0)
        assert np.all(result[FLT_VARS][GT_FIELD] == gts[[1]])

        # Both limits together.
        result = filter_het(min_allowable_het=0.2,
                            max_allowable_het=0.3,
                            min_num_genotypes=0)
        assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
Exemple #16
0
def compute(data,
            store_variation_to_memory=False,
            silence_runtime_warnings=False):
    """Materialise the lazy content of *data*.

    If *data* is a ``Delayed`` or a dask array, its computed value is
    returned directly. Otherwise the arrays gathered by
    ``_collect_cargo_to_compute`` are evaluated with a single
    ``dask.compute`` call and each result is written back under its
    original key, so *data* is modified in place and returned.

    Args:
        data: a ``Delayed``, a dask array, or a container accepted by
            ``_collect_cargo_to_compute``.
        store_variation_to_memory: when true, results that came from a
            ``Variations`` object are collected into a new ``Variations``
            that replaces the original entry in *data*; when false, the
            entry named by the collected variation key (if any) is
            deleted from *data*.
        silence_runtime_warnings: when true, ``RuntimeWarning`` is ignored
            while the computation runs.

    Returns:
        The computed value for a ``Delayed`` or dask array input,
        otherwise *data* itself.
    """

    def _evaluate(thunk):
        # Run the computation inside a warnings scope so that the filter,
        # if requested, does not outlive this call.
        with warnings.catch_warnings():
            if silence_runtime_warnings:
                warnings.filterwarnings("ignore", category=RuntimeWarning)
            return thunk()

    if isinstance(data, (Delayed, da.Array)):
        return _evaluate(lambda: data.compute())

    (darrays_to_compute, orig_keys, orig_dicts,
     variation_info) = _collect_cargo_to_compute(
         data, store_variation_to_memory=store_variation_to_memory)

    computed_darrays = _evaluate(lambda: dask.compute(*darrays_to_compute))

    in_memory_variations = None
    for position, computed in enumerate(computed_darrays):
        key = orig_keys[position]
        container = orig_dicts[position]

        goes_to_new_variations = (store_variation_to_memory
                                  and isinstance(container, Variations))
        if not goes_to_new_variations:
            # Put the result back where the lazy array was found.
            container[key] = computed
            continue

        # The in-memory Variations is only created once a result needs it.
        if in_memory_variations is None:
            in_memory_variations = Variations(
                metadata=variation_info['metadata'])
        # Samples are an attribute of Variations, not one of its fields.
        if key == 'samples':
            in_memory_variations.samples = computed
        else:
            in_memory_variations[key] = computed

    variation_key = variation_info['key']
    if variation_key:
        if store_variation_to_memory:
            data[variation_key] = in_memory_variations
        else:
            del data[variation_key]

    return data