Example #1
0
    def test_calc_obs_het_in_memory(self):
        """Check calc_obs_het on a Variations object holding numpy arrays.

        Two variants and four samples are used.  Every call passes
        min_num_genotypes=0 and only the per-call depth thresholds change
        between cases.
        """
        sample_names = np.array(['a', 'b', 'c', 'd'])
        genotypes = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                              [[0, 0], [0, 0], [0, -1], [-1, -1]]])
        depths = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])

        variations = Variations(samples=sample_names)
        variations[GT_FIELD] = genotypes
        variations[DP_FIELD] = depths

        # NOTE: a check with min_num_genotypes=10 (expecting NaN for both
        # variants) was left disabled in this test and is still not run.

        # (extra keyword arguments, expected heterozygosity per variant)
        cases = (
            ({}, [0.5, 0]),
            ({'min_call_dp_for_het_call': 10}, [1, 0]),
            ({'max_call_dp_for_het_call': 11}, [0, 0]),
            ({'min_call_dp_for_het_call': 5}, [0.5, 0]),
        )
        for depth_kwargs, expected in cases:
            het = calc_obs_het(variations,
                               min_num_genotypes=0,
                               **depth_kwargs)
            self.assertTrue(np.allclose(het, expected))
Example #2
0
    def test_calc_obs_het2(self):
        """Check calc_obs_het on a Variations object holding dask arrays.

        Two variants and four samples are used.  Each result is passed
        through compute() before being compared with the expected values
        (presumably compute() materialises the lazy result -- it is defined
        outside this block).
        """
        gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                        [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        # One sample name per genotype column: '0', '1', '2', '3'.
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)
        variations[DP_FIELD] = da.from_array(dps)

        # No depth thresholds.
        het = calc_obs_het(variations, min_num_genotypes=0)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])

        # Requiring more genotypes than there are samples must yield NaN for
        # every variant.  np.nan is used instead of the np.NaN alias, which
        # was removed in NumPy 2.0.
        het = calc_obs_het(variations, min_num_genotypes=10)
        het = compute(het)
        assert np.allclose(het, [np.nan, np.nan], equal_nan=True)

        # Minimum call depth of 10.
        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=10)
        het = compute(het)
        assert np.allclose(het, [1, 0])

        # Maximum call depth of 11.
        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           max_call_dp_for_het_call=11)
        het = compute(het)
        assert np.allclose(het, [0, 0])

        # Minimum call depth of 5: same expectation as with no threshold.
        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=5)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])
Example #3
0
    def test_calc_obs_het(self):
        """Check calc_obs_het on dask-backed variations of unknown shape.

        Two variants and four samples are used.  Every call passes
        min_num_genotypes=0 and only the per-call depth thresholds change
        between cases.
        """
        genotypes = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                              [[0, 0], [0, 0], [0, -1], [-1, -1]]])
        depths = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])

        variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
        variations[GT_FIELD] = da.from_array(genotypes)
        variations[DP_FIELD] = da.from_array(depths)

        # Filtering with a threshold of 0 yields a Variations object whose
        # dask arrays have unknown shapes, which is the case under test.
        filtered = remove_low_call_rate_vars(variations, 0)
        variations = filtered[FLT_VARS]

        # NOTE: a check with min_num_genotypes=10 (expecting NaN for both
        # variants) was left disabled in this test and is still not run.

        # (extra keyword arguments, expected heterozygosity per variant)
        cases = (
            ({}, [0.5, 0]),
            ({'min_call_dp_for_het_call': 10}, [1, 0]),
            ({'max_call_dp_for_het_call': 11}, [0, 0]),
            ({'min_call_dp_for_het_call': 5}, [0.5, 0]),
        )
        for depth_kwargs, expected in cases:
            het = calc_obs_het(variations,
                               min_num_genotypes=0,
                               **depth_kwargs)
            self.assertTrue(np.allclose(het.compute(), expected))
Example #4
0
def filter_by_obs_heterocigosis(
        variations,
        max_allowable_het=None,
        min_allowable_het=None,
        min_call_dp_for_het_call=None,
        max_call_dp_for_het_call=None,
        filter_id='obs_het',
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
        calc_histogram=False,
        n_bins=DEF_NUM_BINS,
        limits=None):
    """Select variations according to their observed heterozygosity.

    The observed heterozygosity is obtained from calc_obs_het and handed,
    together with the allowable bounds, to _select_vars.

    Args:
        variations: the variations to filter.
        max_allowable_het: upper bound passed to _select_vars, or None.
        min_allowable_het: lower bound passed to _select_vars, or None.
        min_call_dp_for_het_call: forwarded to calc_obs_het.
        max_call_dp_for_het_call: forwarded to calc_obs_het.
        filter_id: identifier stored under FLT_ID in the returned dict.
        min_num_genotypes: forwarded to calc_obs_het.
        calc_histogram: when true, a histogram of the observed
            heterozygosity and the bounds that were set are added to the
            filter stats.
        n_bins: number of histogram bins.
        limits: histogram range; (0, 1) is used when None.

    Returns:
        A dict with the FLT_VARS, FLT_ID and FLT_STATS keys.
    """
    het_options = {
        'min_num_genotypes': min_num_genotypes,
        'min_call_dp_for_het_call': min_call_dp_for_het_call,
        'max_call_dp_for_het_call': max_call_dp_for_het_call,
    }
    obs_het = calc_obs_het(variations, **het_options)

    selection = _select_vars(variations,
                             obs_het,
                             min_allowable=min_allowable_het,
                             max_allowable=max_allowable_het)
    stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(obs_het,
                                         n_bins=n_bins,
                                         limits=hist_range)
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges
        # Only the bounds that were actually given are recorded, lower first.
        stats['limits'] = [
            bound for bound in (min_allowable_het, max_allowable_het)
            if bound is not None
        ]

    return {
        FLT_VARS: selection[FLT_VARS],
        FLT_ID: filter_id,
        FLT_STATS: stats,
    }