Ejemplo n.º 1
0
def filter_by_mac(variations,
                  max_alleles,
                  max_allowable_mac=None,
                  min_allowable_mac=None,
                  filter_id='filter_by_mac',
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False,
                  n_bins=DEF_NUM_BINS,
                  limits=None):
    """Select variations according to the MAC values given by ``calc_mac``.

    The per-variation values are computed with ``calc_mac`` and handed,
    together with the allowed minimum and maximum, to ``_select_vars``,
    which performs the actual selection.

    Args:
        variations: variations object to filter.
        max_alleles: forwarded to ``calc_mac``.
        max_allowable_mac: upper threshold passed to ``_select_vars``
            (``None`` is passed through unchanged).
        min_allowable_mac: lower threshold passed to ``_select_vars``
            (``None`` is passed through unchanged).
        filter_id: identifier stored under ``FLT_ID`` in the result.
        min_num_genotypes: forwarded to ``calc_mac``.
        calc_histogram: when true, a histogram of the MAC values is added
            to the filter statistics.
        n_bins: number of histogram bins.
        limits: histogram range; defaults to
            ``(0, variations.num_samples)`` when ``None``.

    Returns:
        A dict with the selected variations (``FLT_VARS``), the filter id
        (``FLT_ID``) and the filter statistics (``FLT_STATS``).
    """
    macs = calc_mac(variations,
                    max_alleles=max_alleles,
                    min_num_genotypes=min_num_genotypes)

    selection = _select_vars(variations, macs, min_allowable_mac,
                             max_allowable_mac)
    stats = selection[FLT_STATS]

    if calc_histogram:
        hist_range = (0, variations.num_samples) if limits is None else limits
        counts, bin_edges = va.histogram(macs, n_bins=n_bins,
                                         limits=hist_range)
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually set are reported,
        # minimum first.
        stats['limits'] = [
            threshold
            for threshold in (min_allowable_mac, max_allowable_mac)
            if threshold is not None
        ]

    return {FLT_VARS: selection[FLT_VARS], FLT_ID: filter_id, FLT_STATS: stats}
Ejemplo n.º 2
0
    def test_calc_mac2(self):
        """Check calc_mac on dask-backed genotypes with one allele per sample.

        The last variation has every genotype set to -1 and is expected to
        yield NaN.
        """
        gts = np.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                        [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)

        result = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
        macs = compute(result)
        # np.nan instead of np.NaN: the capitalised alias was removed in
        # NumPy 2.0, while np.nan is available in every NumPy version.
        expected = np.array([4, 2, 3, np.nan])
        assert np.allclose(macs, expected, equal_nan=True)
Ejemplo n.º 3
0
    def test_calc_mac_in_memory(self):
        """Check calc_mac on genotypes stored as a plain numpy array.

        The last variation has every genotype set to -1 and is expected to
        yield NaN.
        """
        variations = Variations(samples=np.array(['aa', 'bb']))

        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        macs = list(calc_mac(variations, max_alleles=3, min_num_genotypes=0))
        expected = [2, 1, 1, math.nan]

        # zip() stops at the shorter sequence, so without this check a result
        # with missing entries would pass unnoticed.
        self.assertEqual(len(macs), len(expected))
        for a, b in zip(macs, expected):
            if math.isnan(a):
                # NaN never compares equal, so it is matched explicitly.
                self.assertTrue(math.isnan(b))
                continue
            self.assertAlmostEqual(a, b, places=2)
Ejemplo n.º 4
0
    def test_calc_mac(self):
        """Check calc_mac on dask-backed genotypes after a filtering step.

        The last variation has every genotype set to -1 and is expected to
        yield NaN.
        """
        variations = Variations(samples=da.array(['aa', 'bb']))

        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        variations[GT_FIELD] = da.from_array(gts)
        # With this step we create a variation with dask arrays of unknown
        # shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
        result = list(compute(macs))
        expected = [2, 1, 1, math.nan]

        # zip() stops at the shorter sequence, so without this check a result
        # with missing entries would pass unnoticed.
        self.assertEqual(len(result), len(expected))
        for a, b in zip(result, expected):
            if math.isnan(a):
                # NaN never compares equal, so it is matched explicitly.
                self.assertTrue(math.isnan(b))
                continue
            self.assertAlmostEqual(a, b, places=2)