def filter_by_mac(variations, max_alleles, max_allowable_mac=None,
                  min_allowable_mac=None, filter_id='filter_by_mac',
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False, n_bins=DEF_NUM_BINS, limits=None):
    """Select variations whose MAC lies within the allowed bounds.

    The per-variation MAC is obtained from ``calc_mac`` (judging by the
    expected values in this project's tests, presumably the count of the
    most frequent allele -- confirm against ``calc_mac``) and handed,
    together with the two bounds, to ``_select_vars``.

    Args:
        variations: Variations object to filter.
        max_alleles: Forwarded to ``calc_mac``.
        max_allowable_mac: Upper bound given to ``_select_vars``; ``None``
            is forwarded unchanged.
        min_allowable_mac: Lower bound given to ``_select_vars``; ``None``
            is forwarded unchanged.
        filter_id: Value stored under ``FLT_ID`` in the returned dict.
        min_num_genotypes: Forwarded to ``calc_mac``.
        calc_histogram: When true, a histogram of the MAC values is added
            to the filter statistics.
        n_bins: Number of histogram bins.
        limits: Histogram range; defaults to
            ``(0, variations.num_samples)`` when ``None``.

    Returns:
        A dict with the keys ``FLT_VARS`` (the selected variations),
        ``FLT_ID`` and ``FLT_STATS``. When ``calc_histogram`` is true the
        statistics also hold ``COUNT``, ``BIN_EDGES`` and ``'limits'``;
        the latter lists the bounds that were not ``None``, minimum first.
    """
    macs = calc_mac(variations, max_alleles=max_alleles,
                    min_num_genotypes=min_num_genotypes)

    selection = _select_vars(variations, macs,
                             min_allowable_mac, max_allowable_mac)

    if calc_histogram:
        hist_range = (0, variations.num_samples) if limits is None else limits
        counts, bin_edges = va.histogram(macs, n_bins=n_bins,
                                         limits=hist_range)

        stats = selection[FLT_STATS]
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges
        # Only the bounds that were actually supplied are recorded.
        stats['limits'] = [bound
                           for bound in (min_allowable_mac, max_allowable_mac)
                           if bound is not None]

    return {FLT_VARS: selection[FLT_VARS],
            FLT_ID: filter_id,
            FLT_STATS: selection[FLT_STATS]}
def test_calc_mac2(self):
    # calc_mac on dask-backed genotypes with a single allele per call:
    # four variants (rows) by four samples. Every call of the last
    # variant is -1 (presumably the missing-genotype marker), and the
    # expected value for that variant is NaN.
    gts = np.array([[[0], [0], [0], [0]],
                    [[0], [0], [1], [1]],
                    [[0], [0], [0], [1]],
                    [[-1], [-1], [-1], [-1]]])
    samples = np.array([str(i) for i in range(gts.shape[1])])
    variations = Variations(samples=da.array(samples))
    variations[GT_FIELD] = da.from_array(gts)

    result = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
    macs = compute(result)

    # np.nan rather than the np.NaN alias: the alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    expected = np.array([4, 2, 3, np.nan])
    assert np.allclose(macs, expected, equal_nan=True)
def test_calc_mac_in_memory(self):
    # calc_mac on plain numpy (in-memory) genotypes: four variants by
    # two samples with two alleles per call. Every call of the last
    # variant is -1 and its expected value is NaN.
    variations = Variations(samples=np.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = gts

    macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
    expected = [2, 1, 1, math.nan]

    # zip() stops at the shorter input, so without this check a result
    # with missing entries (or none at all) would pass unnoticed.
    self.assertEqual(len(macs), len(expected))

    for a, b in zip(macs, expected):
        if math.isnan(a):
            # NaN never compares equal, so it is matched explicitly.
            self.assertTrue(math.isnan(b))
            continue
        self.assertAlmostEqual(a, b, places=2)
def test_calc_mac(self):
    # calc_mac on dask-backed genotypes whose shapes are unknown until
    # computed: four variants by two samples with two alleles per call.
    variations = Variations(samples=da.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = da.from_array(gts)

    # with this step we create a variation with dask arrays of unknown
    # shapes
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
    result = compute(macs)
    expected = [2, 1, 1, math.nan]

    # zip() stops at the shorter input, so without this check a result
    # with missing entries (or none at all) would pass unnoticed.
    self.assertEqual(len(result), len(expected))

    for a, b in zip(result, expected):
        if math.isnan(a):
            # NaN never compares equal, so it is matched explicitly.
            self.assertTrue(math.isnan(b))
            continue
        self.assertAlmostEqual(a, b, places=2)