def test_filter_by_call_rate(self):
    """Chain two call-rate filters on dask variations and check the stats."""
    variations = create_dask_variations()
    tasks = {}

    first = remove_low_call_rate_vars(variations, min_call_rate=0.5)
    _add_task_to_pipeline(tasks, first)
    second = remove_low_call_rate_vars(first[FLT_VARS], min_call_rate=0.5,
                                       filter_id='call_rate2')
    _add_task_to_pipeline(tasks, second)

    result = compute(tasks, store_variation_to_memory=True)

    stats = result[FLT_STATS]
    self.assertEqual(stats['call_rate'][N_KEPT], 5)
    self.assertEqual(stats['call_rate'][N_FILTERED_OUT], 2)
    # the second, identical filter has nothing left to remove
    self.assertEqual(stats['call_rate2'][N_KEPT], 5)
    self.assertEqual(stats['call_rate2'][N_FILTERED_OUT], 0)

    kept_gts = result[FLT_VARS][GT_FIELD]
    self.assertEqual(kept_gts.shape, (5, 3, 2))
    self.assertTrue(np.all(result[FLT_VARS].samples ==
                           variations.samples.compute()))
    self.assertEqual(result[FLT_VARS].metadata, variations.metadata)
def test_filter_by_call_rate_in_memory(self):
    """Same two-step call-rate filtering, but starting from in-memory variations."""
    dask_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': dask_vars},
                         store_variation_to_memory=True)['vars']
    tasks = {}

    first = remove_low_call_rate_vars(variations, min_call_rate=0.5)
    _add_task_to_pipeline(tasks, first)
    second = remove_low_call_rate_vars(first[FLT_VARS], min_call_rate=0.5,
                                       filter_id='call_rate2')
    _add_task_to_pipeline(tasks, second)

    # in-memory input: the pipeline results are already materialized,
    # so no compute() call is needed here
    result = tasks

    stats = result[FLT_STATS]
    self.assertEqual(stats['call_rate'][N_KEPT], 5)
    self.assertEqual(stats['call_rate'][N_FILTERED_OUT], 2)
    self.assertEqual(stats['call_rate2'][N_KEPT], 5)
    self.assertEqual(stats['call_rate2'][N_FILTERED_OUT], 0)

    kept_gts = result[FLT_VARS][GT_FIELD]
    self.assertEqual(kept_gts.shape, (5, 3, 2))
    self.assertTrue(np.all(result[FLT_VARS].samples == variations.samples))
    self.assertEqual(result[FLT_VARS].metadata, variations.metadata)
def test_save_to_zarr(self):
    """Round-trip variations through zarr storage and compare every field."""
    zarr_path = TEST_DATA_DIR / 'test.zarr'
    variations = load_zarr(zarr_path, num_vars_per_chunk=2)
    # the filter step produces dask arrays with unknown chunk shapes,
    # which is the case the storage code must handle
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    with TemporaryDirectory() as tmp_dir:
        out_path = Path(tmp_dir)
        store_task = prepare_zarr_storage(variations, out_path)
        dask.compute(store_task, scheduler='sync')

        reloaded = load_zarr(out_path)
        self.assertTrue(np.all(variations.samples.compute() ==
                               reloaded.samples.compute()))
        for field in VARIATION_FIELDS + CALL_FIELDS:
            # QUAL is deliberately not checked
            if field == QUAL_FIELD:
                continue
            original = variations[field]
            if original is None:
                continue
            original = original.compute()
            new = reloaded[field].compute()
            try:
                self.assertTrue(np.all(original == new))
            except AssertionError:
                # dump the rows to ease debugging, then re-raise
                for row in range(original.shape[0]):
                    print(row, original[row, ...], new[row, ...])
                raise
def test_save_to_hdf5(self):
    """Round-trip variations through HDF5 storage and compare every field."""
    h5_path = TEST_DATA_DIR / 'test.h5'
    variations = load_hdf5(h5_path)
    # the filter step produces dask arrays with unknown chunk shapes,
    # which is the case the storage code must handle
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
        out_path = Path(tmp_fhand.name)
        store_task = prepare_hdf5_storage(variations, out_path)
        dask.compute(store_task)

        reloaded = load_hdf5(out_path)
        self.assertEqual(variations.metadata, reloaded.metadata)
        self.assertTrue(np.all(variations.samples.compute() ==
                               reloaded.samples.compute()))
        for field in VARIATION_FIELDS + CALL_FIELDS:
            # QUAL is deliberately not checked
            if field == QUAL_FIELD:
                continue
            original = variations[field]
            if original is None:
                continue
            self.assertTrue(np.all(original.compute() ==
                                   reloaded[field].compute()))
def test_calc_obs_het(self):
    """Observed heterozygosity, with and without depth limits on het calls."""
    variations = Variations(samples=da.array(['a', 'b', 'c', 'd']))
    gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                    [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = np.array([[5, 12, 10, 10],
                    [10, 10, 10, 10]])
    variations[GT_FIELD] = da.from_array(gts)
    variations[DP_FIELD] = da.from_array(dps)
    # the filter step produces dask arrays with unknown chunk shapes
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    het = calc_obs_het(variations, min_num_genotypes=0)
    self.assertTrue(np.allclose(het.compute(), [0.5, 0]))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       min_call_dp_for_het_call=10)
    self.assertTrue(np.allclose(het.compute(), [1, 0]))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       max_call_dp_for_het_call=11)
    self.assertTrue(np.allclose(het.compute(), [0, 0]))

    het = calc_obs_het(variations, min_num_genotypes=0,
                       min_call_dp_for_het_call=5)
    self.assertTrue(np.allclose(het.compute(), [0.5, 0]))
def test_filter_by_call_rate_twice(self):
    """Filtering everything out first leaves nothing for the second filter."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    tasks = {}

    # a call rate above 1 makes no sense, but it removes every variation
    all_removed = remove_low_call_rate_vars(variations, min_call_rate=1.1)
    tasks.update(all_removed)
    second = remove_low_call_rate_vars(all_removed[FLT_VARS],
                                       min_call_rate=0.5,
                                       filter_id='call_rate2')
    tasks.update(second)

    result = compute(tasks, store_variation_to_memory=True)
    self.assertEqual(result[FLT_STATS],
                     {'n_kept': 0, 'n_filtered_out': 0})
def test_filter_and_hist_by_call_rate(self):
    """Filtering with calc_histogram also yields a call-rate histogram."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    tasks = {}
    task = remove_low_call_rate_vars(variations, min_call_rate=0.5,
                                     calc_histogram=True)
    _add_task_to_pipeline(tasks, task)

    result = compute(tasks, store_variation_to_memory=True)

    hist_stats = result[FLT_STATS]['call_rate']
    self.assertEqual(len(hist_stats[COUNT]), DEF_NUM_BINS)
    # one more edge than bins, as usual for histograms
    self.assertEqual(len(hist_stats[BIN_EDGES]), DEF_NUM_BINS + 1)
    self.assertEqual(hist_stats['limits'], [0.5])
def test_filter_and_hist_by_call_rate_in_memory(self):
    """Histogram-producing filter run over in-memory variations."""
    dask_vars = create_dask_variations()
    variations = compute({'vars': dask_vars},
                         store_variation_to_memory=True)['vars']
    tasks = {}
    task = remove_low_call_rate_vars(variations, min_call_rate=0.5,
                                     calc_histogram=True)
    _add_task_to_pipeline(tasks, task)

    # in-memory input: the pipeline results are already materialized
    result = tasks

    hist_stats = result[FLT_STATS]['call_rate']
    self.assertEqual(len(hist_stats[COUNT]), DEF_NUM_BINS)
    # one more edge than bins, as usual for histograms
    self.assertEqual(len(hist_stats[BIN_EDGES]), DEF_NUM_BINS + 1)
    self.assertEqual(hist_stats['limits'], [0.5])
def test_calc_mac(self):
    """MAC values, including nan for an all-missing variation."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = da.from_array(gts)
    # the filter step produces dask arrays with unknown chunk shapes
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    macs = calc_mac(variations, max_alleles=3, min_num_genotypes=0)
    result = compute(macs)

    expected = [2, 1, 1, math.nan]
    for got, want in zip(result, expected):
        if math.isnan(got):
            self.assertTrue(math.isnan(want))
        else:
            self.assertAlmostEqual(got, want, places=2)
def test_gts_to_012mat(self):
    """gts_as_mat012 gives the same 012 matrix for all variation backends."""
    expected = [[-1, 0, 2], [-1, 0, 2], [-1, 0, 2], [1, -1, 0],
                [-1, -1, -1], [-1, 1, -1], [-1, 1, 2]]

    # dask variations whose chunk shapes are unknown (after filtering)
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = remove_low_call_rate_vars(variations,
                                           min_call_rate=0)[FLT_VARS]
    gts012 = va.gts_as_mat012(variations[GT_FIELD])
    self.assertTrue(np.allclose(expected, gts012.compute()))

    # plain dask variations, straight from the zarr file
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    gts012 = va.gts_as_mat012(variations[GT_FIELD])
    self.assertTrue(np.allclose(expected, gts012.compute()))

    # in-memory variations
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    gts012 = va.gts_as_mat012(variations[GT_FIELD])
    self.assertTrue(np.allclose(expected, gts012))
def test_calc_maf_by_allele_count(self):
    """maf computed from the RO/AO allele-count fields."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    variations[GT_FIELD] = da.from_array([[[-1, 1], [2, 1]],
                                          [[-1, -1], [-1, 2]],
                                          [[1, -1], [1, 1]]])
    variations[RO_FIELD] = da.from_array(np.array([[-1, 8],
                                                   [-1, -1],
                                                   [6, 4]]))
    variations[AO_FIELD] = da.from_array(np.array([[[1, 4], [2, 1]],
                                                   [[-1, -1], [3, 3]],
                                                   [[1, 4], [5, 1]]]))
    # the filter step produces dask arrays with unknown chunk shapes
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

    mafs = calc_maf_by_allele_count(variations, min_num_genotypes=0)
    result = compute(mafs)

    expected = [0.5, 0.5, 0.47619048]
    for got, want in zip(result, expected):
        self.assertAlmostEqual(got, want, places=2)
def _create_dask_variations():
    """Return the test zarr variations passed through a min_call_rate=0 filter."""
    zarr_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
    filtered = remove_low_call_rate_vars(zarr_vars, min_call_rate=0)
    return filtered[FLT_VARS]
def test_iterate_chunk_pairs(self):
    """Every item yielded by iterate_chunk_pairs is a pair of chunks.

    Bug fix: the original used ``self.assertTrue(len(p), 2)``, which
    passes ``2`` as the failure *message*, so it only checked that the
    length was truthy and would accept pairs of any non-zero size.
    ``assertEqual`` actually enforces the length.
    """
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr',
                           num_vars_per_chunk=1)
    # the filter step produces dask arrays with unknown chunk shapes
    variations = remove_low_call_rate_vars(variations,
                                           min_call_rate=0)[FLT_VARS]
    for pair in iterate_chunk_pairs(variations, max_distance=100000):
        self.assertEqual(len(pair), 2)
def create_non_materialized_snp_filtered_variations():
    """Return dask variations passed through a min_call_rate=0 filter."""
    dask_vars = create_dask_variations()
    filtered = remove_low_call_rate_vars(dask_vars, min_call_rate=0)
    return filtered[FLT_VARS]
def _create_empty_dask_variations():
    """Return dask variations filtered with a call rate no variation can reach."""
    dask_vars = _load_one_dask()
    # min_call_rate above 1 filters everything out
    filtered = remove_low_call_rate_vars(dask_vars, min_call_rate=1.1)
    return filtered[FLT_VARS]