def iterate_chunk_pairs(variations, max_distance, chunk_size=DEF_CHUNK_SIZE):
    """Yield pairs of in-memory chunks whose variations may lie within max_distance.

    The variations are split into chunks of chunk_size. For every chunk, the
    chunk is paired with itself and with every following chunk until either
    the chromosome changes or the gap between the first chunk's last position
    and the candidate chunk's first position exceeds max_distance
    (positions are assumed sorted by chromosome and position — TODO confirm).

    Yields tuples (computed1, computed2) of in-memory variation chunks.
    """
    chunks = list(variations.iterate_chunks(chunk_size))
    # Cache of already materialized chunks, keyed by chunk index, so each
    # chunk is computed at most twice (once as chunk1, reused as chunk2).
    computed_chunks = {}
    for index1, chunk1 in enumerate(chunks):
        # Materialize chunk1 into memory.
        computed1 = compute({'vars': chunk1}, store_variation_to_memory=True,
                            silence_runtime_warnings=True)['vars']
        computed_chunks[index1] = computed1
        chunk1_end_pos = computed1[POS_FIELD][-1]
        chunk1_end_chrom = computed1[CHROM_FIELD][-1]
        # Pair chunk1 with itself and every later chunk while they are close.
        for index, chunk2 in enumerate(chunks[index1:]):
            index2 = index1 + index
            if index2 in computed_chunks:
                computed2 = computed_chunks[index2]
            else:
                computed2 = compute({'vars': chunk2},
                                    store_variation_to_memory=True,
                                    silence_runtime_warnings=True)['vars']
            if index2 != index1:
                # Stop as soon as the candidate chunk starts on another
                # chromosome or beyond the allowed distance.
                chunk2_start_chrom = computed2[CHROM_FIELD][0]
                if chunk1_end_chrom != chunk2_start_chrom:
                    break
                chunk2_start_pos = computed2[POS_FIELD][0]
                if chunk2_start_pos - chunk1_end_pos > max_distance:
                    break
            yield computed1, computed2
        # Drop the previous chunk from the cache: chunks with an index lower
        # than index1 will never be requested again.
        try:
            del computed_chunks[index1 - 1]
        except KeyError:
            pass
def test_calc_missing(self):
    """Missing-genotype counts, rates and the per-sample variant."""
    variations = create_non_materialized_snp_filtered_variations()
    variations = keep_samples(variations, samples=['pepo', 'upv196'])[FLT_VARS]
    counts_task = calc_missing_gt(variations, rates=False)
    computed = compute({'num_missing_gts': counts_task})
    self.assertTrue(np.array_equal(computed['num_missing_gts'],
                                   [1, 1, 1, 0, 2, 2, 1]))

    variations = create_non_materialized_snp_filtered_variations()
    variations = keep_samples(variations, samples=['pepo', 'upv196'])[FLT_VARS]
    rates_task = calc_missing_gt(variations, rates=True)
    computed = compute({'num_missing_gts': rates_task})
    expected_rates = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
    for observed, wanted in zip(computed['num_missing_gts'], expected_rates):
        self.assertAlmostEqual(observed, wanted, places=2)

    # Per-sample stats work on plain dask variations...
    variations = create_dask_variations()
    per_sample_task = calc_missing_gt_per_sample(variations, rates=True)
    compute({'num_missing_gts': per_sample_task})

    # ...but must refuse non-materialized ones.
    variations = create_non_materialized_snp_filtered_variations()
    try:
        calc_missing_gt_per_sample(variations, rates=True)
        self.fail('NotMaterializedError expected')
    except NotMaterializedError:
        pass

    variations = create_dask_variations()
    per_sample_task = calc_missing_gt_per_sample(variations, rates=False)
    compute({'num_missing_gts': per_sample_task})
def test_calc_obs_het2(self):
    """Observed heterozygosity with depth-based thresholds for het calls."""
    gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                    [[0, 0], [0, 0], [0, -1], [-1, -1]]])
    dps = np.array([[5, 12, 10, 10],
                    [10, 10, 10, 10]])
    samples = np.array([str(i) for i in range(gts.shape[1])])
    variations = Variations(samples=da.array(samples))
    variations[GT_FIELD] = da.from_array(gts)
    variations[DP_FIELD] = da.from_array(dps)

    het = compute(calc_obs_het(variations, min_num_genotypes=0))
    assert np.allclose(het, [0.5, 0])

    # Fewer called genotypes than required -> NaN for every variation.
    # np.nan is used instead of the np.NaN alias, which numpy 2.0 removed.
    het = compute(calc_obs_het(variations, min_num_genotypes=10))
    assert np.allclose(het, [np.nan, np.nan], equal_nan=True)

    het = compute(calc_obs_het(variations, min_num_genotypes=0,
                               min_call_dp_for_het_call=10))
    assert np.allclose(het, [1, 0])

    het = compute(calc_obs_het(variations, min_num_genotypes=0,
                               max_call_dp_for_het_call=11))
    assert np.allclose(het, [0, 0])

    het = compute(calc_obs_het(variations, min_num_genotypes=0,
                               min_call_dp_for_het_call=5))
    assert np.allclose(het, [0.5, 0])
def test_kosman_2_indis(self):
    """Kosman distance between two individuals for three genotype pairings."""

    def pairwise_kosman_distance(gts):
        # Build a two-sample Variations object from stacked genotypes and
        # return the mean of the snp-by-snp Kosman comparison array.
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        comparisons = _kosman(vars1, vars2)
        comparisons = compute(comparisons, silence_runtime_warnings=True)
        return comparisons.sum() / comparisons.shape[0]

    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    assert pairwise_kosman_distance(np.stack((indi_a, indi_b),
                                             axis=1)) == 1 / 3

    indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    assert pairwise_kosman_distance(np.stack((indi_c, indi_d), axis=1)) == 0

    assert pairwise_kosman_distance(np.stack((indi_b, indi_d),
                                             axis=1)) == 0.45
def test_filter_by_call_rate_in_memory(self):
    """Chained call-rate filters over in-memory variations run eagerly."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    pipeline = {}
    first = remove_low_call_rate_vars(variations, min_call_rate=0.5)
    _add_task_to_pipeline(pipeline, first)
    second = remove_low_call_rate_vars(first[FLT_VARS], min_call_rate=0.5,
                                       filter_id='call_rate2')
    _add_task_to_pipeline(pipeline, second)

    # No compute() here: filtering in-memory variations produces concrete
    # results directly, so the pipeline dict already holds the stats.
    stats = pipeline[FLT_STATS]
    self.assertEqual(stats['call_rate'][N_KEPT], 5)
    self.assertEqual(stats['call_rate'][N_FILTERED_OUT], 2)
    self.assertEqual(stats['call_rate2'][N_KEPT], 5)
    self.assertEqual(stats['call_rate2'][N_FILTERED_OUT], 0)

    flt_vars = pipeline[FLT_VARS]
    self.assertEqual(flt_vars[GT_FIELD].shape, (5, 3, 2))
    self.assertTrue(np.all(flt_vars.samples == variations.samples))
    self.assertEqual(flt_vars.metadata, variations.metadata)
def test_calc_missing(self):
    """Missing-genotype counts and rates for two kept samples."""
    variations = _create_dask_variations()
    variations = keep_samples(variations, samples=['pepo', 'upv196'])[FLT_VARS]
    counts = compute({'num_missing_gts': calc_missing_gt(variations,
                                                         rates=False)})
    self.assertTrue(np.array_equal(counts['num_missing_gts'],
                                   [1, 1, 1, 0, 2, 2, 1]))

    variations = _create_dask_variations()
    variations = keep_samples(variations, samples=['pepo', 'upv196'])[FLT_VARS]
    rates = compute({'num_missing_gts': calc_missing_gt(variations,
                                                        rates=True)})
    expected_rates = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
    for observed, wanted in zip(rates['num_missing_gts'], expected_rates):
        self.assertAlmostEqual(observed, wanted, places=2)
def test_compute_vars_to_disk(self):
    """compute() can persist variations to zarr while computing other arrays."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    arr1 = da.from_array(np.array([1, 2, 3, 4, 5]))
    arr2 = da.from_array(np.array([6, 7, 8, 9, 0]))
    arr_sum = arr1 + arr2
    with TemporaryDirectory() as tmpdir:
        out_path = Path(tmpdir)
        delayed_store = prepare_zarr_storage(variations, out_path)
        processed = compute({'vars': delayed_store,
                             'data': {'da1': arr1,
                                      'da2': arr2,
                                      'da3': arr_sum}})
        # The stored variations must round-trip through the new zarr dir.
        reloaded = load_zarr(out_path)
        self.assertTrue(np.all(variations.samples.compute() ==
                               reloaded.samples.compute()))
        self.assertTrue(np.all(variations[GT_FIELD].compute() ==
                               reloaded[GT_FIELD].compute()))
        # The unrelated arrays are computed in the same pass.
        self.assertTrue(np.all(processed['data']['da1'] == [1, 2, 3, 4, 5]))
        self.assertTrue(np.all(processed['data']['da3'] == [7, 9, 11, 13, 5]))
def test_filter_by_call_rate(self):
    """Two chained call-rate filters over lazy dask variations."""
    variations = create_dask_variations()
    pipeline = {}
    first = remove_low_call_rate_vars(variations, min_call_rate=0.5)
    _add_task_to_pipeline(pipeline, first)
    second = remove_low_call_rate_vars(first[FLT_VARS], min_call_rate=0.5,
                                       filter_id='call_rate2')
    _add_task_to_pipeline(pipeline, second)
    processed = compute(pipeline, store_variation_to_memory=True)

    stats = processed[FLT_STATS]
    self.assertEqual(stats['call_rate'][N_KEPT], 5)
    self.assertEqual(stats['call_rate'][N_FILTERED_OUT], 2)
    self.assertEqual(stats['call_rate2'][N_KEPT], 5)
    self.assertEqual(stats['call_rate2'][N_FILTERED_OUT], 0)

    flt_vars = processed[FLT_VARS]
    self.assertEqual(flt_vars[GT_FIELD].shape, (5, 3, 2))
    self.assertTrue(np.all(flt_vars.samples == variations.samples.compute()))
    self.assertEqual(flt_vars.metadata, variations.metadata)
def _test_filter_samples_by_call_rate(self, variations, do_computation):
    """Run two chained sample call-rate filters and check their stats.

    When do_computation is true the tasks are computed (lazy variations);
    otherwise the task dict is assumed to already hold concrete results.
    """
    tasks = {}
    first = remove_low_call_rate_samples(variations, min_call_rate=0.5,
                                         filter_id='call_rate')
    _add_task_to_pipeline(tasks, first)
    second = remove_low_call_rate_samples(first[FLT_VARS], min_call_rate=0.5,
                                          filter_id='call_rate2')
    _add_task_to_pipeline(tasks, second)

    processed = (compute(tasks, store_variation_to_memory=True)
                 if do_computation else tasks)

    self.assertEqual(processed[FLT_STATS]['call_rate'][N_SAMPLES_KEPT], 2)
    self.assertEqual(
        processed[FLT_STATS]['call_rate'][N_SAMPLES_FILTERED_OUT], 1)
    self.assertEqual(processed[FLT_STATS]['call_rate2'][N_SAMPLES_KEPT], 2)
    self.assertEqual(
        processed[FLT_STATS]['call_rate2'][N_SAMPLES_FILTERED_OUT], 0)

    self.assertEqual(processed[FLT_VARS][GT_FIELD].shape, (7, 2, 2))
    self.assertTrue(
        np.all(processed[FLT_VARS].samples == ['mu16', 'upv196']))
    self.assertEqual(processed[FLT_VARS].metadata, variations.metadata)
def summarize_variations(in_zarr_path, out_dir_path, draw_missin_rate=True,
                         draw_mac=True, draw_maf=True, draw_obs_het=True,
                         min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                         min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                         num_bins=DEF_NUM_BINS, silence_runtime_warnings=True):
    """Plot histograms of basic variation statistics and write a summary.

    For each requested statistic (call rate, MAC, MAF by genotype and
    observed heterozygosity) a lazy histogram is built, everything is
    evaluated in a single compute() call, and one PNG per statistic plus a
    stats.txt file are written into out_dir_path.

    Parameters: in_zarr_path is the zarr store to load; out_dir_path is an
    existing directory (a Path) that receives the output files; the draw_*
    flags select which histograms are produced.
    """
    stats = {}
    variations = load_zarr(in_zarr_path)
    max_alleles = variations[ALT_FIELD].shape[1]
    num_variations = variations.num_variations
    num_samples = variations.num_samples
    if draw_missin_rate:
        _stats = calc_called_gt(variations, rates=True)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        stats['called'] = {'counts': counts, 'edges': edges}
    if draw_mac:
        _stats = calc_mac(variations, max_alleles, min_num_genotypes)
        counts, edges = va.histogram(_stats, n_bins=num_bins,
                                     limits=(0, variations.num_samples))
        stats['mac'] = {'counts': counts, 'edges': edges}
    if draw_maf:
        _stats = calc_maf_by_gt(variations, max_alleles, min_num_genotypes)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        stats['maf'] = {'counts': counts, 'edges': edges}
    if draw_obs_het:
        _stats = calc_obs_het(
            variations, min_num_genotypes=min_num_genotypes,
            min_call_dp_for_het_call=min_call_dp_for_het_call)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        stats['obs_heterocigosity'] = {'counts': counts, 'edges': edges}
    computed_stats = compute(stats,
                             silence_runtime_warnings=silence_runtime_warnings)
    # Use a distinct loop variable: the original code rebound `stats` here,
    # shadowing the dict of lazy tasks built above.
    for kind, histogram in computed_stats.items():
        with (out_dir_path / f'{kind}.png').open('wb') as out_fhand:
            plot_histogram(histogram['counts'], histogram['edges'], out_fhand,
                           log_scale=True)
    with (out_dir_path / 'stats.txt').open('w') as fhand:
        fhand.write(f'STATS FOR: {in_zarr_path.name}\n')
        fhand.write('-----------' + '-' * len(in_zarr_path.name) + '\n')
        fhand.write(f'Num. variations: {num_variations}\n')
        fhand.write(f'Num. samples: {num_samples}\n')
        fhand.write('\n')
def test_mac_filter(self):
    """A MAC threshold stricter than any variant filters everything out."""
    variations = create_dask_variations(num_vars_per_chunk=2)
    flt_task = filter_by_mac(variations, max_allowable_mac=1, max_alleles=3)
    processed = compute(flt_task, store_variation_to_memory=True,
                        silence_runtime_warnings=True)
    self.assertEqual(processed[FLT_VARS].num_variations, 0)
    self.assertEqual(processed[FLT_STATS],
                     {'n_kept': 0, 'n_filtered_out': 7})
def test_remove_samples(self):
    """Removing two of the three samples leaves only mu16's depths."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    flt_task = remove_samples(variations, samples=['upv196', 'pepo'])
    processed = compute(flt_task, store_variation_to_memory=True)
    self.assertTrue(np.all(['mu16'] == processed[FLT_VARS].samples))
    expected_dps = [[10], [9], [9], [-1], [-1], [9], [10]]
    self.assertTrue(np.all(processed[FLT_VARS][DP_FIELD] == expected_dps))
def test_filter_obs_het(self):
    """Observed-heterozygosity filtering with several min/max thresholds."""
    variations = Variations()
    gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                    [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
    variations.samples = da.from_array([1, 2, 3, 4, 5])
    variations[GT_FIELD] = da.from_array(gts)

    # No het limits: every variation is kept.
    result = compute(filter_by_obs_heterocigosis(variations,
                                                 min_num_genotypes=0),
                     store_variation_to_memory=True)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts)
    assert result[FLT_STATS][N_KEPT] == 4
    assert result[FLT_STATS][N_FILTERED_OUT] == 0

    # A minimum het drops the second (all-homozygous) variation.
    result = compute(filter_by_obs_heterocigosis(variations,
                                                 min_allowable_het=0.2,
                                                 min_num_genotypes=0),
                     store_variation_to_memory=True)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
    assert result[FLT_STATS][N_KEPT] == 3
    assert result[FLT_STATS][N_FILTERED_OUT] == 1

    # min_num_genotypes above the sample count filters everything out.
    result = compute(filter_by_obs_heterocigosis(variations,
                                                 min_allowable_het=0.2,
                                                 min_num_genotypes=10),
                     store_variation_to_memory=True,
                     silence_runtime_warnings=True)
    assert result[FLT_STATS][N_KEPT] == 0
    assert result[FLT_STATS][N_FILTERED_OUT] == 4

    # A maximum het keeps only the all-homozygous variation.
    result = compute(filter_by_obs_heterocigosis(variations,
                                                 max_allowable_het=0.1,
                                                 min_num_genotypes=0),
                     store_variation_to_memory=True)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[1]])

    # Both limits combined.
    result = compute(filter_by_obs_heterocigosis(variations,
                                                 min_allowable_het=0.2,
                                                 max_allowable_het=0.3,
                                                 min_num_genotypes=0),
                     store_variation_to_memory=True)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
def test_expected_het_with_real(self):
    """Expected heterozygosity on the bundled test.zarr dataset."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    max_alleles = variations[ALT_FIELD].shape[1] + 1
    exp_het_task = calc_expected_het(variations, max_alleles=max_alleles,
                                     min_num_genotypes=0)
    exp_het = compute(exp_het_task, silence_runtime_warnings=True)
    np.testing.assert_allclose(
        exp_het, [0.5, 0.5, 0.5, 0.375, np.nan, 0.5, 0.375], equal_nan=True)
def test_mac_filter_in_memory(self):
    """filter_by_mac on in-memory variations produces results eagerly."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr', num_vars_per_chunk=2)
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    # No compute() step: filtering in-memory variations is eager.
    result = filter_by_mac(variations, max_allowable_mac=1, max_alleles=3)
    self.assertEqual(result[FLT_VARS].num_variations, 0)
    self.assertEqual(result[FLT_STATS], {'n_kept': 0, 'n_filtered_out': 7})
def zarr_to_vcf(zarr_path, out_fhand, vcf_format=VCF_FORMAT,
                chunk_size=DEF_CHUNK_SIZE):
    """Write the variations stored in a zarr hierarchy to out_fhand as VCF."""
    variations = load_zarr(zarr_path)
    _write_vcf_meta(variations, out_fhand, vcf_format)
    _write_vcf_header(variations, out_fhand)
    # Materialize one chunk at a time so memory use stays bounded.
    for chunk in variations.iterate_chunks(chunk_size=chunk_size):
        materialized = compute({'vars': chunk},
                               store_variation_to_memory=True)['vars']
        _write_snvs(materialized, out_fhand)
def test_calc_mac2(self):
    """MAC with a minimum-genotype threshold: the all-missing row is NaN."""
    gts = np.array([[[0], [0], [0], [0]],
                    [[0], [0], [1], [1]],
                    [[0], [0], [0], [1]],
                    [[-1], [-1], [-1], [-1]]])
    samples = np.array([str(i) for i in range(gts.shape[1])])
    variations = Variations(samples=da.array(samples))
    variations[GT_FIELD] = da.from_array(gts)
    result = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
    macs = compute(result)
    # The last variation has no called genotypes, so its MAC is NaN.
    # np.nan is used instead of the np.NaN alias, which numpy 2.0 removed.
    assert np.allclose(macs, np.array([4, 2, 3, np.nan]), equal_nan=True)
def test_remove_variations_in_regions(self):
    """Variations falling inside the given regions are dropped."""
    variations, regions = self._create_fake_variations_and_regions()
    flt_task = remove_variations_in_regions(variations, regions)
    processed = compute(flt_task, store_variation_to_memory=True)
    kept = processed[FLT_VARS]
    self.assertTrue(np.all(kept[POS_FIELD] == [1, 2, 3, 6, 7, 8, 9, 0]))
    self.assertTrue(np.all(kept[CHROM_FIELD] == ['chr1'] * 8))
def test_maf_by_allele_count_filter(self):
    """Allele-count-based MAF filter keeps 4 of the 7 dask variations."""
    variations = create_dask_variations()
    flt_task = filter_by_maf_by_allele_count(variations,
                                             max_allowable_maf=0.6,
                                             min_num_genotypes=2)
    processed = compute(flt_task, store_variation_to_memory=True,
                        silence_runtime_warnings=True)
    self.assertEqual(processed[FLT_VARS].num_variations, 4)
    self.assertEqual(processed[FLT_STATS],
                     {'n_kept': 4, 'n_filtered_out': 3})
def test_remove_samples_in_memory(self):
    """remove_samples on in-memory variations produces results eagerly."""
    variations = create_dask_variations()
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    processed = remove_samples(variations, samples=['upv196', 'pepo'])
    self.assertTrue(np.all(['mu16'] == processed[FLT_VARS].samples))
    expected_dps = [[10], [9], [9], [-1], [-1], [9], [10]]
    self.assertTrue(np.all(processed[FLT_VARS][DP_FIELD] == expected_dps))
def test_maf_filter(self):
    """Genotype-based MAF filter keeps 3 of the 7 variations."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    flt_task = filter_by_maf(variations, max_allowable_maf=0.6,
                             max_alleles=3, min_num_genotypes=2)
    processed = compute(flt_task, store_variation_to_memory=True,
                        silence_runtime_warnings=True)
    self.assertEqual(processed[FLT_VARS].num_variations, 3)
    self.assertEqual(processed[FLT_STATS],
                     {'n_kept': 3, 'n_filtered_out': 4})
def test_maf_filter_in_memory(self):
    """filter_by_maf on in-memory variations produces results eagerly."""
    variations = create_dask_variations()
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    # No compute() step: filtering in-memory variations is eager.
    result = filter_by_maf(variations, max_allowable_maf=0.6, max_alleles=3,
                           min_num_genotypes=2)
    self.assertEqual(result[FLT_VARS].num_variations, 3)
    self.assertEqual(result[FLT_STATS], {'n_kept': 3, 'n_filtered_out': 4})
def test_filter_and_hist_by_call_rate(self):
    """With calc_histogram=True the filter stats include histogram data."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    pipeline = {}
    flt_task = remove_low_call_rate_vars(variations, min_call_rate=0.5,
                                         calc_histogram=True)
    _add_task_to_pipeline(pipeline, flt_task)
    processed = compute(pipeline, store_variation_to_memory=True)
    call_rate_stats = processed[FLT_STATS]['call_rate']
    self.assertEqual(len(call_rate_stats[COUNT]), DEF_NUM_BINS)
    self.assertEqual(len(call_rate_stats[BIN_EDGES]), DEF_NUM_BINS + 1)
    self.assertEqual(call_rate_stats['limits'], [0.5])
def test_calc_diversities(self):
    """Diversity summary statistics for the bundled test dataset."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    max_alleles = variations[ALT_FIELD].shape[1] + 1
    div_task = calc_diversities(variations, max_alleles=max_alleles,
                                min_call_dp_for_het_call=0,
                                min_num_genotypes=0,
                                polymorphic_threshold=0.5)
    diversities = compute(div_task, silence_runtime_warnings=True)
    self.assertAlmostEqual(diversities['num_variable_vars'], 6)
    self.assertAlmostEqual(diversities['num_polymorphic_vars'], 4, places=2)
    self.assertAlmostEqual(diversities['exp_het'], 0.458, places=2)
    self.assertAlmostEqual(diversities['obs_het'], 0.333, places=2)
def test_ld_random_pairs_from_different_chroms_in_memory(self):
    """100 random cross-chromosome LD pairs from in-memory variations."""
    variations = load_zarr(TEST_DATA_DIR / 'tomato.apeki_gbs.calmd.zarr',
                           num_vars_per_chunk=200)
    max_alleles = variations[ALT_FIELD].shape[1]
    variations = filter_by_maf(variations, max_alleles=max_alleles,
                               max_allowable_maf=0.98)[FLT_VARS]
    variations = compute({'vars': variations},
                         store_variation_to_memory=True,
                         silence_runtime_warnings=True)['vars']
    ld_pairs = calc_ld_random_pairs_from_different_chroms(
        variations, 100, max_maf=0.98, silence_runtime_warnings=True)
    self.assertEqual(len(list(ld_pairs)), 100)
def test_calc_maf_by_gt2(self):
    """Histogram of genotype-based MAFs over the bundled dataset."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)
    counts, edges = va.histogram(mafs, n_bins=5, limits=(0, 1))
    histogram = compute({'counts': counts, 'edges': edges},
                        silence_runtime_warnings=True)
    self.assertTrue(np.all(histogram['counts'] == [0, 0, 4, 2, 0]))
    self.assertTrue(np.all(np.isclose(histogram['edges'],
                                      [0, 0.2, 0.4, 0.6, 0.8, 1])))
def test_keep_variations_in_regions_in_memory(self):
    """keep_variations_in_regions on in-memory variations runs eagerly."""
    variations, regions = self._create_fake_variations_and_regions()
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    # No compute() step: filtering in-memory variations is eager.
    result = keep_variations_in_regions(variations, regions)
    kept = result[FLT_VARS]
    self.assertTrue(np.all(kept[POS_FIELD] ==
                           [4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0]))
    self.assertTrue(np.all(kept[CHROM_FIELD] ==
                           ['chr1'] * 2 + ['chr2'] * 10))
def test_non_variable_filter(self):
    """keep_variable_variations drops monomorphic and all-missing rows."""
    variations = Variations(samples=da.array(['aa', 'bb']))
    gts = np.array([[[0, 0], [0, 0]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations[GT_FIELD] = da.from_array(gts)
    flt_task = keep_variable_variations(variations, max_alleles=3)
    processed = compute(flt_task, store_variation_to_memory=True)
    self.assertEqual(processed[FLT_VARS].num_variations, 2)
    self.assertEqual(processed[FLT_STATS],
                     {'n_kept': 2, 'n_filtered_out': 2})
def test_maf_by_allele_count_filter_in_memory(self):
    """Allele-count MAF filter on in-memory variations runs eagerly."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    # No compute() step: filtering in-memory variations is eager.
    result = filter_by_maf_by_allele_count(variations,
                                           max_allowable_maf=0.6,
                                           min_num_genotypes=2)
    self.assertEqual(result[FLT_VARS].num_variations, 4)
    self.assertEqual(result[FLT_STATS], {'n_kept': 4, 'n_filtered_out': 3})
def test_keep_samples_in_memory(self):
    """keep_samples on in-memory variations returns filtered data eagerly."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    variations = compute({'vars': variations},
                         store_variation_to_memory=True)['vars']
    processed = keep_samples(variations, samples=['upv196', 'pepo'])
    self.assertTrue(
        np.all(processed[FLT_VARS].samples == ['pepo', 'upv196']))
    expected_dps = [[-1, 9], [-1, 8], [-1, 8], [14, 6], [-1, -1], [-1, -1],
                    [-1, 6]]
    self.assertTrue(np.all(processed[FLT_VARS][DP_FIELD] == expected_dps))