Esempio n. 1
0
def iterate_chunk_pairs(variations, max_distance, chunk_size=DEF_CHUNK_SIZE):
    """Yield pairs of in-memory chunks that are close enough to be compared.

    The variations are split with ``variations.iterate_chunks(chunk_size)``.
    Every chunk is first paired with itself and then with each following
    chunk, until a following chunk either starts in a different chromosome
    than the one in which the first chunk ends, or starts more than
    ``max_distance`` after the last position of the first chunk.

    Args:
        variations: object providing ``iterate_chunks(chunk_size)``.
        max_distance: maximum allowed difference between the first position
            of the second chunk and the last position of the first chunk.
        chunk_size: chunk size handed to ``iterate_chunks``.

    Yields:
        Tuples ``(computed1, computed2)`` with the computed chunks.

    Note:
        Each chunk is computed once and kept in a cache while it can still
        be paired, so the same object may be yielded in several pairs
        (consumers should treat the chunks as read-only) and every chunk of
        the current distance window is held in memory at the same time.
    """
    chunks = list(variations.iterate_chunks(chunk_size))
    # Already computed chunks, keyed by their index in ``chunks``.
    computed_chunks = {}

    def _get_computed(index):
        # Compute the chunk only the first time it is requested.
        try:
            return computed_chunks[index]
        except KeyError:
            computed = compute({'vars': chunks[index]},
                               store_variation_to_memory=True,
                               silence_runtime_warnings=True)['vars']
            computed_chunks[index] = computed
            return computed

    num_chunks = len(chunks)
    for index1 in range(num_chunks):
        computed1 = _get_computed(index1)

        chunk1_end_pos = computed1[POS_FIELD][-1]
        chunk1_end_chrom = computed1[CHROM_FIELD][-1]

        # A chunk is always paired with itself, no distance check is needed.
        yield computed1, computed1

        for index2 in range(index1 + 1, num_chunks):
            computed2 = _get_computed(index2)
            chunk2_start_chrom = computed2[CHROM_FIELD][0]
            if chunk1_end_chrom != chunk2_start_chrom:
                break
            chunk2_start_pos = computed2[POS_FIELD][0]
            if chunk2_start_pos - chunk1_end_pos > max_distance:
                break

            yield computed1, computed2

        # The next iterations only pair chunks with an index greater than
        # index1, so this one can be released.
        computed_chunks.pop(index1, None)
Esempio n. 2
0
    def test_calc_missing(self):
        """Check missing genotype counts and rates, per variation and sample.

        The per-variation results are compared with known values for the
        'pepo' and 'upv196' samples. The per-sample calculation is only
        checked to run on the dask variations and to raise
        NotMaterializedError on the non materialized filtered variations.
        """
        # number of missing genotypes per variation
        variations = create_non_materialized_snp_filtered_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=False)
        result = compute({'num_missing_gts': task})
        self.assertTrue(
            np.array_equal(result['num_missing_gts'], [1, 1, 1, 0, 2, 2, 1]))

        # rate of missing genotypes per variation
        variations = create_non_materialized_snp_filtered_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=True)
        result = compute({'num_missing_gts': task})
        expected = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
        for a, b in zip(result['num_missing_gts'], expected):
            self.assertAlmostEqual(a, b, places=2)

        # smoke check: the per-sample rates can be computed, the values are
        # not asserted
        variations = create_dask_variations()
        task = calc_missing_gt_per_sample(variations, rates=True)
        compute({'num_missing_gts': task})

        # the per-sample calculation is expected to refuse these variations
        variations = create_non_materialized_snp_filtered_variations()
        with self.assertRaises(NotMaterializedError):
            calc_missing_gt_per_sample(variations, rates=True)

        # smoke check: the per-sample counts can be computed, the values are
        # not asserted
        variations = create_dask_variations()
        task = calc_missing_gt_per_sample(variations, rates=False)
        compute({'num_missing_gts': task})
Esempio n. 3
0
    def test_calc_obs_het2(self):
        """Check the observed heterozygosity under several thresholds.

        Two variations with four samples are used; the depth of each call is
        set so that the min/max depth thresholds for het calls change which
        calls are taken into account.
        """
        gts = np.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                        [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = np.array([[5, 12, 10, 10], [10, 10, 10, 10]])
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)
        variations[DP_FIELD] = da.from_array(dps)

        het = calc_obs_het(variations, min_num_genotypes=0)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])

        # with fewer genotypes than the minimum required the result is NaN
        het = calc_obs_het(variations, min_num_genotypes=10)
        het = compute(het)
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0
        assert np.allclose(het, [np.nan, np.nan], equal_nan=True)

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=10)
        het = compute(het)
        assert np.allclose(het, [1, 0])

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           max_call_dp_for_het_call=11)
        het = compute(het)
        assert np.allclose(het, [0, 0])

        het = calc_obs_het(variations,
                           min_num_genotypes=0,
                           min_call_dp_for_het_call=5)
        het = compute(het)
        assert np.allclose(het, [0.5, 0])
Esempio n. 4
0
    def test_kosman_2_indis(self):
        """Check the mean Kosman result for three pairs of individuals.

        For every pair a two-sample Variations object is built, the per-SNP
        array returned by _kosman is computed and its mean is compared with
        the expected distance.
        """
        def mean_kosman_distance(indi1_gts, indi2_gts):
            # Mean of the per-SNP values that _kosman gives for the pair.
            gts = np.stack((indi1_gts, indi2_gts), axis=1)
            variations = Variations()
            samples = np.array([str(i) for i in range(gts.shape[1])])
            variations.samples = da.from_array(samples)
            variations[GT_FIELD] = da.from_array(gts)

            vars1 = keep_samples(variations, ['0'])[FLT_VARS]
            vars2 = keep_samples(variations, ['1'])[FLT_VARS]
            snp_by_snp_comparison_array = _kosman(vars1, vars2)
            distance_ab = compute(snp_by_snp_comparison_array,
                                  silence_runtime_warnings=True)
            return distance_ab.sum() / distance_ab.shape[0]

        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)

        # The distances are floating point results, so they are compared with
        # a tolerance instead of with exact equality.
        assert np.isclose(mean_kosman_distance(a, b), 1 / 3)

        # identical individuals
        assert np.isclose(mean_kosman_distance(c, d), 0)

        assert np.isclose(mean_kosman_distance(b, d), 0.45)
Esempio n. 5
0
    def test_filter_by_call_rate_in_memory(self):
        """Chain two call rate filters on variations already in memory."""
        zarr_path = TEST_DATA_DIR / 'test.zarr'
        in_memory_vars = compute({'vars': load_zarr(zarr_path)},
                                 store_variation_to_memory=True)['vars']

        pipeline = {}
        first_pass = remove_low_call_rate_vars(in_memory_vars,
                                               min_call_rate=0.5)
        _add_task_to_pipeline(pipeline, first_pass)

        second_pass = remove_low_call_rate_vars(first_pass[FLT_VARS],
                                                min_call_rate=0.5,
                                                filter_id='call_rate2')
        _add_task_to_pipeline(pipeline, second_pass)

        # The pipeline is inspected directly, compute is not called on it.
        flt_stats = pipeline[FLT_STATS]
        self.assertEqual(flt_stats['call_rate'][N_KEPT], 5)
        self.assertEqual(flt_stats['call_rate'][N_FILTERED_OUT], 2)
        self.assertEqual(flt_stats['call_rate2'][N_KEPT], 5)
        self.assertEqual(flt_stats['call_rate2'][N_FILTERED_OUT], 0)

        kept_vars = pipeline[FLT_VARS]
        self.assertEqual(kept_vars[GT_FIELD].shape, (5, 3, 2))
        same_samples = kept_vars.samples == in_memory_vars.samples
        self.assertTrue(np.all(same_samples))
        self.assertEqual(kept_vars.metadata, in_memory_vars.metadata)
Esempio n. 6
0
 def test_calc_missing(self):
     """Check missing genotype counts and rates for a two-sample subset."""
     subset = keep_samples(_create_dask_variations(),
                           samples=['pepo', 'upv196'])[FLT_VARS]
     counts_task = calc_missing_gt(subset, rates=False)
     counts = compute({'num_missing_gts': counts_task})
     expected_counts = [1, 1, 1, 0, 2, 2, 1]
     self.assertTrue(
         np.array_equal(counts['num_missing_gts'], expected_counts))

     # the rates are calculated from freshly created variations
     subset = keep_samples(_create_dask_variations(),
                           samples=['pepo', 'upv196'])[FLT_VARS]
     rates_task = calc_missing_gt(subset, rates=True)
     rates = compute({'num_missing_gts': rates_task})
     expected_rates = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
     for observed, wanted in zip(rates['num_missing_gts'], expected_rates):
         self.assertAlmostEqual(observed, wanted, places=2)
Esempio n. 7
0
    def test_compute_vars_to_disk(self):
        """Store variations in a zarr directory while computing other arrays."""
        source_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        first = da.from_array(np.array([1, 2, 3, 4, 5]))
        second = da.from_array(np.array([6, 7, 8, 9, 0]))
        arrays = {'da1': first, 'da2': second, 'da3': first + second}

        with TemporaryDirectory() as tmpdir:
            out_path = Path(tmpdir)
            tasks = {
                'vars': prepare_zarr_storage(source_vars, out_path),
                'data': arrays
            }
            processed = compute(tasks)

            # what was written should match the original variations
            stored_vars = load_zarr(out_path)
            same_samples = (source_vars.samples.compute() ==
                            stored_vars.samples.compute())
            self.assertTrue(np.all(same_samples))
            same_gts = (source_vars[GT_FIELD].compute() ==
                        stored_vars[GT_FIELD].compute())
            self.assertTrue(np.all(same_gts))

            # the extra arrays are returned already computed
            computed_arrays = processed['data']
            self.assertTrue(np.all(computed_arrays['da1'] == [1, 2, 3, 4, 5]))
            self.assertTrue(
                np.all(computed_arrays['da3'] == [7, 9, 11, 13, 5]))
Esempio n. 8
0
    def test_filter_by_call_rate(self):
        """Chain two call rate filters and compute them in a single call."""
        lazy_vars = create_dask_variations()
        pipeline = {}

        first_pass = remove_low_call_rate_vars(lazy_vars, min_call_rate=0.5)
        _add_task_to_pipeline(pipeline, first_pass)

        second_pass = remove_low_call_rate_vars(first_pass[FLT_VARS],
                                                min_call_rate=0.5,
                                                filter_id='call_rate2')
        _add_task_to_pipeline(pipeline, second_pass)

        processed = compute(pipeline, store_variation_to_memory=True)

        flt_stats = processed[FLT_STATS]
        self.assertEqual(flt_stats['call_rate'][N_KEPT], 5)
        self.assertEqual(flt_stats['call_rate'][N_FILTERED_OUT], 2)
        self.assertEqual(flt_stats['call_rate2'][N_KEPT], 5)
        self.assertEqual(flt_stats['call_rate2'][N_FILTERED_OUT], 0)

        kept_vars = processed[FLT_VARS]
        self.assertEqual(kept_vars[GT_FIELD].shape, (5, 3, 2))
        same_samples = kept_vars.samples == lazy_vars.samples.compute()
        self.assertTrue(np.all(same_samples))
        self.assertEqual(kept_vars.metadata, lazy_vars.metadata)
Esempio n. 9
0
    def _test_filter_samples_by_call_rate(self, variations, do_computation):
        """Chain two sample call rate filters and check stats and output.

        When do_computation is true the pipeline is passed through compute,
        otherwise the pipeline itself is inspected.
        """
        pipeline = {}
        first_pass = remove_low_call_rate_samples(variations,
                                                  min_call_rate=0.5,
                                                  filter_id='call_rate')
        _add_task_to_pipeline(pipeline, first_pass)
        second_pass = remove_low_call_rate_samples(first_pass[FLT_VARS],
                                                   min_call_rate=0.5,
                                                   filter_id='call_rate2')
        _add_task_to_pipeline(pipeline, second_pass)

        processed = (compute(pipeline, store_variation_to_memory=True)
                     if do_computation else pipeline)

        flt_stats = processed[FLT_STATS]
        self.assertEqual(flt_stats['call_rate'][N_SAMPLES_KEPT], 2)
        self.assertEqual(flt_stats['call_rate'][N_SAMPLES_FILTERED_OUT], 1)
        self.assertEqual(flt_stats['call_rate2'][N_SAMPLES_KEPT], 2)
        self.assertEqual(flt_stats['call_rate2'][N_SAMPLES_FILTERED_OUT], 0)

        kept_vars = processed[FLT_VARS]
        self.assertEqual(kept_vars[GT_FIELD].shape, (7, 2, 2))
        self.assertTrue(np.all(kept_vars.samples == ['mu16', 'upv196']))
        self.assertEqual(kept_vars.metadata, variations.metadata)
Esempio n. 10
0
def summarize_variations(in_zarr_path,
                         out_dir_path,
                         draw_missin_rate=True,
                         draw_mac=True,
                         draw_maf=True,
                         draw_obs_het=True,
                         min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                         min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                         num_bins=DEF_NUM_BINS,
                         silence_runtime_warnings=True):
    """Write histogram plots and a text summary for a zarr variation file.

    One histogram is prepared for every enabled statistic, all of them are
    computed with a single ``compute`` call and each one is plotted to
    ``<out_dir_path>/<kind>.png``. The number of variations and samples is
    written to ``<out_dir_path>/stats.txt``.

    Args:
        in_zarr_path: path of the zarr directory to load (``str`` or
            ``pathlib.Path``).
        out_dir_path: directory for the output files (``str`` or
            ``pathlib.Path``). It is not created here.
        draw_missin_rate: plot the histogram of the called genotype rate.
        draw_mac: plot the histogram of the value given by ``calc_mac``.
        draw_maf: plot the histogram of the value given by
            ``calc_maf_by_gt``.
        draw_obs_het: plot the histogram of the observed heterozygosity.
        min_call_dp_for_het_call: passed to ``calc_obs_het``.
        min_num_genotypes: passed to the mac, maf and obs. het calculations.
        num_bins: number of bins of every histogram.
        silence_runtime_warnings: passed to ``compute``.
    """
    # Function-scope import, so nothing is assumed about the module imports.
    from pathlib import Path

    # Plain strings are accepted too: the output paths are built with ``/``
    # and the report uses the ``name`` of the input path.
    out_dir = Path(out_dir_path)
    zarr_name = Path(in_zarr_path).name

    variations = load_zarr(in_zarr_path)
    max_alleles = variations[ALT_FIELD].shape[1]
    num_variations = variations.num_variations
    num_samples = variations.num_samples

    # Histogram tasks, keyed by the kind of statistic; nothing is computed
    # until compute is called below.
    histogram_tasks = {}

    if draw_missin_rate:
        _stats = calc_called_gt(variations, rates=True)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        histogram_tasks['called'] = {'counts': counts, 'edges': edges}

    if draw_mac:
        _stats = calc_mac(variations, max_alleles, min_num_genotypes)
        counts, edges = va.histogram(_stats,
                                     n_bins=num_bins,
                                     limits=(0, num_samples))
        histogram_tasks['mac'] = {'counts': counts, 'edges': edges}

    if draw_maf:
        _stats = calc_maf_by_gt(variations, max_alleles, min_num_genotypes)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        histogram_tasks['maf'] = {'counts': counts, 'edges': edges}

    if draw_obs_het:
        _stats = calc_obs_het(
            variations,
            min_num_genotypes=min_num_genotypes,
            min_call_dp_for_het_call=min_call_dp_for_het_call)
        counts, edges = va.histogram(_stats, n_bins=num_bins, limits=(0, 1))
        histogram_tasks['obs_heterocigosity'] = {
            'counts': counts,
            'edges': edges
        }

    computed_stats = compute(histogram_tasks,
                             silence_runtime_warnings=silence_runtime_warnings)

    # The loop variable has its own name, it does not shadow the task dict.
    for kind, histogram in computed_stats.items():
        with (out_dir / f'{kind}.png').open('wb') as out_fhand:
            plot_histogram(histogram['counts'],
                           histogram['edges'],
                           out_fhand,
                           log_scale=True)

    with (out_dir / 'stats.txt').open('w') as fhand:
        fhand.write(f'STATS FOR: {zarr_name}\n')
        fhand.write('-----------' + '-' * len(zarr_name) + '\n')
        fhand.write(f'Num. variations: {num_variations}\n')
        fhand.write(f'Num. samples: {num_samples}\n')
        fhand.write('\n')
Esempio n. 11
0
 def test_mac_filter(self):
     """With max_allowable_mac=1 no variation of the test set is kept."""
     lazy_vars = create_dask_variations(num_vars_per_chunk=2)
     mac_task = filter_by_mac(lazy_vars, max_allowable_mac=1, max_alleles=3)
     computed = compute(mac_task,
                        store_variation_to_memory=True,
                        silence_runtime_warnings=True)
     self.assertEqual(computed[FLT_VARS].num_variations, 0)
     expected_stats = {'n_kept': 0, 'n_filtered_out': 7}
     self.assertEqual(computed[FLT_STATS], expected_stats)
Esempio n. 12
0
    def test_remove_samples(self):
        """Remove two samples and check that only the 'mu16' data remains."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        to_remove = ['upv196', 'pepo']

        removal_task = remove_samples(lazy_vars, samples=to_remove)
        computed = compute(removal_task, store_variation_to_memory=True)

        remaining_vars = computed[FLT_VARS]
        depths = remaining_vars[DP_FIELD]
        self.assertTrue(np.all(['mu16'] == remaining_vars.samples))
        expected_depths = [[10], [9], [9], [-1], [-1], [9], [10]]
        self.assertTrue(np.all(depths == expected_depths))
Esempio n. 13
0
    def test_filter_obs_het(self):
        """Filter four variations by observed heterozygosity thresholds."""
        gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                        [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
        variations = Variations()
        variations.samples = da.from_array([1, 2, 3, 4, 5])
        variations[GT_FIELD] = da.from_array(gts)

        # without het thresholds every variation is kept
        het_task = filter_by_obs_heterocigosis(variations,
                                               min_num_genotypes=0)
        computed = compute(het_task, store_variation_to_memory=True)
        assert np.all(computed[FLT_VARS][GT_FIELD] == gts)
        assert computed[FLT_STATS][N_KEPT] == 4
        assert computed[FLT_STATS][N_FILTERED_OUT] == 0

        # minimum het
        het_task = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               min_num_genotypes=0)
        computed = compute(het_task, store_variation_to_memory=True)
        assert np.all(computed[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
        assert computed[FLT_STATS][N_KEPT] == 3
        assert computed[FLT_STATS][N_FILTERED_OUT] == 1

        # more genotypes required than the five available samples
        het_task = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               min_num_genotypes=10)
        computed = compute(het_task,
                           store_variation_to_memory=True,
                           silence_runtime_warnings=True)
        assert computed[FLT_STATS][N_KEPT] == 0
        assert computed[FLT_STATS][N_FILTERED_OUT] == 4

        # maximum het
        het_task = filter_by_obs_heterocigosis(variations,
                                               max_allowable_het=0.1,
                                               min_num_genotypes=0)
        computed = compute(het_task, store_variation_to_memory=True)
        assert np.all(computed[FLT_VARS][GT_FIELD] == gts[[1]])

        # minimum and maximum het at the same time
        het_task = filter_by_obs_heterocigosis(variations,
                                               min_allowable_het=0.2,
                                               max_allowable_het=0.3,
                                               min_num_genotypes=0)
        computed = compute(het_task, store_variation_to_memory=True)
        assert np.all(computed[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
Esempio n. 14
0
 def test_expected_het_with_real(self):
     """Check the expected heterozygosity obtained from the test zarr."""
     lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
     # one more than the number of columns of the ALT field
     num_alleles = lazy_vars[ALT_FIELD].shape[1] + 1
     exp_het_task = calc_expected_het(lazy_vars,
                                      max_alleles=num_alleles,
                                      min_num_genotypes=0)
     exp_het = compute(exp_het_task, silence_runtime_warnings=True)
     expected = [0.5, 0.5, 0.5, 0.375, np.nan, 0.5, 0.375]
     np.testing.assert_allclose(exp_het, expected, equal_nan=True)
Esempio n. 15
0
 def test_mac_filter_in_memory(self):
     """Run the MAC filter on variations that are already in memory."""
     lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr',
                           num_vars_per_chunk=2)
     in_memory_vars = compute({'vars': lazy_vars},
                              store_variation_to_memory=True)['vars']

     # the result of the filter is used directly, compute is not called
     flt_result = filter_by_mac(in_memory_vars,
                                max_allowable_mac=1,
                                max_alleles=3)
     self.assertEqual(flt_result[FLT_VARS].num_variations, 0)
     expected_stats = {'n_kept': 0, 'n_filtered_out': 7}
     self.assertEqual(flt_result[FLT_STATS], expected_stats)
Esempio n. 16
0
def zarr_to_vcf(zarr_path,
                out_fhand,
                vcf_format=VCF_FORMAT,
                chunk_size=DEF_CHUNK_SIZE):
    """Write the variations stored in a zarr directory to out_fhand as VCF.

    The meta lines and the header are written first; the variations are
    then computed and written chunk by chunk.
    """
    variations = load_zarr(zarr_path)
    _write_vcf_meta(variations, out_fhand, vcf_format)
    _write_vcf_header(variations, out_fhand)

    lazy_chunks = variations.iterate_chunks(chunk_size=chunk_size)
    for lazy_chunk in lazy_chunks:
        computed = compute({'vars': lazy_chunk},
                           store_variation_to_memory=True)
        _write_snvs(computed['vars'], out_fhand)
Esempio n. 17
0
    def test_calc_mac2(self):
        """Check calc_mac on haploid genotypes, missing data included."""
        gts = np.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                        [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations = Variations(samples=da.array(samples))
        variations[GT_FIELD] = da.from_array(gts)

        # every genotype of the last variation is missing, so a NaN is
        # expected for it
        result = calc_mac(variations, max_alleles=3, min_num_genotypes=1)
        macs = compute(result)
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0
        assert np.allclose(macs, np.array([4, 2, 3, np.nan]), equal_nan=True)
Esempio n. 18
0
 def test_remove_variations_in_regions(self):
     """Remove the variations located in the given regions."""
     fake_vars, regions = self._create_fake_variations_and_regions()
     removal_task = remove_variations_in_regions(fake_vars, regions)
     computed = compute(removal_task, store_variation_to_memory=True)

     kept_vars = computed[FLT_VARS]
     chroms = kept_vars[CHROM_FIELD]
     positions = kept_vars[POS_FIELD]
     self.assertTrue(np.all(positions == [1, 2, 3, 6, 7, 8, 9, 0]))
     expected_chroms = [
         'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'
     ]
     self.assertTrue(np.all(chroms == expected_chroms))
Esempio n. 19
0
    def test_maf_by_allele_count_filter(self):
        """Filter the dask test variations by MAF from allele counts."""
        lazy_vars = create_dask_variations()

        maf_task = filter_by_maf_by_allele_count(lazy_vars,
                                                 max_allowable_maf=0.6,
                                                 min_num_genotypes=2)
        computed = compute(maf_task,
                           store_variation_to_memory=True,
                           silence_runtime_warnings=True)

        self.assertEqual(computed[FLT_VARS].num_variations, 4)
        expected_stats = {'n_kept': 4, 'n_filtered_out': 3}
        self.assertEqual(computed[FLT_STATS], expected_stats)
Esempio n. 20
0
    def test_remove_samples_in_memory(self):
        """Remove two samples from variations that are already in memory."""
        in_memory_vars = compute({'vars': create_dask_variations()},
                                 store_variation_to_memory=True)['vars']
        to_remove = ['upv196', 'pepo']

        # the result of the filter is used directly, compute is not called
        flt_result = remove_samples(in_memory_vars, samples=to_remove)

        remaining_vars = flt_result[FLT_VARS]
        depths = remaining_vars[DP_FIELD]
        self.assertTrue(np.all(['mu16'] == remaining_vars.samples))
        expected_depths = [[10], [9], [9], [-1], [-1], [9], [10]]
        self.assertTrue(np.all(depths == expected_depths))
Esempio n. 21
0
 def test_maf_filter(self):
     """Filter the test zarr variations by MAF."""
     lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
     maf_task = filter_by_maf(lazy_vars,
                              max_allowable_maf=0.6,
                              max_alleles=3,
                              min_num_genotypes=2)
     computed = compute(maf_task,
                        store_variation_to_memory=True,
                        silence_runtime_warnings=True)

     self.assertEqual(computed[FLT_VARS].num_variations, 3)
     expected_stats = {'n_kept': 3, 'n_filtered_out': 4}
     self.assertEqual(computed[FLT_STATS], expected_stats)
Esempio n. 22
0
 def test_maf_filter_in_memory(self):
     """Run the MAF filter on variations that are already in memory."""
     in_memory_vars = compute({'vars': create_dask_variations()},
                              store_variation_to_memory=True)['vars']

     # the result of the filter is used directly, compute is not called
     flt_result = filter_by_maf(in_memory_vars,
                                max_allowable_maf=0.6,
                                max_alleles=3,
                                min_num_genotypes=2)

     self.assertEqual(flt_result[FLT_VARS].num_variations, 3)
     expected_stats = {'n_kept': 3, 'n_filtered_out': 4}
     self.assertEqual(flt_result[FLT_STATS], expected_stats)
Esempio n. 23
0
    def test_filter_and_hist_by_call_rate(self):
        """Check the histogram stats produced by the call rate filter."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        pipeline = {}

        flt_task = remove_low_call_rate_vars(lazy_vars,
                                             min_call_rate=0.5,
                                             calc_histogram=True)
        _add_task_to_pipeline(pipeline, flt_task)
        processed = compute(pipeline, store_variation_to_memory=True)

        call_rate_stats = processed[FLT_STATS]['call_rate']
        self.assertEqual(len(call_rate_stats[COUNT]), DEF_NUM_BINS)
        # there is one more edge than bins
        self.assertEqual(len(call_rate_stats[BIN_EDGES]), DEF_NUM_BINS + 1)
        self.assertEqual(call_rate_stats['limits'], [0.5])
Esempio n. 24
0
    def test_calc_diversities(self):
        """Check the diversity summary computed from the test zarr."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        # one more than the number of columns of the ALT field
        num_alleles = lazy_vars[ALT_FIELD].shape[1] + 1

        diversity_task = calc_diversities(lazy_vars,
                                          max_alleles=num_alleles,
                                          min_call_dp_for_het_call=0,
                                          min_num_genotypes=0,
                                          polymorphic_threshold=0.5)
        diversities = compute(diversity_task, silence_runtime_warnings=True)

        self.assertAlmostEqual(diversities['num_variable_vars'], 6)
        self.assertAlmostEqual(diversities['num_polymorphic_vars'],
                               4,
                               places=2)
        self.assertAlmostEqual(diversities['exp_het'], 0.458, places=2)
        self.assertAlmostEqual(diversities['obs_het'], 0.333, places=2)
Esempio n. 25
0
 def test_ld_random_pairs_from_different_chroms_in_memory(self):
     """Ask for 100 LD values from random pairs using in-memory data."""
     zarr_path = TEST_DATA_DIR / 'tomato.apeki_gbs.calmd.zarr'
     lazy_vars = load_zarr(zarr_path, num_vars_per_chunk=200)
     num_alleles = lazy_vars[ALT_FIELD].shape[1]
     maf_filtered = filter_by_maf(lazy_vars,
                                  max_alleles=num_alleles,
                                  max_allowable_maf=0.98)[FLT_VARS]
     in_memory_vars = compute({'vars': maf_filtered},
                              store_variation_to_memory=True,
                              silence_runtime_warnings=True)['vars']

     ld_values = calc_ld_random_pairs_from_different_chroms(
         in_memory_vars,
         100,
         max_maf=0.98,
         silence_runtime_warnings=True)
     # the result is consumed completely to count the values
     self.assertEqual(len(list(ld_values)), 100)
Esempio n. 26
0
    def test_calc_maf_by_gt2(self):
        """Check the histogram of the MAFs calculated from the genotypes."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        maf_task = calc_maf_by_gt(lazy_vars,
                                  max_alleles=3,
                                  min_num_genotypes=0)

        counts, edges = va.histogram(maf_task, n_bins=5, limits=(0, 1))
        histogram_tasks = {'counts': counts, 'edges': edges}
        histogram = compute(histogram_tasks, silence_runtime_warnings=True)

        self.assertTrue(np.all(histogram['counts'] == [0, 0, 4, 2, 0]))
        expected_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
        edges_match = np.isclose(histogram['edges'], expected_edges)
        self.assertTrue(np.all(edges_match))
Esempio n. 27
0
 def test_keep_variations_in_regions_in_memory(self):
     """Keep the variations located in the regions, using in-memory data."""
     fake_vars, regions = self._create_fake_variations_and_regions()
     in_memory_vars = compute({'vars': fake_vars},
                              store_variation_to_memory=True)['vars']

     # the result of the filter is used directly, compute is not called
     flt_result = keep_variations_in_regions(in_memory_vars, regions)

     kept_vars = flt_result[FLT_VARS]
     chroms = kept_vars[CHROM_FIELD]
     positions = kept_vars[POS_FIELD]
     expected_positions = [4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
     self.assertTrue(np.all(positions == expected_positions))
     expected_chroms = [
         'chr1', 'chr1', 'chr2', 'chr2', 'chr2', 'chr2', 'chr2', 'chr2',
         'chr2', 'chr2', 'chr2', 'chr2'
     ]
     self.assertTrue(np.all(chroms == expected_chroms))
Esempio n. 28
0
    def test_non_variable_filter(self):
        """Keep only the variable variations of a four-variation set."""
        gts = np.array([[[0, 0], [0, 0]], [[0, 2], [1, -1]], [[0, 0], [1, 1]],
                        [[-1, -1], [-1, -1]]])
        two_sample_vars = Variations(samples=da.array(['aa', 'bb']))
        two_sample_vars[GT_FIELD] = da.from_array(gts)

        variable_task = keep_variable_variations(two_sample_vars,
                                                 max_alleles=3)
        computed = compute(variable_task, store_variation_to_memory=True)

        self.assertEqual(computed[FLT_VARS].num_variations, 2)
        expected_stats = {'n_kept': 2, 'n_filtered_out': 2}
        self.assertEqual(computed[FLT_STATS], expected_stats)
Esempio n. 29
0
    def test_maf_by_allele_count_filter_in_memory(self):
        """Run the allele count MAF filter on variations already in memory."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        in_memory_vars = compute({'vars': lazy_vars},
                                 store_variation_to_memory=True)['vars']

        # the result of the filter is used directly, compute is not called
        flt_result = filter_by_maf_by_allele_count(in_memory_vars,
                                                   max_allowable_maf=0.6,
                                                   min_num_genotypes=2)

        self.assertEqual(flt_result[FLT_VARS].num_variations, 4)
        expected_stats = {'n_kept': 4, 'n_filtered_out': 3}
        self.assertEqual(flt_result[FLT_STATS], expected_stats)
Esempio n. 30
0
    def test_keep_samples_in_memory(self):
        """Keep two samples of variations that are already in memory."""
        lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
        in_memory_vars = compute({'vars': lazy_vars},
                                 store_variation_to_memory=True)['vars']
        to_keep = ['upv196', 'pepo']

        # the result of the filter is used directly, compute is not called
        flt_result = keep_samples(in_memory_vars, samples=to_keep)

        kept_vars = flt_result[FLT_VARS]
        depths = kept_vars[DP_FIELD]
        # the samples come back as pepo, upv196, not in the requested order
        self.assertTrue(np.all(kept_vars.samples == ['pepo', 'upv196']))
        expected_depths = [[-1, 9], [-1, 8], [-1, 8], [14, 6], [-1, -1],
                           [-1, -1], [-1, 6]]
        self.assertTrue(np.all(depths == expected_depths))