Example #1
    def test_count_alleles(self):

        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        chunk = first(hdf5.iterate_chunks())
        genotypes = chunk['/calls/GT']
        expected = [[3, 3, 0],
                    [5, 1, 0],
                    [0, 2, 4],
                    [6, 0, 0],
                    [2, 3, 1]]
        counts = counts_by_row(genotypes, missing_value=-1)
        assert numpy.all(expected == counts)
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunks = hdf5.iterate_chunks(kept_fields=['/calls/GT'])
        chunks = (chunk['/calls/GT'] for chunk in chunks)
        matrix = first(chunks)
        for _ in range(20):
            extend_matrix(matrix, chunks)

        counts = counts_by_row(matrix, missing_value=-1)

        gts = [[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]]
        gts = numpy.array(gts)
        counts = counts_by_row(gts, missing_value=-1)
        assert numpy.all(counts == [[6]])

        gts = [[[0, 0], [0, 0], [0, 0]],
               [[0, 0], [0, 0], [0, 0]]]
        gts = numpy.array(gts)
        counts = counts_by_row(gts, missing_value=-1)
        assert numpy.all(counts == [[6, 6]])
Example #2
    def test_count_alleles(self):

        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        chunk = first(hdf5.iterate_chunks())
        genotypes = chunk['/calls/GT']
        expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
        counts = counts_by_row(genotypes, missing_value=-1)
        assert numpy.all(expected == counts)
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunks = hdf5.iterate_chunks(kept_fields=['/calls/GT'])
        chunks = (chunk['/calls/GT'] for chunk in chunks)
        matrix = first(chunks)
        for _ in range(20):
            extend_matrix(matrix, chunks)

        counts = counts_by_row(matrix, missing_value=-1)

        gts = [[[-1, -1], [-1, -1], [-1, -1], [0, 0], [0, 0], [0, 0]]]
        gts = numpy.array(gts)
        counts = counts_by_row(gts, missing_value=-1)
        assert numpy.all(counts == [[6]])

        gts = [[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [0, 0]]]
        gts = numpy.array(gts)
        counts = counts_by_row(gts, missing_value=-1)
        assert numpy.all(counts == [[6, 6]])
Example #3
 def num_variations(self):
     try:
         one_path = first(self.keys())
     except ValueError:
         return 0
     one_mat = self[one_path]
     return one_mat.shape[0]
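Note: every example here passes an iterator, generator, or dict view to a `first` helper, and a few of them guard the call with `except ValueError`. The helper's actual implementation is not shown on this page; the snippet below is only a minimal sketch of the behavior the examples appear to assume (return the first item, raise ValueError when the iterable is empty), not the library's real code.

def first(iterable):
    # Return the first item of any iterable; the examples rely on a
    # ValueError (rather than StopIteration) being raised when it is empty.
    try:
        return next(iter(iterable))
    except StopIteration:
        raise ValueError('The iterable is empty') from None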
Example #4
 def test_count_alleles_by_freq(self):
     h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
     # flt = SampleFilter(['V51'])
     # v51 = flt(h5)[FLT_VARS]
     chunk = first(h5.iterate_chunks())
     freqs_by_snp = calc_allele_freq_by_depth(chunk)
     assert numpy.all(freqs_by_snp[0] == [0, 1, 0, 0])
Example #5
    def test_filter_biallelic(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = [GT_FIELD]
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)

        flt_chunk = NonBiallelicFilter(report_selection=True)(chunk)
        kept = flt_chunk[FLT_VARS][GT_FIELD].shape[0]
        assert flt_chunk[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert flt_chunk[FLT_STATS][N_KEPT] == kept
        assert flt_chunk[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert flt_chunk[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - kept
        assert flt_chunk[SELECTED_VARS].shape

        flt_chunk = NonBiallelicFilter(report_selection=True,
                                       reverse=True)(chunk)
        kept = flt_chunk[FLT_VARS][GT_FIELD].shape[0]
        assert flt_chunk[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert flt_chunk[FLT_STATS][N_KEPT] == kept
        assert flt_chunk[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert flt_chunk[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - kept
        assert flt_chunk[SELECTED_VARS].shape

        variations = VariationsArrays()
        gts = numpy.array([
            [[0, 0], [1, 1], [0, 1]],
            [[0, 0], [0, 0], [0, 0]],
            [[0, 0], [1, 1], [2, 2]],
        ])
        variations[GT_FIELD] = gts
        flt_vars = NonBiallelicFilter()(variations)[FLT_VARS]
        expected = numpy.array([[[0, 0], [1, 1], [0, 1]]])
        assert numpy.all(flt_vars[GT_FIELD] == expected)
Example #6
    def test_filter_samples_by_missing(self):
        variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunk = first(variations.iterate_chunks())

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk, min_called_rate=0.9,
                                       out_vars=new_var)
        assert len(new_var.samples) == 0

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk, min_called_rate=0.1,
                                       out_vars=new_var)
        assert len(new_var.samples) == len(chunk.samples)

        # check that it works by chunk
        new_var = VariationsArrays()
        res = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                             out_vars=new_var,
                                             do_histogram=True)
        new_var2 = VariationsArrays()
        res2 = filter_samples_by_missing_rate(variations, min_called_rate=0.2,
                                              out_vars=new_var2,
                                              chunk_size=None,
                                              do_histogram=True)

        assert res2['missing_rates'].shape[0] == len(variations.samples)
        assert res2['selected_samples'].shape[0] == len(variations.samples)
        assert new_var.samples == new_var2.samples
        assert numpy.all(new_var[GT_FIELD] == new_var2[GT_FIELD])
        assert numpy.allclose(res[EDGES], res2[EDGES])
        assert numpy.all(res[COUNTS][:] == res2[COUNTS][:])
Example #7
    def test_gst(self):
        h5 = VariationsH5(join(TEST_DATA_DIR, 'limon.h5'), mode='r')
        # flt = SampleFilter(['V51'])
        # v51 = flt(h5)[FLT_VARS]
        chunk = first(h5.iterate_chunks())

        dists = calc_gst_per_loci(chunk, populations=[['V51'], ['F49']])
        assert dists[0] == 0
Example #8
 def _create_or_get_mats_from_chunk(self, variations):
     field_paths = variations.keys()
     if first(field_paths) in self:
         matrices = self._get_mats_for_chunk(variations)
     else:
         if self.keys():
             raise ValueError('There are previous matrices, but none match')
         matrices = self._create_mats_from_chunks(variations)
         self._set_metadata(variations.metadata)
         self._set_samples(variations.samples)
     return matrices
Example #9
    def test_set_gt_to_missing_by_dp(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = ['/calls/DP', GT_FIELD]
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)
        set_low_dp_gts_to_missing = LowDPGTsToMissingSetter(min_dp=300)
        res = set_low_dp_gts_to_missing(chunk)
        assert numpy.all(chunk[GT_FIELD][0][147] == [-1, -1])
        assert COUNTS in res

        set_low_dp_gts_to_missing(chunk)
        assert numpy.all(chunk[GT_FIELD].shape[0] == SNPS_PER_CHUNK)
Example #10
    def test_filter_biallelic(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = [GT_FIELD]
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)

        flt_chunk = NonBiallelicFilter(report_selection=True)(chunk)
        kept = flt_chunk[FLT_VARS][GT_FIELD].shape[0]
        assert flt_chunk[FLT_VARS][GT_FIELD].shape[1:] == (153, 2)
        assert flt_chunk[FLT_STATS][N_KEPT] == kept
        assert flt_chunk[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert flt_chunk[FLT_STATS][N_FILTERED_OUT] == SNPS_PER_CHUNK - kept
        assert flt_chunk[SELECTED_VARS].shape
Example #11
    def test_filter_samples_by_missing(self):
        variations = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunk = first(variations.iterate_chunks())

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.9,
                                       out_vars=new_var)
        assert len(new_var.samples) == 0

        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.1,
                                       out_vars=new_var)
        assert len(new_var.samples) == len(chunk.samples)

        # for some samples
        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk,
                                       min_called_rate=0.1,
                                       out_vars=new_var,
                                       samples=['1_18_4_gbs', '1_19_4_gbs'])
        assert new_var.samples == ['1_18_4_gbs', '1_19_4_gbs']

        # for het
        new_var = VariationsArrays()
        filter_samples_by_missing_rate(chunk, max_het=0.001, out_vars=new_var)
        assert new_var.samples == [
            '1_35_2_gbs', '4_136B_2_gbs', '4_5_5_gbs', '5_66B_3_gbs'
        ]

        # check that it works by chunk
        new_var = VariationsArrays()
        res = filter_samples_by_missing_rate(variations,
                                             min_called_rate=0.2,
                                             out_vars=new_var,
                                             do_histogram=True)
        new_var2 = VariationsArrays()
        res2 = filter_samples_by_missing_rate(variations,
                                              min_called_rate=0.2,
                                              out_vars=new_var2,
                                              chunk_size=None,
                                              do_histogram=True)

        assert res2['missing_rates'].shape[0] == len(variations.samples)
        assert res2['selected_samples'].shape[0] == len(variations.samples)
        assert new_var.samples == new_var2.samples
        assert numpy.all(new_var[GT_FIELD] == new_var2[GT_FIELD])
        assert numpy.allclose(res[EDGES], res2[EDGES])
        assert numpy.all(res[COUNTS][:] == res2[COUNTS][:])
Example #12
def _extend_array_with_iter(array, matrices):
    try:
        matrix = first(matrices)
    except ValueError:
        return

    matrices = chain([matrix], matrices)

    matrix_size = sys.getsizeof(matrix)
    mats_in_group = math.floor(AVAILABLE_MEM / matrix_size)
    if not mats_in_group:
        mats_in_group = 1
    for mats_in_mem in group_items(matrices, mats_in_group):
        _extend_array(array, mats_in_mem)
Example #13
    def test_filter_quality_snps(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1]], [[0, 1], [1, 1]],
                           [[0, 0], [0, 0]], [[0, 0], [0, 0]],
                           [[0, 1], [0, 0]]])
        snp_quals = numpy.array([5, 10, 15, 5, 20])
        variations[GT_FIELD] = gts
        variations['/variations/qual'] = snp_quals

        filtered = SNPQualFilter(report_selection=True)(variations)
        filtered_qual = filtered[FLT_VARS]['/variations/qual']
        filtered_gts = filtered[FLT_VARS][GT_FIELD]
        assert numpy.all(variations['/variations/qual'] == filtered_qual)
        assert numpy.all(variations[GT_FIELD] == filtered_gts)
        assert filtered[SELECTED_VARS].shape

        expected_gts = numpy.array([[[0, 0], [0, 0]],
                                    [[0, 1], [0, 0]]])
        exp_snp_quals = numpy.array([15, 20])
        filtered = SNPQualFilter(min_qual=15)(variations)
        assert numpy.all(filtered[FLT_VARS]['/variations/qual'] ==
                         exp_snp_quals)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected_gts)
        assert filtered[FLT_STATS][N_KEPT] == 2
        assert filtered[FLT_STATS][TOT] == 5
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = ['/variations/qual']
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)
        flt_chunk = SNPQualFilter(min_qual=530)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 126

        flt_chunk = SNPQualFilter()(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == SNPS_PER_CHUNK

        flt_chunk = SNPQualFilter(max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 92

        flt_chunk = SNPQualFilter(min_qual=530, max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 18

        flt_chunk = SNPQualFilter(min_qual=586325202)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 0

        flt_chunk = SNPQualFilter(max_qual=-1)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 0
Example #14
    def test_filter_quality_snps(self):
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [1, 1]], [[0, 1], [1, 1]],
                           [[0, 0], [0, 0]], [[0, 0], [0, 0]],
                           [[0, 1], [0, 0]]])
        snp_quals = numpy.array([5, 10, 15, 5, 20])
        variations[GT_FIELD] = gts
        variations['/variations/qual'] = snp_quals

        filtered = SNPQualFilter(report_selection=True)(variations)
        filtered_qual = filtered[FLT_VARS]['/variations/qual']
        filtered_gts = filtered[FLT_VARS][GT_FIELD]
        assert numpy.all(variations['/variations/qual'] == filtered_qual)
        assert numpy.all(variations[GT_FIELD] == filtered_gts)
        assert filtered[SELECTED_VARS].shape

        expected_gts = numpy.array([[[0, 0], [0, 0]], [[0, 1], [0, 0]]])
        exp_snp_quals = numpy.array([15, 20])
        filtered = SNPQualFilter(min_qual=15)(variations)
        assert numpy.all(
            filtered[FLT_VARS]['/variations/qual'] == exp_snp_quals)
        assert numpy.all(filtered[FLT_VARS][GT_FIELD] == expected_gts)
        assert filtered[FLT_STATS][N_KEPT] == 2
        assert filtered[FLT_STATS][TOT] == 5
        assert filtered[FLT_STATS][N_FILTERED_OUT] == 3

        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        kept_fields = ['/variations/qual']
        snps = hdf5.iterate_chunks(kept_fields=kept_fields)
        chunk = first(snps)
        flt_chunk = SNPQualFilter(min_qual=530)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 126

        flt_chunk = SNPQualFilter()(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == SNPS_PER_CHUNK

        flt_chunk = SNPQualFilter(max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 92

        flt_chunk = SNPQualFilter(min_qual=530, max_qual=1000)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] > 18

        flt_chunk = SNPQualFilter(min_qual=math.inf)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 20

        flt_chunk = SNPQualFilter(max_qual=-1)(chunk)[FLT_VARS]
        assert first(flt_chunk.values()).shape[0] == 0
Example #15
    def test_count_value_per_row(self):
        mat = numpy.array([[0, 0], [1, -1], [2, -1], [-1, -1]])
        missing_counter = row_value_counter_fact(value=-1)
        assert numpy.all(missing_counter(mat) == [0, 1, 1, 2])

        missing_counter = row_value_counter_fact(value=-1, ratio=True)
        assert numpy.allclose(missing_counter(mat), [0., 0.5, 0.5, 1.])

        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        chunks = list(hdf5.iterate_chunks())
        gt_chunk = first(select_dset_from_chunks(chunks, '/calls/GT'))

        homo_counter = row_value_counter_fact(value=2)
        assert numpy.all(homo_counter(gt_chunk) == [0, 0, 4, 0, 1])

        missing_counter = row_value_counter_fact(value=2, ratio=True)
        expected = [0., 0, 0.66666, 0., 0.166666]
        assert numpy.allclose(missing_counter(gt_chunk), expected)
        hdf5.close()
Example #16
def _filter_samples_by_index(variations,
                             sample_cols,
                             filtered_vars=None,
                             reverse=False):
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.dtype(bool))
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if 'shape' not in dir(sample_cols):
        # numpy.bool was removed from NumPy; the builtin bool is the intended dtype
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            flt_data = matrix[:, sample_cols]
            # flt_data = numpy.compress(sample_cols, , axis=1)
            filtered_vars[path] = flt_data
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    kept_samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    filtered_vars.samples = kept_samples
    return filtered_vars
Example #17
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if 'shape' not in dir(sample_cols):
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            flt_data = matrix[:, sample_cols]
            # flt_data = numpy.compress(sample_cols, , axis=1)
            filtered_vars[path] = flt_data
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    kept_samples = [samples[idx] for idx, keep in enumerate(sample_cols)
                    if keep]
    filtered_vars.samples = kept_samples
    return filtered_vars
Example #18
    def test_filter_mafs(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunk = first(hdf5.iterate_chunks())
        filtered = MafFilter(min_maf=0.6,
                             min_num_genotypes=0,
                             report_selection=True)(chunk)
        tot = filtered[FLT_STATS][N_KEPT] + filtered[FLT_STATS][N_FILTERED_OUT]
        assert tot == SNPS_PER_CHUNK
        assert filtered[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert filtered[SELECTED_VARS].shape

        flt_chunk = filtered[FLT_VARS]

        path = first(chunk.keys())
        assert flt_chunk[path].shape[0]

        filtered = MafFilter()(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == SNPS_PER_CHUNK

        filtered = MafFilter(max_maf=0.6)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] > 18

        filtered = MafFilter(min_maf=0.6, max_maf=0.9,
                             min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        assert flt_chunk[path].shape[0] > 125

        filtered = MafFilter(min_maf=1.1, min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == 0

        filtered = MafFilter(max_maf=0, min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == 0
Example #19
    def test_filter_mafs(self):
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        chunk = first(hdf5.iterate_chunks())
        filtered = MafFilter(min_maf=0.6, min_num_genotypes=0,
                             report_selection=True)(chunk)
        tot = filtered[FLT_STATS][N_KEPT] + filtered[FLT_STATS][N_FILTERED_OUT]
        assert tot == SNPS_PER_CHUNK
        assert filtered[FLT_STATS][TOT] == SNPS_PER_CHUNK
        assert filtered[SELECTED_VARS].shape

        flt_chunk = filtered[FLT_VARS]

        path = first(chunk.keys())
        assert flt_chunk[path].shape[0]

        filtered = MafFilter()(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == SNPS_PER_CHUNK

        filtered = MafFilter(max_maf=0.6)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] > 18

        filtered = MafFilter(min_maf=0.6, max_maf=0.9,
                             min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        assert flt_chunk[path].shape[0] > 125

        filtered = MafFilter(min_maf=1.1, min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == 0

        filtered = MafFilter(max_maf=0, min_num_genotypes=0)(chunk)
        flt_chunk = filtered[FLT_VARS]
        path = first(chunk.keys())
        assert flt_chunk[path].shape[0] == 0