def test_biallelic(self):
        # A NonBiallelicFilter run through a Pipeline must write the same
        # '/calls/GT' data as the very same filter called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        biallelic_filter = NonBiallelicFilter()
        pipe.append(biallelic_filter)

        piped_vars = VariationsArrays()
        pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = biallelic_filter(in_vars)
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
    def test_low_dp_gt(self):
        # LowDPGTsToMissingSetter(min_dp=5) inside a Pipeline must produce
        # the same '/calls/GT' data as the setter called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        low_dp_setter = LowDPGTsToMissingSetter(min_dp=5)
        pipe.append(low_dp_setter)

        piped_vars = VariationsArrays()
        pipe.run(in_vars, piped_vars)

        # reference: the same object applied without any pipeline
        direct = low_dp_setter(in_vars)
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
    def test_filter_samples(self):
        # A SampleFilter built from the first 20 entries of the input's
        # ``samples`` must give the same '/calls/GT' data through a Pipeline
        # as when it is called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        kept_samples = in_vars.samples[:20]
        sample_filter = SampleFilter(kept_samples)
        pipe.append(sample_filter)

        piped_vars = VariationsArrays()
        pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = sample_filter(in_vars)
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
    def test_no_filtering(self):
        # With do_filtering=False and None given as the output argument of
        # run(), the 'counts' and 'edges' reported by the pipeline must match
        # those returned by calling the MafFilter directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        maf_filter = MafFilter(min_maf=0.1, max_maf=0.9, do_histogram=True,
                               do_filtering=False)
        pipe.append(maf_filter, id_='filter1')

        # no output container is handed over in this test
        piped = pipe.run(in_vars, None)

        # reference: apply the filter on its own, without any pipeline
        direct = maf_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])
    def test_min_mac(self):
        # MacFilter(min_mac=10, max_mac=30) must report the same 'counts' and
        # 'edges' through a Pipeline as when called directly; in both cases
        # the test expects the resulting '/calls/GT' to have zero rows.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        mac_filter = MacFilter(min_mac=10, max_mac=30, do_histogram=True)
        pipe.append(mac_filter, id_='filter1')

        piped_vars = VariationsArrays()
        piped = pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = mac_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])

        n_piped_rows = piped_vars['/calls/GT'].shape[0]
        assert n_piped_rows == 0
        n_direct_rows = direct[FLT_VARS]['/calls/GT'].shape[0]
        assert n_direct_rows == 0
    def test_field_filter(self):
        # FieldFilter(kept_fields=[GT_FIELD]) must leave GT_FIELD as the only
        # key, with equal '/calls/GT' data, whether it is run through a
        # Pipeline or called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        field_filter = FieldFilter(kept_fields=[GT_FIELD])
        pipe.append(field_filter)

        piped_vars = VariationsArrays()
        pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = field_filter(in_vars)
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)

        piped_keys = list(piped_vars.keys())
        assert piped_keys == [GT_FIELD]
        direct_keys = list(direct[FLT_VARS].keys())
        assert direct_keys == [GT_FIELD]
    def test_snp_qual(self):
        # SNPQualFilter(min_qual=100, max_qual=50000) must give the same
        # 'counts', 'edges' and '/calls/GT' data through a Pipeline as when
        # it is called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        qual_filter = SNPQualFilter(min_qual=100, max_qual=50000,
                                    do_histogram=True)
        # No id_ is given here; the results are read back under '0' below
        # (presumably the id the pipeline assigns by default -- confirm).
        pipe.append(qual_filter)

        piped_vars = VariationsArrays()
        piped = pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = qual_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['0'][key], direct[key])

        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
Exemple #8
0
    def test_snp_qual(self):
        # Pipeline vs. direct call of SNPQualFilter(min_qual=100,
        # max_qual=50000): 'counts', 'edges' and '/calls/GT' must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        snp_qual_flt = SNPQualFilter(min_qual=100, max_qual=50000,
                                     do_histogram=True)
        # append() gets no id_; the pipeline results are looked up under '0'
        # further down (presumably the default id -- confirm).
        pipe.append(snp_qual_flt)

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = snp_qual_flt(source_vars)
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['0'][name], via_call[name])

        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #9
0
    def test_min_maf(self):
        # MafFilter(min_maf=0.1, max_maf=0.9) registered as 'filter1' must
        # give the same 'counts', 'edges' and '/calls/GT' data through a
        # Pipeline as when it is called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        maf_filter = MafFilter(min_maf=0.1, max_maf=0.9, do_histogram=True)
        pipe.append(maf_filter, id_='filter1')

        piped_vars = VariationsArrays()
        piped = pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = maf_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])

        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
Exemple #10
0
    def test_het(self):
        # ObsHetFilter(min_het=0.02, max_het=0.5) restricted to the first 20
        # entries of the input's ``samples`` must give the same 'counts',
        # 'edges' and '/calls/GT' data through a Pipeline as when called
        # directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        first_samples = in_vars.samples[:20]
        het_filter = ObsHetFilter(min_het=0.02, max_het=0.5,
                                  samples=first_samples, do_histogram=True)
        # No id_ is given here; the results are read back under '0' below
        # (presumably the id the pipeline assigns by default -- confirm).
        pipe.append(het_filter)

        piped_vars = VariationsArrays()
        piped = pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = het_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['0'][key], direct[key])

        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
Exemple #11
0
    def test_filter_chi2_gt_sample_sets(self):
        # Chi2GtFreqs2SampleSetsFilter (min_pval=0.05) built from two sample
        # sets -- the first 20 entries of ``samples`` and the remainder --
        # must give the same 'counts', 'edges' and '/calls/GT' data through a
        # Pipeline as when called directly.
        pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        first_set = in_vars.samples[:20]
        second_set = in_vars.samples[20:]
        chi2_filter = Chi2GtFreqs2SampleSetsFilter(first_set, second_set,
                                                   min_pval=0.05,
                                                   do_histogram=True)
        # No id_ is given here; the results are read back under '0' below
        # (presumably the id the pipeline assigns by default -- confirm).
        pipe.append(chi2_filter)

        piped_vars = VariationsArrays()
        piped = pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = chi2_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['0'][key], direct[key])

        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
Exemple #12
0
    def test_het(self):
        # Pipeline vs. direct call of ObsHetFilter(min_het=0.02, max_het=0.5)
        # limited to the first 20 entries of ``samples``: 'counts', 'edges'
        # and '/calls/GT' must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        subset = source_vars.samples[:20]
        obs_het_flt = ObsHetFilter(min_het=0.02, max_het=0.5, samples=subset,
                                   do_histogram=True)
        # append() gets no id_; the pipeline results are looked up under '0'
        # further down (presumably the default id -- confirm).
        pipe.append(obs_het_flt)

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = obs_het_flt(source_vars)
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['0'][name], via_call[name])

        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #13
0
    def test_filter_chi2_gt_sample_sets(self):
        # Pipeline vs. direct call of Chi2GtFreqs2SampleSetsFilter
        # (min_pval=0.05) on two sample sets, ``samples[:20]`` and
        # ``samples[20:]``: 'counts', 'edges' and '/calls/GT' must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        set_a = source_vars.samples[:20]
        set_b = source_vars.samples[20:]
        chi2_flt = Chi2GtFreqs2SampleSetsFilter(set_a, set_b, min_pval=0.05,
                                                do_histogram=True)
        # append() gets no id_; the pipeline results are looked up under '0'
        # further down (presumably the default id -- confirm).
        pipe.append(chi2_flt)

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = chi2_flt(source_vars)
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['0'][name], via_call[name])

        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #14
0
    def test_fieldpath(self):
        # Chain an IsVariableAnnotator (two named samples, annotation id
        # 'test') with a FieldValueFilter on the annotated field (value=0)
        # and check that the output ends up with 484 variations.
        pipe = Pipeline()
        annotation_id = 'test'
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')
        annotated_field = '/variations/info/{}'.format(annotation_id)

        checked_samples = ['1_14_1_gbs', '1_17_1_gbs']
        variable_annotator = IsVariableAnnotator(annot_id=annotation_id,
                                                 samples=checked_samples)
        pipe.append(variable_annotator)

        # second step: filter on the field path built from the same id
        value_filter = FieldValueFilter(field_path=annotated_field, value=0)
        pipe.append(value_filter)

        piped_vars = VariationsArrays()
        pipe.run(in_vars, piped_vars)
        assert piped_vars.num_variations == 484
Exemple #15
0
    def test_low_dp_gt(self):
        # Pipeline vs. direct call of LowDPGTsToMissingSetter(min_dp=5): the
        # resulting '/calls/GT' data must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        dp_setter = LowDPGTsToMissingSetter(min_dp=5)
        pipe.append(dp_setter)

        sink_vars = VariationsArrays()
        pipe.run(source_vars, sink_vars)

        # the same object, used without a pipeline, is the reference
        via_call = dp_setter(source_vars)
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #16
0
    def test_biallelic(self):
        # Pipeline vs. direct call of NonBiallelicFilter(): the resulting
        # '/calls/GT' data must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        non_biallelic_flt = NonBiallelicFilter()
        pipe.append(non_biallelic_flt)

        sink_vars = VariationsArrays()
        pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = non_biallelic_flt(source_vars)
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #17
0
    def test_filter_samples(self):
        # Pipeline vs. direct call of a SampleFilter built from
        # ``samples[:20]``: the resulting '/calls/GT' data must agree.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        subset = source_vars.samples[:20]
        sample_flt = SampleFilter(subset)
        pipe.append(sample_flt)

        sink_vars = VariationsArrays()
        pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = sample_flt(source_vars)
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
Exemple #18
0
    def test_field_filter(self):
        # Pipeline vs. direct call of FieldFilter(kept_fields=[GT_FIELD]):
        # the '/calls/GT' data must agree and GT_FIELD must be the only key
        # present in either output.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        only_gt_flt = FieldFilter(kept_fields=[GT_FIELD])
        pipe.append(only_gt_flt)

        sink_vars = VariationsArrays()
        pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = only_gt_flt(source_vars)
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)

        keys_pipeline = list(sink_vars.keys())
        assert keys_pipeline == [GT_FIELD]
        keys_call = list(via_call[FLT_VARS].keys())
        assert keys_call == [GT_FIELD]
Exemple #19
0
    def test_pipeline(self):
        # Three MinCalledGTsFilter configurations are each run through a
        # fresh Pipeline (registered as 'filter1') and compared with the
        # result of calling the same filter object directly.
        in_pipe = Pipeline()
        in_path = join(TEST_DATA_DIR, 'ril.hdf5')
        in_vars = VariationsH5(in_path, mode='r')

        # --- 1) explicit range_ -------------------------------------------
        called_filter = MinCalledGTsFilter(min_called=0.1, range_=(0, 1))
        in_pipe.append(called_filter, id_='filter1')

        piped_vars = VariationsArrays()
        piped = in_pipe.run(in_vars, piped_vars)

        # reference: apply the filter on its own, without any pipeline
        direct = called_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
        # the filtering statistics have to agree entry by entry as well
        for stat in (N_KEPT, TOT, N_FILTERED_OUT):
            piped_stat = piped['filter1'][FLT_STATS][stat]
            assert piped_stat == direct[FLT_STATS][stat]

        # --- 2) no range_, histogram requested ----------------------------
        in_pipe = Pipeline()
        called_filter = MinCalledGTsFilter(min_called=0.1, do_histogram=True)
        in_pipe.append(called_filter, id_='filter1')

        piped_vars = VariationsArrays()
        piped = in_pipe.run(in_vars, piped_vars)

        direct = called_filter(in_vars)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)

        # --- 3) rates=False, min_called given as 20 -----------------------
        in_pipe = Pipeline()
        called_filter = MinCalledGTsFilter(min_called=20, rates=False,
                                           do_histogram=True)
        in_pipe.append(called_filter, id_='filter1')

        piped_vars = VariationsArrays()
        piped = in_pipe.run(in_vars, piped_vars)

        direct = called_filter(in_vars)
        # the single step of this pipeline is expected to report order 0
        assert piped['filter1']['order'] == 0
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped['filter1'][key], direct[key])
        piped_gts = piped_vars['/calls/GT']
        direct_gts = direct[FLT_VARS]['/calls/GT']
        assert numpy.allclose(piped_gts, direct_gts)
Exemple #20
0
    def test_pipeline(self):
        # Pipeline vs. direct call of MinCalledGTsFilter in three setups
        # (explicit range_, histogram without range_, rates=False); each
        # setup uses a fresh Pipeline with the filter registered as
        # 'filter1'.
        pipe = Pipeline()
        data_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source_vars = VariationsH5(data_path, mode='r')

        # setup 1: min_called=0.1 with range_=(0, 1)
        min_called_flt = MinCalledGTsFilter(min_called=0.1, range_=(0, 1))
        pipe.append(min_called_flt, id_='filter1')

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        # the same filter object, used without a pipeline, is the reference
        via_call = min_called_flt(source_vars)
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['filter1'][name],
                                  via_call[name])
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)
        # kept / total / filtered-out statistics must be equal too
        for stat_key in (N_KEPT, TOT, N_FILTERED_OUT):
            stat_pipeline = via_pipeline['filter1'][FLT_STATS][stat_key]
            assert stat_pipeline == via_call[FLT_STATS][stat_key]

        # setup 2: min_called=0.1, do_histogram=True and no range_
        pipe = Pipeline()
        min_called_flt = MinCalledGTsFilter(min_called=0.1, do_histogram=True)
        pipe.append(min_called_flt, id_='filter1')

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        via_call = min_called_flt(source_vars)
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['filter1'][name],
                                  via_call[name])
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)

        # setup 3: rates=False with min_called=20
        pipe = Pipeline()
        min_called_flt = MinCalledGTsFilter(min_called=20, rates=False,
                                            do_histogram=True)
        pipe.append(min_called_flt, id_='filter1')

        sink_vars = VariationsArrays()
        via_pipeline = pipe.run(source_vars, sink_vars)

        via_call = min_called_flt(source_vars)
        # the only step in this pipeline is expected to report order 0
        assert via_pipeline['filter1']['order'] == 0
        for name in ('counts', 'edges'):
            assert numpy.allclose(via_pipeline['filter1'][name],
                                  via_call[name])
        gts_pipeline = sink_vars['/calls/GT']
        gts_call = via_call[FLT_VARS]['/calls/GT']
        assert numpy.allclose(gts_pipeline, gts_call)