def test_calc_missing_gt_rates(self):
        """Check calc_called_gt / calc_missing_gt on empty, file and toy data."""
        # An empty genotype array must give an empty result, as counts or rates.
        empty_vars = {'/calls/GT': numpy.array([])}
        for as_rates in (False, True):
            result = calc_called_gt(empty_vars, rates=as_rates)
            assert result.shape[0] == 0

        # The HDF5 file and an in-memory copy of its genotypes must agree.
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks(kept_fields=['/calls/GT']))
        mem_rates = calc_missing_gt(mem_vars)
        h5_rates = calc_missing_gt(h5_vars)
        assert mem_rates.shape == (943,)
        assert numpy.allclose(mem_rates, h5_rates)
        assert numpy.min(mem_rates) == 0
        assert numpy.all(mem_rates <= 1)

        # Four hand-made variations with two calls each.
        toy_vars = {'/calls/GT': numpy.array([[[0, 0], [0, 0]],
                                              [[0, 0], [-1, -1]],
                                              [[0, 0], [-1, -1]],
                                              [[-1, -1], [-1, -1]]])}
        called_counts = numpy.array([2, 1, 1, 0])
        counts = calc_called_gt(toy_vars, rates=False)
        assert numpy.all(counts == called_counts)

        counts = calc_missing_gt(toy_vars, rates=False)
        assert numpy.all(counts == 2 - called_counts)

        missing_rates = numpy.array([0, 0.5, 0.5, 1])
        assert numpy.allclose(calc_called_gt(toy_vars), 1 - missing_rates)
        assert numpy.allclose(calc_missing_gt(toy_vars), missing_rates)
    def test_calculate_hwe(self):
        """Exercise calc_hwe_chi2_test on empty, hand-made and file data."""
        gt_path = '/calls/GT'
        alt_path = '/variations/alt'

        # Empty matrices must give an empty result.
        empty_vars = VariationsArrays()
        nothing = numpy.array([])
        empty_vars[gt_path] = nothing
        empty_vars[alt_path] = nothing
        outcome = calc_hwe_chi2_test(empty_vars, min_num_genotypes=0,
                                     chunk_size=None)
        assert outcome.shape[0] == 0

        # Two hand-made variations that are expected to give the same row.
        first_var = [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
                     [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]]
        second_var = [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1],
                      [0, 0], [0, 0], [1, 0], [1, 1], [0, 0]]
        toy_vars = VariationsArrays()
        toy_vars[gt_path] = numpy.array([first_var, second_var])
        toy_vars._create_matrix(alt_path, shape=(1, 1),
                                dtype=numpy.int16, fillvalue=0)
        expected_row = [1.25825397e+01, 1.85240619e-03]
        expected = numpy.array([expected_row, expected_row])
        outcome = calc_hwe_chi2_test(toy_vars, min_num_genotypes=0,
                                     chunk_size=None)
        assert numpy.allclose(outcome, expected)

        # chunk_size=None and the default chunk size must agree on the file.
        h5_path = join(TEST_DATA_DIR, 'ril.hdf5')
        h5_vars = VariationsH5(h5_path, mode='r')
        without_chunks = calc_hwe_chi2_test(h5_vars, chunk_size=None)
        h5_vars = VariationsH5(h5_path, mode='r')
        default_chunks = calc_hwe_chi2_test(h5_vars)
        assert numpy.allclose(without_chunks, default_chunks, equal_nan=True)
    def test_calc_obs_het(self):
        """Check calc_obs_het on empty input, on ril.hdf5 and on a toy matrix.

        The toy matrix is also used to exercise the min_num_genotypes,
        min_call_dp and max_call_dp arguments.
        """
        # Empty input gives an empty result.
        gts = numpy.array([])
        dps = numpy.array([])
        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert het.shape[0] == 0

        # The HDF5 file and an in-memory copy of its genotypes must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
        het_array = calc_obs_het(snps, min_num_genotypes=0)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = numpy.array([[5, 12, 10, 10],
                           [10, 10, 10, 10]])

        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert numpy.allclose(het, [0.5, 0])

        het = calc_obs_het(varis, min_num_genotypes=10)
        # numpy.nan is used because the numpy.NaN alias was removed in
        # NumPy 2.0; the lowercase name exists in every NumPy release.
        assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
        assert numpy.allclose(het, [1, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
        assert numpy.allclose(het, [0, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
        assert numpy.allclose(het, [0.5, 0])
Beispiel #4
0
    def test_merge_with_depth(self):
        """Merge variations that carry depth, first with mocks and then files."""

        def make_var(depth):
            # Every call builds fresh objects so the two inputs share nothing.
            return {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                    'gts': numpy.array([[0, 0], [1, 1]]),
                    'dp': numpy.array([depth, depth])}

        left_vars = MockList([make_var(1)])
        right_vars = MockList([make_var(20)])
        left_vars.samples = ['a', 'b']
        right_vars.samples = ['c', 'd']
        mock_merger = MockMerger(gt_shape=(4, 2))

        merged = VarMerger._merge_vars(mock_merger, left_vars[0],
                                       right_vars[0])
        expected_var = {'gts': [[0, 0], [1, 1], [0, 0], [1, 1]], 'pos': 1,
                        'ref': b'A', 'chrom': '1', 'alt': [b'T'],
                        'dp': [1, 1, 20, 20]}
        self.var_is_equal(expected_var, merged)

        # merge the same var with depth
        h5_path = join(TEST_DATA_DIR, 'format_def.h5')
        merger = VarMerger(VariationsH5(h5_path, "r"),
                           VariationsH5(h5_path, "r"),
                           max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False, ignore_non_matching=True)
        merged_vars = VariationsArrays(ignore_overflows=True,
                                       ignore_undefined_fields=True)

        expected_dp = numpy.array([1, 8, 5, 1, 8, 5], dtype=numpy.int16)
        first_snv = list(merger.variations)[0]
        dp_entry = first_snv[8][1]
        assert dp_entry[0] == b'DP'
        assert numpy.all(dp_entry[1] == expected_dp)
        merged_vars.put_vars(merger)
        assert '/calls/DP' in merged_vars.keys()
        assert numpy.all(merged_vars['/calls/DP'][0] == expected_dp)
    def test_calc_obs_het_sample(self):
        """Check calc_obs_het_by_sample on ril.hdf5, a toy matrix and no data."""
        # The HDF5 file and an in-memory copy of its genotypes must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het_by_sample(hdf5)
        het_array = calc_obs_het_by_sample(snps)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1]]])

        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        # numpy.nan is used because the numpy.NaN alias was removed in
        # NumPy 2.0; the lowercase name exists in every NumPy release.
        assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

        # Empty input gives an empty result.
        gts = numpy.array([])
        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        assert het.shape[0] == 0

        snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        # These two calls only check that the depth limits are accepted.
        calc_obs_het_by_sample(snps, min_call_dp=3)
        calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
        # The default chunk size and chunk_size=None must give the same result.
        het_0 = calc_obs_het_by_sample(snps)
        het = calc_obs_het_by_sample(snps, chunk_size=None)
        assert numpy.allclose(het_0, het)
    def test_iterate_wins(self):
        """Rebuild ril.hdf5 from its windows and compare the positions."""
        fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hd5 = VariationsH5(fpath, mode='r')
        wins = hd5.iterate_wins(win_size=1000000)

        hd5_2 = VariationsArrays()
        hd5_2.put_chunks(wins)
        # The comparison used to be computed and thrown away, so this test
        # could never fail. array_equal also reports a shape mismatch as False.
        assert numpy.array_equal(hd5['/variations/pos'][:],
                                 hd5_2['/variations/pos'][:])
 def test_delete_item_from_variationArray(self):
     """Load format_def.vcf and check that a field can be deleted."""
     # The with block closes the file even when the assertion fails.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
 def test_put_vars_arrays_from_vcf(self):
     """Load format_def.vcf into VariationsArrays and check GT and GQ."""
     # The with block closes the file even when an assertion fails.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         assert snps['/calls/GT'].shape == (5, 3, 2)
         assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
         expected = numpy.array([48, 48, 43], dtype=numpy.int16)
         assert numpy.all(snps['/calls/GQ'][0, :] == expected)
    def test_calc_distrib_for_sample(self):
        """Check the depth distributions for one sample and for all samples."""
        dp_path = '/calls/DP'
        gt_path = '/calls/GT'
        sample_name = '1_17_1_gbs'

        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks())
        from_h5, _ = calc_field_distrib_for_a_sample(h5_vars, field=dp_path,
                                                     sample=sample_name,
                                                     n_bins=15)
        assert from_h5.shape == (15,)

        # The in-memory copy must give the same counts, whatever the chunking.
        no_chunks, _ = calc_field_distrib_for_a_sample(mem_vars,
                                                       field=dp_path,
                                                       n_bins=15,
                                                       sample=sample_name,
                                                       chunk_size=None)
        assert numpy.all(from_h5 == no_chunks)

        chunks_of_50, _ = calc_field_distrib_for_a_sample(mem_vars,
                                                          field=dp_path,
                                                          n_bins=15,
                                                          sample=sample_name,
                                                          chunk_size=50)
        assert numpy.all(chunks_of_50 == no_chunks)

        toy_vars = VariationsArrays()
        toy_vars[dp_path] = numpy.array([[10, 5, 15],
                                         [0, 15, 10]])
        toy_vars[gt_path] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                         [[0, 0], [0, 1], [1, 1]]])
        toy_vars.samples = list(range(3))

        def bins_with_hits(*hits):
            # A 16-bin row holding a 1 at every listed bin index.
            return [1 if idx in hits else 0 for idx in range(16)]

        # Each case: extra keyword arguments, then the non-empty bins expected
        # for each of the three samples.
        cases = [
            ({}, [(0, 10), (5, 15), (10, 15)]),
            ({'mask_field': gt_path, 'mask_func': call_is_het},
             [(), (5, 15), ()]),
            ({'mask_field': gt_path, 'mask_func': call_is_hom},
             [(0, 10), (), (10, 15)]),
        ]
        for mask_kwargs, hits_per_sample in cases:
            distrib, _ = calc_field_distribs_per_sample(toy_vars,
                                                        field=dp_path,
                                                        n_bins=16,
                                                        **mask_kwargs)
            expec = [bins_with_hits(*hits) for hits in hits_per_sample]
            assert numpy.all(expec == distrib)
            assert numpy.all(calc_depth(toy_vars) == [10, 5, 15, 0, 15, 10])
Beispiel #10
0
    def test_ignore_non_matching(self):
        """Merge two files with ignore_non_matching and count what is left."""
        first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merge_options = {'max_field_lens': {'alt': 3},
                         'ignore_complex_overlaps': True,
                         'check_ref_matches': False,
                         'ignore_non_matching': True}
        merger = VarMerger(first_h5, second_h5, **merge_options)

        merged = VariationsArrays(ignore_overflows=True,
                                  ignore_undefined_fields=True)
        merged.put_vars(merger)
        assert merged.num_variations == 1
Beispiel #11
0
    def test_field_filter(self):
        """Run FieldFilter through a Pipeline and directly; both keep only GT."""
        pipe = Pipeline()
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        keep_gt = FieldFilter(kept_fields=[GT_FIELD])
        pipe.append(keep_gt)

        piped_out = VariationsArrays()
        pipe.run(source, piped_out)

        # Calling the filter directly must give the same result.
        direct_out = keep_gt(source)[FLT_VARS]
        assert numpy.allclose(piped_out['/calls/GT'],
                              direct_out['/calls/GT'])
        for filtered in (piped_out, direct_out):
            assert list(filtered.keys()) == [GT_FIELD]
Beispiel #12
0
    def test_field_filter(self):
        """Compare FieldFilter run inside a Pipeline with a direct call."""
        pipe = Pipeline()
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        gt_only = FieldFilter(kept_fields=[GT_FIELD])
        pipe.append(gt_only)

        from_pipeline = VariationsArrays()
        pipe.run(h5_vars, from_pipeline)

        # The direct call must produce the same genotypes and the same keys.
        from_call = gt_only(h5_vars)[FLT_VARS]
        assert numpy.allclose(from_pipeline['/calls/GT'],
                              from_call['/calls/GT'])
        for kept in (from_pipeline, from_call):
            assert list(kept.keys()) == [GT_FIELD]
Beispiel #13
0
    def test_min_mac(self):
        """Run MacFilter through a Pipeline and directly; compare histograms."""
        pipe = Pipeline()
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        mac_filter = MacFilter(min_mac=10, max_mac=30, do_histogram=True)
        pipe.append(mac_filter, id_='filter1')

        piped_out = VariationsArrays()
        piped_result = pipe.run(source, piped_out)

        # Calling the filter directly must give the same histogram.
        direct_result = mac_filter(source)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped_result['filter1'][key],
                                  direct_result[key])
        assert not piped_out.keys()

        assert direct_result[FLT_VARS]['/calls/GT'].shape[0] == 0
Beispiel #14
0
 def test_mat012(self):
     """Check gts_as_mat012 on a 3-variation, 4-sample genotype matrix."""
     varis = VariationsArrays()
     varis[GT_FIELD] = numpy.array(
         [[[0, 0], [0, 1], [2, 2], [-1, 3]],
          [[0, 0], [0, 0], [1, 1], [2, 2]],
          [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
     expected = [[0, 1, 2, -1],
                 [0, 0, 2, 2],
                 [-1, -1, -1, -1]]
     assert numpy.allclose(varis.gts_as_mat012, expected, equal_nan=True)
    def test_pca(self):
        """Run do_pca on ril.hdf5 and check projections for a toy matrix."""
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        do_pca(h5_vars)

        # Four identical variations: samples a and b match, c differs.
        one_variation = [[0, 0], [0, 0], [1, 1]]
        gts = numpy.array([one_variation] * 4)
        toy_vars = VariationsArrays()
        toy_vars[GT_FIELD] = gts
        toy_vars.samples = ['a', 'b', 'c']

        projections = do_pca(toy_vars)['projections']
        assert projections.shape[0] == gts.shape[1]
        assert numpy.allclose(projections[0], projections[1])
        assert not numpy.allclose(projections[0], projections[2])
Beispiel #16
0
    def test_chunk_pairs(self):
        """Check which chunk pairs iterate_chunk_pairs yields for two max_dist."""
        positions = [5, 7, 8, 10, 11, 12]
        varis = VariationsArrays()
        varis[POS_FIELD] = numpy.array(positions)
        varis[CHROM_FIELD] = numpy.array(['c1'] * len(positions))

        def first_positions(max_dist):
            # First position of each chunk in every yielded pair.
            pairs = list(varis.iterate_chunk_pairs(max_dist=max_dist,
                                                   chunk_size=2))
            return [(pair['chunk1'][POS_FIELD][0],
                     pair['chunk2'][POS_FIELD][0]) for pair in pairs]

        assert first_positions(3) == [(5, 5), (5, 8), (8, 8), (8, 11),
                                      (11, 11)]
        assert first_positions(4) == [(5, 5), (5, 8), (5, 11), (8, 8),
                                      (8, 11), (11, 11)]
Beispiel #17
0
 def test_sort_variations(self):
     """Parse standard_ex.tsv, sort it and check the chrom/pos order."""
     var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
                 b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                 b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325}}
     # The with block closes the file even when an assertion fails.
     with open(join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv'), 'rb') as fhand:
         parser = CSVParser(fhand, var_info, first_sample_column=1,
                            sep=b'\t', max_field_lens={'alt': 3},
                            max_field_str_lens={'chrom': 10, 'alt': 10})
         variations = VariationsArrays(ignore_overflows=True,
                                       ignore_undefined_fields=True)
         variations.put_vars(parser)
         sorted_vars = VariationsArrays()
         sort_variations(variations, sorted_vars)
         exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
         exp_pos = [325, 346, 345]
         assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
         assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
Beispiel #18
0
    def test_calc_dp_for_sample(self):
        """Check the 'hom' and 'het' depth counts of one sample in ril.hdf5."""
        sample_name = '1_17_1_gbs'
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks())
        counts, _ = calc_call_dp_distrib_for_a_sample(h5_vars,
                                                      sample=sample_name,
                                                      n_bins=15)
        for kind in ('hom', 'het'):
            assert counts[kind].shape == (15,)
        # NOTE(review): this early return makes everything below unreachable;
        # confirm whether the chunk-size comparisons were disabled on purpose.
        return
        for chunk_size in (None, 50):
            other, _ = calc_call_dp_distrib_for_a_sample(h5_vars,
                                                         sample=sample_name,
                                                         n_bins=15,
                                                         chunk_size=chunk_size)
            for kind in ('hom', 'het'):
                assert numpy.all(counts[kind] == other[kind])
Beispiel #19
0
    def test_by_chunks(self):
        """Load format_def.vcf with default settings and with vars_in_chunk=1.

        The test has no assertions: it only checks that both loads finish
        without raising.
        """
        vcf_path = join(TEST_DATA_DIR, 'format_def.vcf')
        for array_kwargs in ({}, {'vars_in_chunk': 1}):
            # The with block closes the file even when loading raises.
            with open(vcf_path, 'rb') as fhand:
                vcf_parser = VCFParser(fhand=fhand, n_threads=None)
                snps = VariationsArrays(**array_kwargs)
                snps.put_vars(vcf_parser)
Beispiel #20
0
    def test_calc_snp_density(self):
        """Check calc_snp_density on ril.hdf5 and on hand-made positions."""
        chrom_path = '/variations/chrom'
        pos_path = '/variations/pos'

        # The HDF5 file and its in-memory copy must give the same densities.
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks())
        from_h5 = list(calc_snp_density(h5_vars, 1000))
        from_memory = list(calc_snp_density(mem_vars, 1000))
        assert from_memory == from_h5

        # Same positions every time, only the chromosome layout changes.
        positions = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]
        cases = [
            (['ch'] * 14,
             [6, 7, 7, 7, 7, 7, 6, 1, 1, 1, 1, 1, 2, 2]),
            (['ch'] * 3 + ['c2'] * 10 + ['c3'],
             [3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]),
            (['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3'],
             [1, 1, 1, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1]),
        ]
        for chroms, expected in cases:
            toy = {chrom_path: numpy.array(chroms),
                   pos_path: numpy.array(positions)}
            assert list(calc_snp_density(toy, 11)) == expected

        no_vars = {chrom_path: numpy.array([]),
                   pos_path: numpy.array([])}
        assert list(calc_snp_density(no_vars, 11)) == []

        one_var = {chrom_path: numpy.array([1]),
                   pos_path: numpy.array([1])}
        assert list(calc_snp_density(one_var, 11)) == [1]
    def test_calc_dp_for_sample(self):
        """Check the shape of the 'hom'/'het' depth counts for one sample."""
        sample_id = '1_17_1_gbs'
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(source.iterate_chunks())
        base_counts, _ = calc_call_dp_distrib_for_a_sample(source,
                                                           sample=sample_id,
                                                           n_bins=15)
        for call_kind in ('hom', 'het'):
            assert base_counts[call_kind].shape == (15,)
        # NOTE(review): the return below leaves the rest of the test
        # unreachable; confirm whether that was meant to be temporary.
        return
        for size in (None, 50):
            counts, _ = calc_call_dp_distrib_for_a_sample(source,
                                                          sample=sample_id,
                                                          n_bins=15,
                                                          chunk_size=size)
            for call_kind in ('hom', 'het'):
                assert numpy.all(base_counts[call_kind] == counts[call_kind])
    def test_calc_snp_density(self):
        """Compare calc_snp_density results for file, memory and toy inputs."""
        chrom_key = '/variations/chrom'
        pos_key = '/variations/pos'

        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(source.iterate_chunks())
        density_source = list(calc_snp_density(source, 1000))
        density_memory = list(calc_snp_density(in_memory, 1000))
        assert density_memory == density_source

        def density(chroms, positions, window=11):
            # Builds fresh arrays for every call, then collects the result.
            data = {chrom_key: numpy.array(chroms),
                    pos_key: numpy.array(positions)}
            return list(calc_snp_density(data, window))

        toy_pos = [1, 2, 3, 4, 5, 6, 7, 25, 34, 44, 80, 200, 300, 302]

        # A single chromosome.
        assert density(['ch'] * 14, toy_pos) == [6, 7, 7, 7, 7, 7, 6,
                                                 1, 1, 1, 1, 1, 2, 2]

        # Three chromosomes: 3, 10 and 1 variations.
        mixed = ['ch'] * 3 + ['c2'] * 10 + ['c3']
        assert density(mixed, toy_pos) == [3, 3, 3, 4, 4, 4, 4,
                                           1, 1, 1, 1, 1, 1, 1]

        # The first three variations each sit on their own chromosome.
        scattered = ['c1', 'c4', 'c5'] + ['c2'] * 10 + ['c3']
        assert density(scattered, toy_pos) == [1, 1, 1, 4, 4, 4, 4,
                                               1, 1, 1, 1, 1, 1, 1]

        assert density([], []) == []
        assert density([1], [1]) == [1]
Beispiel #23
0
    def test_pca(self):
        """Check do_pca runs on ril.hdf5 and separates a toy sample set."""
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        do_pca(source)

        # Samples 'a' and 'b' share every genotype; 'c' differs everywhere.
        genotypes = numpy.array([[[0, 0], [0, 0], [1, 1]]
                                 for _ in range(4)])
        small = VariationsArrays()
        small[GT_FIELD] = genotypes
        small.samples = ['a', 'b', 'c']

        pca_result = do_pca(small)
        sample_projs = pca_result['projections']
        assert sample_projs.shape[0] == genotypes.shape[1]
        assert numpy.allclose(sample_projs[0], sample_projs[1])
        assert not numpy.allclose(sample_projs[0], sample_projs[2])
Beispiel #24
0
    def test_nei_dist(self):
        """Check _calc_pop_pairwise_unbiased_nei_dists on small matrices."""
        expected_dist = 0.3726315908494797
        missing_call = [-1, -1]
        partly_called = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                         [[1, 1], [1, 2], [2, 2], [2, 1]],
                         [missing_call] * 4]

        def build_variations(genotypes):
            built = VariationsArrays()
            built[GT_FIELD] = numpy.array(genotypes)
            built.samples = [1, 2, 3, 4]
            return built

        def first_dist(varis, pops, **kwargs):
            dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                          populations=pops,
                                                          **kwargs)
            return dists[0]

        varis = build_variations(partly_called)
        pops = [[1, 2], [3, 4]]
        assert math.isclose(first_dist(varis, pops, min_num_genotypes=1),
                            expected_dist)
        assert math.isclose(first_dist(varis, pops, min_num_genotypes=1,
                                       chunk_size=1),
                            expected_dist)

        # all missing
        varis = build_variations([[missing_call] * 4])
        pops = [[1, 2], [3, 4]]
        assert math.isnan(first_dist(varis, pops, min_num_genotypes=1))

        # min_num_genotypes
        varis = build_variations(partly_called)
        pops = [[1, 2], [3, 4]]
        assert math.isclose(first_dist(varis, pops, min_num_genotypes=1),
                            expected_dist)
        # With the default min_num_genotypes the distance is expected to be NaN.
        assert math.isnan(first_dist(varis, pops, chunk_size=1))
Beispiel #25
0
 def test_write_meta_header(self):
     """Check the written VCF meta and header lines against the input files.

     For each file, every line produced by _write_vcf_meta and
     _write_vcf_header must be one of the '#' lines of the original VCF.
     """
     fnames = ['format_def_without_info.vcf',
               'format_def_without_filter.vcf',
               'format_without_flt_info_qual.vcf']
     for fname in fnames:
         vcf_path = join(TEST_DATA_DIR, fname)
         # Both reads use with so the handle is closed even on failure.
         with open(vcf_path, 'rb') as vcf_fhand:
             header_lines = [line for line in vcf_fhand
                             if line.startswith(b'#')]
         with open(vcf_path, 'rb') as vcf_fhand:
             vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                             pre_read_max_size=10000)
             var_array = VariationsArrays(ignore_undefined_fields=True)
             var_array.put_vars(vcf)
             with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                 _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                 _write_vcf_header(var_array, tmp_fhand)
                 # Flush so the second handle opened below sees the data.
                 tmp_fhand.flush()
                 with open(tmp_fhand.name, 'rb') as retmp_fhand:
                     for line in retmp_fhand:
                         assert line in header_lines
    def test_samples(self):
        """Check that samples can be set on VariationsArrays and VariationsH5."""
        gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                           [[0, 0], [0, 0], [1, 1], [2, 2]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4]
        assert varis.samples == [1, 2, 3, 4]

        # With another file
        # Only the unique name is needed: the temporary file is closed (and
        # so removed) before its path is handed to VariationsH5.
        with NamedTemporaryFile() as tmp_fhand:
            path = tmp_fhand.name
        # The with block closes the VCF even when loading raises.
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)
        # No assertion follows: this only checks the setter does not raise.
        samples = h5.samples
        samples[0] = '0'
        h5.samples = samples
Beispiel #27
0
    def test_samples(self):
        """Check the samples setter on VariationsArrays and on VariationsH5."""
        gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                           [[0, 0], [0, 0], [1, 1], [2, 2]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4]
        assert varis.samples == [1, 2, 3, 4]

        # With another file
        # Only the unique name is needed: the temporary file is closed (and
        # so removed) before its path is handed to VariationsH5.
        with NamedTemporaryFile() as tmp_fhand:
            path = tmp_fhand.name
        # The with block closes the VCF even when loading raises.
        with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand)
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            h5.put_vars(vcf_parser)
        # No assertion follows: this only checks the setter does not raise.
        samples = h5.samples
        samples[0] = '0'
        h5.samples = samples
Beispiel #28
0
    def test_kosman_pairwise_by_chunk(self):
        """Check calc_pairwise_distance with and without chunks and missing."""

        def build_variations(*per_sample_gts):
            # Stack the samples, then put the stacked (sample) axis second.
            stacked = numpy.stack(per_sample_gts, axis=0)
            reordered = numpy.transpose(stacked, axes=(1, 0, 2))
            built = VariationsArrays()
            built['/calls/GT'] = reordered.astype(numpy.int16)
            return built

        sample_a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0],
                                [0, 1], [0, 1], [0, 1], [0, 0], [0, 0],
                                [0, 1]])
        sample_b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1],
                                [0, 1], [1, 0], [1, 0], [1, 0], [0, 1],
                                [1, 1]])
        all_ones = [numpy.full(shape=(11, 2), fill_value=1,
                               dtype=numpy.int16) for _ in range(2)]
        variations = build_variations(sample_a, sample_b, *all_ones)

        expected = [0.33333333, 0.75, 0.75, 0.45, 0.45, 0.]
        distance = calc_pairwise_distance(variations, chunk_size=None,
                                          min_num_snps=1)
        assert numpy.allclose(distance, expected)

        distance = calc_pairwise_distance(variations, chunk_size=2)
        assert numpy.allclose(distance, expected)

        distance = calc_pairwise_distance(variations, chunk_size=None,
                                          min_num_snps=11)
        assert numpy.sum(numpy.isnan(distance)) == 5

        # With all missing
        all_missing = [numpy.full(shape=(10, 2), fill_value=-1,
                                  dtype=numpy.int16) for _ in range(2)]
        variations = build_variations(*all_missing)
        assert numpy.isnan(calc_pairwise_distance(variations)[0])

        # With missing in some chunks only
        variations['/calls/GT'][:5, 0, :] = 1
        variations['/calls/GT'][:5, 1, :] = 0
        for extra_kwargs in ({}, {'chunk_size': 3}):
            assert calc_pairwise_distance(variations, **extra_kwargs)[0] == 1
Beispiel #29
0
    def test_calculate_hwe(self):
        """Check calc_hwe_chi2_test on empty, toy and ril.hdf5 inputs."""
        # Empty matrices must give an empty result.
        no_data = numpy.array([])
        empty = VariationsArrays()
        empty['/calls/GT'] = no_data
        empty['/variations/alt'] = no_data
        res = calc_hwe_chi2_test(empty, min_num_genotypes=0, chunk_size=None)
        assert res.shape[0] == 0

        # Two hand-made variations with ten calls each.
        toy = VariationsArrays()
        toy['/calls/GT'] = numpy.array([
            [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1],
             [0, 0], [0, 0], [0, 1], [1, 1], [0, 0]],
            [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1],
             [0, 0], [0, 0], [1, 0], [1, 1], [0, 0]],
        ])
        toy._create_matrix('/variations/alt', shape=(1, 1),
                           dtype=numpy.int16, fillvalue=0)
        # Both variations are expected to give the same row of values.
        same_row = [1.25825397e+01, 1.85240619e-03]
        res = calc_hwe_chi2_test(toy, min_num_genotypes=0, chunk_size=None)
        assert numpy.allclose(res, numpy.array([same_row, same_row]))

        # chunk_size=None and the default chunk size must agree on the file.
        ril_path = join(TEST_DATA_DIR, 'ril.hdf5')
        source = VariationsH5(ril_path, mode='r')
        no_chunks = calc_hwe_chi2_test(source, chunk_size=None)
        source = VariationsH5(ril_path, mode='r')
        with_default = calc_hwe_chi2_test(source)
        assert numpy.allclose(no_chunks, with_default, equal_nan=True)
Beispiel #30
0
    def test_merge_variations(self):
        """Merge format.h5 and format_def.h5 in both orders.

        Each merge result is compared field by field with a stored file:
        expected_merged.h5 for the first order and expected_merged2.h5 for
        the reversed one.
        """

        def assert_same_fields(expected_vars, merged_vars):
            # Compares every field of merged_vars with expected_vars. Float
            # fields go through remove_nans first, as in the original test.
            for field in merged_vars.keys():
                result = merged_vars[field][:]
                wanted = expected_vars[field][:]
                if 'float' in str(result.dtype):
                    same = numpy.all(remove_nans(wanted) ==
                                     remove_nans(result))
                else:
                    same = numpy.all(wanted == result)
                # The first loop used to catch AssertionError and only print,
                # so a mismatch never failed the test. The details now go in
                # the assertion message instead.
                assert same, ('field %r differs\nexpected: %r\nresult: %r'
                              % (field, wanted, result))

        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)
        assert_same_fields(expected_h5, new_vars)

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)
        assert_same_fields(expected_h5, new_vars)
Beispiel #31
0
def _filter_samples_by_index(variations,
                             sample_cols,
                             filtered_vars=None,
                             reverse=False):
    """Copy ``variations`` keeping only the selected sample columns.

    ``sample_cols`` is either a boolean mask (one flag per sample) or a
    collection of integer sample indexes.  Every field whose path contains
    'calls' is indexed along its second axis with the mask; every other
    field is copied as it is.  When ``reverse`` is true the selection is
    inverted.  The result is written into ``filtered_vars`` (a new
    VariationsArrays when None is given), which is also returned with its
    ``metadata`` and ``samples`` set.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    except AttributeError:
        # No dtype attribute: a plain Python collection, so the type of its
        # first item decides whether it is a mask or a list of indexes.
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        # Expand the indexes into one boolean flag per sample.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if 'shape' not in dir(sample_cols):
        # The builtin bool is used as dtype: the numpy.bool alias does not
        # exist in NumPy 1.24-1.26.
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    filtered_vars.samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    return filtered_vars
Beispiel #32
0
 def test_write_meta_header(self):
     """Every meta/header line written for a parsed VCF must be one of the
     '#' lines of the VCF file it was parsed from."""
     fnames = [
         'format_def_without_info.vcf', 'format_def_without_filter.vcf',
         'format_without_flt_info_qual.vcf'
     ]
     for fname in fnames:
         fpath = join(TEST_DATA_DIR, fname)
         # Reference: the header lines of the original file.
         with open(fpath, 'rb') as vcf_fhand:
             header_lines = [
                 line for line in vcf_fhand if line.startswith(b'#')
             ]
         with open(fpath, 'rb') as vcf_fhand:
             vcf = VCFParser(vcf_fhand)
             var_array = VariationsArrays(ignore_undefined_fields=True)
             var_array.put_vars(vcf)
             with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                 _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                 _write_vcf_header(var_array, tmp_fhand)
                 # Flush so the lines are visible when the file is reopened.
                 tmp_fhand.flush()
                 with open(tmp_fhand.name, 'rb') as retmp_fhand:
                     for line in retmp_fhand:
                         assert line in header_lines
Beispiel #33
0
    def test_calc_distrib_for_sample(self):
        """Check the DP distributions for one sample and per sample, with
        different containers, chunk sizes and genotype masks."""
        h5_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mem_vars = VariationsArrays()
        mem_vars.put_chunks(h5_vars.iterate_chunks())

        # Arguments shared by the three single-sample calls.
        common = {'field': '/calls/DP', 'sample': '1_17_1_gbs', 'n_bins': 15}

        from_h5, _ = calc_field_distrib_for_a_sample(h5_vars, **common)
        assert from_h5.shape == (15,)

        unchunked, _ = calc_field_distrib_for_a_sample(mem_vars,
                                                       chunk_size=None,
                                                       **common)
        assert numpy.all(from_h5 == unchunked)

        chunked, _ = calc_field_distrib_for_a_sample(mem_vars, chunk_size=50,
                                                     **common)
        assert numpy.all(chunked == unchunked)

        # Toy data set: two variations, three samples.
        gt_row = [[0, 0], [0, 1], [1, 1]]
        toy = VariationsArrays()
        toy['/calls/DP'] = numpy.array([[10, 5, 15],
                                        [0, 15, 10]])
        toy['/calls/GT'] = numpy.array([gt_row, gt_row])
        toy.samples = list(range(3))

        # Expected 16-bin histograms, one per sample, plus an all-zero one
        # for the samples that a mask leaves without any call.
        no_counts = [0] * 16
        sample0 = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
        sample1 = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        sample2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]

        cases = [
            ({}, [sample0, sample1, sample2]),
            ({'mask_field': '/calls/GT', 'mask_func': call_is_het},
             [no_counts, sample1, no_counts]),
            ({'mask_field': '/calls/GT', 'mask_func': call_is_hom},
             [sample0, no_counts, sample2]),
        ]
        for mask_kwargs, expec in cases:
            distrib, _ = calc_field_distribs_per_sample(toy,
                                                        field='/calls/DP',
                                                        n_bins=16,
                                                        **mask_kwargs)
            assert numpy.all(expec == distrib)
            # The depths are checked again after each distribution call.
            assert numpy.all(calc_depth(toy) == [10, 5, 15, 0, 15, 10])
Beispiel #34
0
    def test_excel(self):
        """Smoke test: write_excel runs with only genotypes, after adding
        chrom/pos and after adding ref/alt.  Nothing is asserted about the
        written content."""
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                           [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        variations.samples = list(range(gts.shape[1]))

        # The temporary files are handled with context managers so they are
        # closed and removed even when write_excel raises.
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

        # chrom pos
        variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
        variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

            # REF, ALT (written to the same file handle as the previous
            # call)
            variations[REF_FIELD] = numpy.array(['A', 'A', 'A', 'A'])
            variations[ALT_FIELD] = numpy.array([['T'], ['T'], ['T'], ['T']])
            write_excel(variations, fhand)
Beispiel #35
0
 def test_expected_het(self):
     """Check calc_expected_het and calc_unbias_expected_het on three
     variations with seven diploid genotypes each."""
     # First variation ends with a [1, 0] call; the other two are
     # identical and only hold [0, 0] and [1, 1] calls.
     with_het = [[0, 0]] * 3 + [[1, 1]] * 3 + [[1, 0]]
     only_hom = [[0, 0]] * 3 + [[1, 1]] * 4

     snps = VariationsArrays()
     snps['/calls/GT'] = numpy.array([with_het, only_hom, only_hom])

     plain_het = calc_expected_het(snps, min_num_genotypes=0)
     assert numpy.allclose(plain_het, [0.5, 0.48979592, 0.48979592])

     unbiased_het = calc_unbias_expected_het(snps, min_num_genotypes=0)
     assert numpy.allclose(unbiased_het,
                           [0.53846154, 0.52747253, 0.52747253])
Beispiel #36
0
 def test_vcf_detect_fields(self):
     """Check the metadata obtained when the VCF is parsed with
     kept_fields or with ignored_fields set to '/variations/qual'."""
     fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # Both handles are closed by the with statement, also when the parsing
     # or one of the assertions fails.
     with open(fpath, 'rb') as vcf_fhand, open(fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata
         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata
         assert '/calls/HQ' in metadata.keys()
         assert '/variations/qual' not in metadata2.keys()
Beispiel #37
0
    def test_maf(self):
        """Check calc_maf and calc_mac on empty, toy and file genotypes."""
        # Empty genotypes: both statistics are empty, with and without
        # an explicit chunk_size=None.
        gts = numpy.array([])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        mafs = calc_maf(varis, chunk_size=None)
        assert mafs.shape == (0,)
        mafs = calc_maf(varis)
        assert mafs.shape == (0,)

        macs = calc_mac(varis, chunk_size=None)
        assert macs.shape == (0,)
        macs = calc_mac(varis)
        assert macs.shape == (0,)

        # Toy genotypes: the last variation only has missing (-1) calls.
        gts = numpy.array([[[0], [0], [0], [0]], [[0], [0], [1], [1]],
                           [[0], [0], [0], [1]], [[-1], [-1], [-1], [-1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        mafs = calc_maf(varis, min_num_genotypes=1)
        # numpy.nan is used because the numpy.NaN alias was removed in
        # NumPy 2.0.
        assert numpy.allclose(mafs, numpy.array([1., 0.5, 0.75, numpy.nan]),
                              equal_nan=True)

        macs = calc_mac(varis, min_num_genotypes=1)
        assert numpy.allclose(macs, numpy.array([4, 2, 3, numpy.nan]),
                              equal_nan=True)

        varis = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        mafs = calc_maf(varis)
        has_maf = numpy.logical_not(numpy.isnan(mafs))
        assert numpy.all(mafs[has_maf] >= 0.5)
        assert numpy.all(mafs[has_maf] <= 1)
        assert mafs.shape == (943,)

        macs = calc_mac(varis)
        min_mac = varis['/calls/GT'].shape[1] / 2
        max_mac = varis['/calls/GT'].shape[1]
        # The MACs are selected with the mask of the non-NaN MAFs.
        assert numpy.all(macs[has_maf] >= min_mac)
        assert numpy.all(macs[has_maf] <= max_mac)
Beispiel #38
0
    def test_low_dp_gt(self):
        """LowDPGTsToMissingSetter must give the same genotypes when run
        through a Pipeline and when called directly."""
        low_dp_setter = LowDPGTsToMissingSetter(min_dp=5)
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        pipe = Pipeline()
        pipe.append(low_dp_setter)
        from_pipeline = VariationsArrays()
        pipe.run(source, from_pipeline)

        # Same setter applied without any pipeline.
        from_call = low_dp_setter(source)
        gt_path = '/calls/GT'
        assert numpy.allclose(from_pipeline[gt_path],
                              from_call[FLT_VARS][gt_path])
Beispiel #39
0
    def test_biallelic(self):
        """NonBiallelicFilter must give the same genotypes when run through
        a Pipeline and when called directly."""
        biallelic_flt = NonBiallelicFilter()
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        pipe = Pipeline()
        pipe.append(biallelic_flt)
        from_pipeline = VariationsArrays()
        pipe.run(source, from_pipeline)

        # Same filter applied without any pipeline.
        from_call = biallelic_flt(source)
        gt_path = '/calls/GT'
        assert numpy.allclose(from_pipeline[gt_path],
                              from_call[FLT_VARS][gt_path])
Beispiel #40
0
    def test_fieldpath(self):
        """Run an IsVariableAnnotator followed by a FieldValueFilter on the
        annotated info field and check the number of variations left."""
        annot_id = 'test'
        info_path = '/variations/info/{}'.format(annot_id)
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        # The filter reads the field that the annotator is given as id.
        steps = [
            IsVariableAnnotator(annot_id=annot_id,
                                samples=['1_14_1_gbs', '1_17_1_gbs']),
            FieldValueFilter(field_path=info_path, value=0),
        ]
        pipe = Pipeline()
        for step in steps:
            pipe.append(step)

        kept_vars = VariationsArrays()
        pipe.run(source, kept_vars)
        assert kept_vars.num_variations == 484
Beispiel #41
0
    def test_set_to_missing(self):
        """Check copy_setting_gts_to_missing on the RIL file (number of
        changed genotypes) and on a toy array with a fixed random seed."""
        orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        # Named noisy_vars, not vars, so the builtin is not shadowed.
        noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                                 gt_rate_to_missing=0.9)

        orig_gts = orig_vars[GT_FIELD][...]
        noise_gts = noisy_vars[GT_FIELD]
        assert orig_gts.shape == noise_gts.shape
        mask_different_gts = orig_gts != noise_gts
        expected_num_gts_set_to_missing = int(
            round(numpy.sum(orig_gts != MISSING_INT) * 0.9))
        assert expected_num_gts_set_to_missing == mask_different_gts.sum()
        # None of the changed items was already missing in the original.
        assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)

        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            snps = VariationsArrays(ignore_undefined_fields=True)
            snps.put_vars(vcf)
        # The seed is fixed before the second copy, whose result is compared
        # with a hard-coded expected array.
        numpy.random.seed(1)
        gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                           [[0, 0], [0, 1], [-1, 0]],
                           [[-1, 2], [2, 1], [-1, 2]],
                           [[0, 0], [-1, 0], [1, 0]],
                           [[0, 1], [-1, 2], [1, 1]]])

        expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                    [[-1, -1], [0, 1], [-1, 0]],
                                    [[-1, 2], [2, 1], [-1, 2]],
                                    [[0, 0], [-1, 0], [-1, -1]],
                                    [[-1, -1], [-1, 2], [-1, -1]]])
        # Replace the genotypes parsed from the VCF with the toy ones.
        del snps[GT_FIELD]
        snps[GT_FIELD] = gts
        noisy_vars = copy_setting_gts_to_missing(snps,
                                                 gt_rate_to_missing=0.5)

        noise_gts = noisy_vars[GT_FIELD]
        assert numpy.all(noise_gts == expected_gts)
Beispiel #42
0
    def test_kosman_pairwise_between_pops_by_chunk(self):
        """Check calc_pairwise_distances_between_pops for three choices of
        the two sample groups, always with chunk_size=None."""
        a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                         [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                         [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        # Stack the four (11, 2) arrays and swap the first two axes, so the
        # final shape is (11, 4, 2).
        gts = numpy.stack((a, b, c, d), axis=0)
        gts = numpy.transpose(gts, axes=(1, 0, 2)).astype(numpy.int16)
        variations = VariationsArrays()
        variations.samples = [1, 2, 3, 4]
        variations['/calls/GT'] = gts

        # Every sample against every sample.
        expected = [[0., 0.33333333, 0.75, 0.75], [0.33333333, 0., 0.45, 0.45],
                    [0.75, 0.45, 0., 0.], [0.75, 0.45, 0., 0.]]
        distance = calc_pairwise_distances_between_pops(
            variations,
            chunk_size=None,
            min_num_snps=1,
            pop1_samples=[1, 2, 3, 4],
            pop2_samples=[1, 2, 3, 4])
        assert numpy.allclose(distance, expected)

        # Only the first sample against every sample.
        expected = [[0., 0.33333333, 0.75, 0.75]]
        distance = calc_pairwise_distances_between_pops(
            variations,
            chunk_size=None,
            min_num_snps=1,
            pop1_samples=[1],
            pop2_samples=[1, 2, 3, 4])
        assert numpy.allclose(distance, expected)

        # Two disjoint groups.  The expected values are the matching
        # sub-matrix of the first case.
        expected = [[0.75, 0.75], [0.45, 0.45]]
        distance = calc_pairwise_distances_between_pops(variations,
                                                        chunk_size=None,
                                                        min_num_snps=1,
                                                        pop1_samples=[1, 2],
                                                        pop2_samples=[3, 4])
        # This result was computed but never checked before.
        assert numpy.allclose(distance, expected)
Beispiel #43
0
    def test_filter_samples(self):
        """SampleFilter on the first 20 samples must give the same genotypes
        when run through a Pipeline and when called directly."""
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        sample_flt = SampleFilter(source.samples[:20])

        pipe = Pipeline()
        pipe.append(sample_flt)
        from_pipeline = VariationsArrays()
        pipe.run(source, from_pipeline)

        # Same filter applied without any pipeline.
        from_call = sample_flt(source)
        gt_path = '/calls/GT'
        assert numpy.allclose(from_pipeline[gt_path],
                              from_call[FLT_VARS][gt_path])
Beispiel #44
0
    def test_calc_maf_depth_distribs_per_sample(self):
        """Check calc_maf_depth_distribs_per_sample with empty fields, with
        a toy data set and with the RIL file, chunked or not."""
        # Empty AO and RO fields: no distributions and no bins.
        empty_vars = VariationsArrays()
        for path in ('/calls/AO', '/calls/RO'):
            empty_vars[path] = numpy.array([])
        distribs, bins = calc_maf_depth_distribs_per_sample(empty_vars,
                                                            chunk_size=None)
        assert distribs is None
        assert bins is None

        # One variation, three samples.
        toy_vars = VariationsArrays()
        toy_vars['/calls/AO'] = numpy.array([[[0, 0], [0, 0], [15, -1]]])
        toy_vars['/calls/RO'] = numpy.array([[10, 5, 15]])
        toy_vars.samples = list(range(3))
        toy_distribs, _ = calc_maf_depth_distribs_per_sample(toy_vars,
                                                             n_bins=4,
                                                             min_depth=6,
                                                             chunk_size=None)
        assert numpy.all(toy_distribs == [[0, 0, 0, 1],
                                          [0, 0, 0, 0],
                                          [0, 0, 1, 0]])

        # The result must not change between chunk_size=None and the
        # default chunk size.
        ril_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        unchunked, _ = calc_maf_depth_distribs_per_sample(ril_vars,
                                                          min_depth=6,
                                                          chunk_size=None)
        default_chunks, _ = calc_maf_depth_distribs_per_sample(ril_vars,
                                                               min_depth=6)
        assert numpy.all(unchunked == default_chunks)
Beispiel #45
0
    def test_calc_called_gts_distribution_per_depth(self):
        """Check calc_called_gts_distrib_per_depth on the RIL file with two
        chunk sizes and on a one-variation toy data set."""
        ril_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        by_chunks, _ = calc_called_gts_distrib_per_depth(ril_vars,
                                                         depths=range(30),
                                                         chunk_size=10)
        assert by_chunks[1, 1] == 1
        at_once, _ = calc_called_gts_distrib_per_depth(ril_vars,
                                                       depths=range(30),
                                                       chunk_size=None)
        assert numpy.all(by_chunks == at_once)

        # One variation with ten samples.
        toy_vars = VariationsArrays()
        toy_vars['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [0, 1], [0, 0],
                                              [0, 1], [0, 0], [0, 0], [0, 1],
                                              [1, 1], [0, 0]]])
        toy_vars['/calls/DP'] = numpy.array([[10, 5, 15, 7, 10,
                                              0, 0, 25, 20, 10]])
        toy_vars.samples = list(range(10))
        toy_dist, _ = calc_called_gts_distrib_per_depth(
            toy_vars, depths=[0, 5, 10, 30])

        # One row per requested depth, each with a single 1 at the given
        # column.
        expected = [[int(col == hot_col) for col in range(10)]
                    for hot_col in (9, 7, 5, 0)]
        assert numpy.all(toy_dist == expected)
    def test_calc_called_gts_distribution_per_depth(self):
        """Check calc_called_gts_distrib_per_depth on the RIL file with two
        chunk sizes and on a one-variation toy data set."""
        ril_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        by_chunks, _ = calc_called_gts_distrib_per_depth(ril_vars,
                                                         depths=range(30),
                                                         chunk_size=10)
        assert by_chunks[1, 1] == 1
        at_once, _ = calc_called_gts_distrib_per_depth(ril_vars,
                                                       depths=range(30),
                                                       chunk_size=None)
        assert numpy.all(by_chunks == at_once)

        # One variation with ten samples.
        toy_vars = VariationsArrays()
        toy_vars['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [0, 1], [0, 0],
                                              [0, 1], [0, 0], [0, 0], [0, 1],
                                              [1, 1], [0, 0]]])
        toy_vars['/calls/DP'] = numpy.array([[10, 5, 15, 7, 10,
                                              0, 0, 25, 20, 10]])
        toy_vars.samples = list(range(10))
        toy_dist, _ = calc_called_gts_distrib_per_depth(
            toy_vars, depths=[0, 5, 10, 30])

        # One row per requested depth, each with a single 1 at the given
        # column.
        expected = [[int(col == hot_col) for col in range(10)]
                    for hot_col in (9, 7, 5, 0)]
        assert numpy.all(toy_dist == expected)
    def test_calc_maf_depth_distribs_per_sample(self):
        """Check calc_maf_depth_distribs_per_sample with empty fields, with
        a toy data set and with the RIL file, chunked or not."""
        # Empty AO and RO fields: no distributions and no bins.
        empty_vars = VariationsArrays()
        for path in ('/calls/AO', '/calls/RO'):
            empty_vars[path] = numpy.array([])
        distribs, bins = calc_maf_depth_distribs_per_sample(empty_vars,
                                                            chunk_size=None)
        assert distribs is None
        assert bins is None

        # One variation, three samples.
        toy_vars = VariationsArrays()
        toy_vars['/calls/AO'] = numpy.array([[[0, 0], [0, 0], [15, -1]]])
        toy_vars['/calls/RO'] = numpy.array([[10, 5, 15]])
        toy_vars.samples = list(range(3))
        toy_distribs, _ = calc_maf_depth_distribs_per_sample(toy_vars,
                                                             n_bins=4,
                                                             min_depth=6,
                                                             chunk_size=None)
        assert numpy.all(toy_distribs == [[0, 0, 0, 1],
                                          [0, 0, 0, 0],
                                          [0, 0, 1, 0]])

        # The result must not change between chunk_size=None and the
        # default chunk size.
        ril_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        unchunked, _ = calc_maf_depth_distribs_per_sample(ril_vars,
                                                          min_depth=6,
                                                          chunk_size=None)
        default_chunks, _ = calc_maf_depth_distribs_per_sample(ril_vars,
                                                               min_depth=6)
        assert numpy.all(unchunked == default_chunks)
Beispiel #48
0
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Copy ``variations`` keeping only the selected sample columns.

    ``sample_cols`` is either a boolean mask (one flag per sample) or a
    collection of integer sample indexes.  Every field whose path contains
    'calls' is indexed along its second axis with the mask; every other
    field is copied as it is.  When ``reverse`` is true the selection is
    inverted.  The result is written into ``filtered_vars`` (a new
    VariationsArrays when None is given), which is also returned with its
    ``metadata`` and ``samples`` set.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples
    try:
        dtype = sample_cols.dtype
        # numpy.bool_ instead of numpy.bool: the latter alias does not exist
        # in NumPy 1.24-1.26.
        is_bool = numpy.issubdtype(dtype, numpy.bool_)
    except AttributeError:
        # No dtype attribute: a plain Python collection, so the type of its
        # first item decides whether it is a mask or a list of indexes.
        item = first(iter(sample_cols))
        is_bool = isinstance(item, bool)
    if not is_bool:
        # Expand the indexes into one boolean flag per sample.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if 'shape' not in dir(sample_cols):
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix
    filtered_vars.metadata = variations.metadata
    filtered_vars.samples = [samples[idx]
                             for idx, keep in enumerate(sample_cols)
                             if keep]
    return filtered_vars
Beispiel #49
0
    def test_snp_qual(self):
        """SNPQualFilter must give the same histogram and genotypes when run
        through a Pipeline and when called directly."""
        qual_flt = SNPQualFilter(min_qual=100, max_qual=50000,
                                 do_histogram=True)
        source = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        pipe = Pipeline()
        pipe.append(qual_flt)
        from_pipeline = VariationsArrays()
        piped_result = pipe.run(source, from_pipeline)

        # Same filter applied without any pipeline.  The pipeline result is
        # looked up under the key '0'.
        direct_result = qual_flt(source)
        for key in ('counts', 'edges'):
            assert numpy.allclose(piped_result['0'][key], direct_result[key])
        gt_path = '/calls/GT'
        assert numpy.allclose(from_pipeline[gt_path],
                              direct_result[FLT_VARS][gt_path])
Beispiel #50
0
 def test_kosman_pairwise(self):
     """Check the 'kosman' distances of _IndiPairwiseCalculator on a toy
     genotype array built from four (11, 2) arrays."""
     first_gts = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0],
                              [0, 1], [0, 1], [0, 1], [0, 0], [0, 0],
                              [0, 1]])
     second_gts = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1],
                               [0, 1], [1, 0], [1, 0], [1, 0], [0, 1],
                               [1, 2]])
     # The last two arrays are identical and filled with ones.
     all_ones = [numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
                 for _ in range(2)]
     stacked = numpy.stack([first_gts, second_gts] + all_ones, axis=0)
     # Swap the first two axes: the final shape is (11, 4, 2).
     gts = numpy.transpose(stacked, axes=(1, 0, 2)).astype(numpy.int16)

     varis = VariationsArrays()
     varis[GT_FIELD] = gts

     calculator = _IndiPairwiseCalculator()
     abs_dist, n_snps = calculator.calc_dist(varis, method='kosman')
     assert numpy.allclose(abs_dist / n_snps,
                           [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.])
Beispiel #51
0
    def test_calc_r2_windows(self):
        """Check _calc_r2 on a toy genotype array and calc_r2_windows on the
        same genotypes with chrom and pos fields."""
        gts = numpy.array([[[0, 0], [1, 1], [0, 0]],
                           [[0, 0], [1, 1], [0, 0]],
                           [[1, 1], [0, 0], [1, 1]],
                           [[0, 0], [0, 1], [-1, -1]]])
        variations = VariationsArrays()
        variations['/variations/chrom'] = numpy.array([b'chr1'] * 4)
        variations['/variations/pos'] = numpy.array([1, 4, 6, 20])
        variations['/calls/GT'] = gts

        pairwise_r2 = [1.0, 1.0000002, 1.0, 1.0000002, 1.0, 1.0]
        assert numpy.allclose(_calc_r2(gts), pairwise_r2)

        # The second positional argument is 10; the returned positions are
        # not checked.
        win_chrom, _, win_r2 = calc_r2_windows(variations, 10)
        assert numpy.allclose(win_r2, [1.0000002384185933, numpy.nan],
                              equal_nan=True)
        assert numpy.all(win_chrom == b'chr1')
Beispiel #52
0
    def test_empty_pop(self):
        """Check the 'dest' distance of calc_pop_distance when the first
        population has only missing genotypes in one or in all the
        variations."""
        missing = (-1, -1)
        # Genotypes of the samples 6 to 11, shared by every variation.
        pop2_gts = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)]
        # Variation in which the samples 1 to 5 are all missing.
        pop1_missing = [missing] * 5 + pop2_gts

        def dest_distances(gts):
            # Build the variations with eleven samples split in two
            # populations and return the distances between them.
            snps = VariationsArrays()
            snps['/calls/GT'] = numpy.array(gts)
            snps.samples = list(range(1, 12))
            return calc_pop_distance(snps,
                                     populations=[list(range(1, 6)),
                                                  list(range(6, 12))],
                                     method='dest',
                                     min_num_genotypes=0)

        some_missing = [
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + pop2_gts,
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + pop2_gts,
            pop1_missing,
        ]
        assert numpy.allclose(dest_distances(some_missing), [0.65490196])

        # With the first population missing everywhere the distance is NaN.
        all_missing = [pop1_missing, pop1_missing, pop1_missing]
        assert numpy.isnan(dest_distances(all_missing)[0])
Beispiel #53
0
 def test_vcf_detect_fields(self):
     """Check the metadata obtained when the VCF is parsed with
     kept_fields or with ignored_fields set to '/variations/qual'."""
     fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # Both handles are closed by the with statement, also when the parsing
     # or one of the assertions fails.
     with open(fpath, 'rb') as vcf_fhand, open(fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000,
                         kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, pre_read_max_size=1000,
                          ignored_fields=['/variations/qual'])
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata
         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata
         assert '/calls/HQ' in metadata.keys()
         assert '/variations/qual' not in metadata2.keys()
Beispiel #54
0
    def test_put_vars_from_csv(self):
        """Load two CSV genotype files with CSVParser, one into a
        VariationsH5 and one into a VariationsArrays, and check the chrom,
        pos, ref/alt and GT fields that were stored."""
        # First file: parsed with the allele splitter returned by
        # create_iupac_allele_splitter and stored in an HDF5 file.
        fhand_ex = open(join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt'), 'rb')
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
        parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                           first_gt_column=3, sep=b'\t',
                           gt_splitter=create_iupac_allele_splitter(),
                           max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})

        with NamedTemporaryFile(suffix='.h5') as fhand:
            # The temporary file is removed before VariationsH5 opens the
            # same path in 'w' mode -- presumably so that the HDF5 file is
            # created from scratch; TODO confirm.
            os.remove(fhand.name)
            h5 = VariationsH5(fhand.name, mode='w', ignore_overflows=True,
                              ignore_undefined_fields=True)
            h5.put_vars(parser)
            exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
            assert list(h5['/variations/chrom'][:]) == exp
            alleles = list(zip(h5['/variations/ref'],
                           [alts[0] for alts in h5['/variations/alt']]))
            exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
            # The alleles are compared as sets, so ref and alt may be
            # swapped.
            for als, aexp in zip(alleles, exp):
                assert set(als) == set(aexp)
            assert list(h5['/variations/pos'][:]) == [331954, 681961,
                                                      1511764]
            # exp2 is exp1 with the allele codes 0 and 1 swapped; each
            # genotype must match one of them as a set.
            exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [1, 0]]])
            exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[1, 1], [1, 1], [0, 1]]])

            for gts, exp_gts1, exp_gts2 in zip(h5['/calls/GT'][:], exp1, exp2):
                for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                    assert set(gt) == set(ex1) or set(gt) == set(ex2)

        # Remove the HDF5 file if it is still there after the with block.
        if os.path.exists(fhand.name):
            os.remove(fhand.name)
        fhand_ex.close()

        # Second file: parsed without a gt_splitter argument and stored in
        # memory.
        fhand_ex = open(join(TEST_DATA_DIR, 'csv',
                             'two_letter_coding_ex3.txt'), 'rb')
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}
        parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                           first_gt_column=3, sep=b'\t',
                           max_field_lens={'alt': 1},
                           max_field_str_lens={'alt': 1, 'chrom': 20,
                                               'ref': 1})

        # Despite its name, h5 is a VariationsArrays here.
        h5 = VariationsArrays(ignore_overflows=True,
                              ignore_undefined_fields=True)
        h5.put_vars(parser)
        exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
        assert list(h5['/variations/chrom'][:]) == exp
        alleles = list(zip(h5['/variations/ref'],
                       [alts[0] for alts in h5['/variations/alt']]))
        exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
        for als, aexp in zip(alleles, exp):
            assert set(als) == set(aexp)
        assert list(h5['/variations/pos'][:]) == [331954, 681961,
                                                  1511764]
        exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                            [[0, 0], [0, 0], [-1, -1]],
                            [[0, 0], [0, 0], [1, 0]]])
        exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                            [[0, 0], [0, 0], [-1, -1]],
                            [[1, 1], [1, 1], [0, 1]]])
        for gts, exp_gts1, exp_gts2 in zip(h5['/calls/GT'][:], exp1, exp2):
            for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                assert set(gt) == set(ex1) or set(gt) == set(ex2)
        fhand_ex.close()
    def test_create_hdf5_with_chunks(self):
        """Check put_chunks fed by iterate_chunks: full copy, copy of the
        kept fields only, random sampling and empty chunks."""
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        # The temporary file is closed right away; only its path is used.
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks())
            assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Only the GT field is kept.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
            assert list(hdf5_2['calls'].keys()) == ['GT']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Random sampling of the variations.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
        _, prob = scipy.stats.ttest_ind(hdf5['/variations/pos'][:],
                                        hdf5_2['/variations/pos'][:])
        assert prob > 0.05
        # The deviation from the requested rate is checked in both
        # directions: without abs() a far too small sample (even an empty
        # one) would pass.
        sampled_rate = hdf5_2.num_variations / hdf5.num_variations
        assert abs(sampled_rate - 0.2) < 0.1
        # The first sampled variation must have the genotypes it has in the
        # original file.
        chrom = hdf5_2['/variations/chrom'][0]
        pos = hdf5_2['/variations/pos'][0]
        index = PosIndex(hdf5)
        idx = index.index_pos(chrom, pos)
        old_snp = hdf5['/calls/GT'][idx]
        new_snp = hdf5_2['/calls/GT'][0]
        assert numpy.all(old_snp == new_snp)

        # putting empty chunks
        hdf5_2.put_chunks(None)
        hdf5_2.put_chunks([])
        chunk = hdf5.get_chunk(slice(1000, None))
        hdf5_2.put_chunks([chunk])

        old_snp = hdf5['/calls/DP'][idx]
        new_snp = hdf5_2['/calls/DP'][0]
        assert numpy.all(old_snp == new_snp)

        # A sample rate of 0 keeps no variation.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
        assert hdf5_2.num_variations == 0

        # A very low sample rate only has to run without raising.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_3 = VariationsArrays()
        hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
Beispiel #56
0
def stats_missing_rate_from_hdf5_memory():
    """Run the missing-GT calculator over an in-memory copy of the GT calls.

    Opens the 'inca_torvum_all_snps.h5' performance file read-only, copies
    only its '/calls/GT' field into a VariationsArrays, and passes that
    in-memory object to calc_stat_by_chunk together with a
    _MissingGTCalculator.  The result of the calculation is discarded.
    """
    h5_path = join(TEST_DATA_DIR, 'performance', 'inca_torvum_all_snps.h5')
    on_disk = VariationsH5(h5_path, mode='r')
    in_memory = VariationsArrays()
    # Only the genotype calls are transferred to memory.
    gt_only_chunks = on_disk.iterate_chunks(kept_fields=['/calls/GT'])
    in_memory.put_chunks(gt_only_chunks)
    calc_stat_by_chunk(in_memory, _MissingGTCalculator())
# Example #57
# 0
    def test_write_vcf(self):
        """Check that write_vcf only emits lines found in the expected VCFs.

        Two sources are written as VCFv4.0: a VariationsArrays filled from
        format_def_exp.vcf (all fields available) and a VariationsH5 filled
        from format_def_without_info.vcf (no INFO fields).  Every written
        line must be present in the matching expected file; line order and
        expected lines that are absent from the output are not checked.
        """
        # Local import: only needed to clean up the temporary HDF5 file.
        import os

        def unexpected_lines(variations, expected_fname):
            # Write `variations` to a temporary VCF and return the written
            # lines that do not appear in the expected file.
            with open(join(TEST_DATA_DIR, expected_fname), 'r') as exp_fhand:
                exp_lines = list(exp_fhand)
            with NamedTemporaryFile(mode='wb') as out_fhand:
                write_vcf(variations, out_fhand, vcf_format='VCFv4.0')
                # The output is re-read through a second handle opened by
                # name, so the buffered data must reach the file first.
                out_fhand.flush()
                with open(out_fhand.name) as written_fhand:
                    return [line for line in written_fhand
                            if line not in exp_lines]

        # With all fields available
        max_field_lens = {'CALLS': {b'GT': 1, b'HQ': 2, b'DP': 1, b'GQ': 1},
                          'FILTER': 1,
                          'INFO': {b'AA': 1, b'AF': 2, b'DP': 1,
                                   b'DB': 1, b'NS': 1, b'H2': 1}, 'alt': 2}
        max_field_str_lens = {'INFO': {b'AA': 1}, 'alt': 5, 'chrom': 2,
                              'ref': 4, 'id': 10, 'FILTER': 0}

        variations = VariationsArrays(ignore_undefined_fields=True)
        vcf_fpath = join(TEST_DATA_DIR, 'format_def_exp.vcf')
        with open(vcf_fpath, 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                            pre_read_max_size=10000)
            variations.put_vars(vcf, max_field_lens=max_field_lens,
                                max_field_str_lens=max_field_str_lens)

        # Collect every mismatch before failing so that the report shows all
        # offending lines at once instead of printing and passing silently.
        unexpected = unexpected_lines(variations, 'format_def_exp.vcf')
        assert not unexpected, 'Unexpected VCF lines: %r' % unexpected

        # With missing info in variations
        max_field_lens = {'INFO': {}, 'CALLS': {b'GQ': 1, b'GT': 1, b'HQ': 2,
                                                b'DP': 1},
                          'FILTER': 1, 'alt': 2}
        max_field_str_lens = {'ref': 4, 'INFO': {}, 'id': 10, 'FILTER': 0,
                              'alt': 5, 'chrom': 2}

        # The temporary file is only used to obtain a free path for the HDF5.
        tmp_fhand = NamedTemporaryFile()
        out_fpath = tmp_fhand.name
        tmp_fhand.close()
        try:
            vcf_fpath = join(TEST_DATA_DIR, 'format_def_without_info.vcf')
            with open(vcf_fpath, 'rb') as vcf_fhand:
                vcf = VCFParser(vcf_fhand)
                h5_without_info = VariationsH5(fpath=out_fpath, mode='w',
                                               ignore_undefined_fields=True)
                h5_without_info.put_vars(
                    vcf, max_field_lens=max_field_lens,
                    max_field_str_lens=max_field_str_lens)
            unexpected = unexpected_lines(h5_without_info,
                                          'format_def_without_info_exp.vcf')
        finally:
            # Best-effort cleanup: a failure to delete the temporary HDF5
            # file (e.g. it was never created) must not hide the test result.
            try:
                os.remove(out_fpath)
            except OSError:
                pass
        assert not unexpected, 'Unexpected VCF lines: %r' % unexpected