Beispiel #1
0
    def test_calculate_hwe(self):
        """Exercise calc_hwe_chi2_test on empty, in-memory and HDF5 inputs."""
        # Empty genotype and alt matrices must give an empty result.
        empty_vars = VariationsArrays()
        no_data = numpy.array([])
        empty_vars['/calls/GT'] = no_data
        empty_vars['/variations/alt'] = no_data
        hwe = calc_hwe_chi2_test(empty_vars, min_num_genotypes=0,
                                 chunk_size=None)
        assert hwe.shape[0] == 0

        # Two small hand-written variations with a known expected result.
        first_snp = [[0, 0], [0, 1], [0, 1], [0, 0], [0, 1], [0, 0],
                     [0, 0], [0, 1], [1, 1], [0, 0]]
        second_snp = [[0, 0], [1, 0], [0, 1], [0, 0], [0, 1], [0, 0],
                      [0, 0], [1, 0], [1, 1], [0, 0]]
        small_vars = VariationsArrays()
        small_vars['/calls/GT'] = numpy.array([first_snp, second_snp])
        small_vars._create_matrix('/variations/alt', shape=(1, 1),
                                  dtype=numpy.int16, fillvalue=0)
        expected_row = [1.25825397e+01, 1.85240619e-03]
        expected_hwe = numpy.array([expected_row, expected_row])
        hwe = calc_hwe_chi2_test(small_vars, min_num_genotypes=0,
                                 chunk_size=None)
        assert numpy.allclose(hwe, expected_hwe)

        # chunk_size=None and the default chunk size must agree on the
        # HDF5 test file (NaNs are compared as equal).
        ril_fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hwe_no_chunks = calc_hwe_chi2_test(VariationsH5(ril_fpath, mode='r'),
                                           chunk_size=None)
        hwe_default = calc_hwe_chi2_test(VariationsH5(ril_fpath, mode='r'))
        assert numpy.allclose(hwe_no_chunks, hwe_default, equal_nan=True)
Beispiel #2
0
 def test_sort_variations(self):
     """Sort variations loaded from a TSV file and check chrom/pos order.

     The input file is opened in a ``with`` block so the handle is closed
     even when parsing or one of the assertions fails.
     """
     # Chromosome and position for every SNP id handed to the CSV parser.
     var_info = {
         b'solcap_snp_sl_15058': {
             'chrom': b'chrom2',
             'pos': 345
         },
         b'solcap_snp_sl_60635': {
             'chrom': b'chrom1',
             'pos': 346
         },
         b'solcap_snp_sl_60604': {
             'chrom': b'chrom1',
             'pos': 325
         }
     }
     tsv_fpath = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
     with open(tsv_fpath, 'rb') as fhand:
         parser = CSVParser(fhand, var_info, first_sample_column=1,
                            sep=b'\t')
         variations = VariationsArrays(ignore_undefined_fields=True)
         variations.put_vars(parser)
         sorted_vars = VariationsArrays()
         sort_variations(variations, sorted_vars)
         exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
         exp_pos = [325, 346, 345]
         assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
         assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
Beispiel #3
0
    def test_matching_pairwise_by_chunk(self):
        """Check the 'matching' pairwise distance with and without chunks."""
        first = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0],
                             [0, 1], [0, 1], [0, 1], [0, 0], [0, 0],
                             [0, 1]])
        second = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1],
                              [0, 1], [1, 0], [1, 0], [1, 0], [0, 1],
                              [1, 1]])
        third = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        fourth = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
        # The four arrays are stacked on axis 0 and the first two axes are
        # then swapped before storing the result as the GT matrix.
        stacked = numpy.stack((first, second, third, fourth), axis=0)
        swapped = numpy.transpose(stacked, axes=(1, 0, 2))
        variations = VariationsArrays()
        variations['/calls/GT'] = swapped.astype(numpy.int16)

        expected = [0.444444, 0, 0, 0.3, 0.3, 1]
        for chunk_size in (None, 2):
            distance = calc_pairwise_distance(variations,
                                              chunk_size=chunk_size,
                                              method='matching')
            assert numpy.allclose(distance, expected)

        # With all missing
        all_missing = numpy.full(shape=(10, 2), fill_value=-1,
                                 dtype=numpy.int16)
        stacked = numpy.stack((all_missing, all_missing), axis=0)
        swapped = numpy.transpose(stacked, axes=(1, 0, 2))
        variations = VariationsArrays()
        variations['/calls/GT'] = swapped.astype(numpy.int16)
        distance = calc_pairwise_distance(variations, method='matching')
        assert numpy.isnan(distance[0])
Beispiel #4
0
    def test_allele_observation_based_maf(self):
        """Check calc_allele_observation_based_maf on empty and toy data."""
        # An empty allele depth matrix yields no MAF values.
        empty_vars = VariationsArrays()
        empty_vars[AD_FIELD] = numpy.array([])
        maf = calc_allele_observation_based_maf(empty_vars, chunk_size=None)
        assert not list(maf)

        # For every SNP: first row holds the allele observations in
        # sample1, second row the allele observations in sample2.
        snp1_depths = [[10, 0, 1], [4, 6, 1]]
        snp2_depths = [[10, 0, 0], [0, 5, 7]]
        snp3_depths = [[-1, -1, -1], [-1, -1, -1]]

        varis = VariationsArrays()
        varis[AD_FIELD] = numpy.array([snp1_depths, snp2_depths,
                                       snp3_depths])
        expected = [0.63636364, 0.45454545, numpy.nan]
        # The result must be the same without chunks and with 1-SNP chunks.
        for chunk_size in (None, 1):
            maf = calc_allele_observation_based_maf(varis,
                                                    chunk_size=chunk_size)
            assert numpy.allclose(maf, expected, equal_nan=True)
Beispiel #5
0
    def test_create_hdf5_with_chunks(self):
        """Copy variations chunk by chunk into HDF5 files and arrays.

        Covers a full copy, a copy restricted with ``kept_fields``, random
        subsampling with ``random_sample_rate`` and putting empty chunks.
        """
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        # The temporary file is only used to reserve a path: it is closed
        # right away and the path is handed to VariationsH5 in write mode.
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks())
            assert sorted(hdf5_2['calls'].keys()) == ['DP', 'GQ', 'GT', 'HQ']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Same copy, but keeping only the genotype field.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        out_fhand = NamedTemporaryFile(suffix='.hdf5')
        out_fpath = out_fhand.name
        out_fhand.close()
        hdf5_2 = VariationsH5(out_fpath, 'w')
        try:
            hdf5_2.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
            assert list(hdf5_2['calls'].keys()) == ['GT']
            assert numpy.all(hdf5['/calls/GT'][:] == hdf5_2['/calls/GT'][:])
        finally:
            os.remove(out_fpath)

        # Random subsampling into an in-memory container.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.2))
        _, prob = ttest_ind(hdf5['/variations/pos'][:],
                            hdf5_2['/variations/pos'][:])
        assert prob > 0.05
        # The sampled fraction has to be close to the requested rate in
        # both directions; without abs() a far too small sample would pass.
        sampled_fraction = hdf5_2.num_variations / hdf5.num_variations
        assert abs(sampled_fraction - 0.2) < 0.1
        # The first sampled SNP must match the SNP found at the same
        # chrom/pos in the source file.
        chrom = hdf5_2['/variations/chrom'][0]
        pos = hdf5_2['/variations/pos'][0]
        index = PosIndex(hdf5)
        idx = index.index_pos(chrom, pos)
        old_snp = hdf5['/calls/GT'][idx]
        new_snp = hdf5_2['/calls/GT'][0]
        assert numpy.all(old_snp == new_snp)

        # putting empty chunks
        hdf5_2.put_chunks(None)
        hdf5_2.put_chunks([])
        chunk = hdf5.get_chunk(slice(1000, None))
        hdf5_2.put_chunks([chunk])

        old_snp = hdf5['/calls/DP'][idx]
        new_snp = hdf5_2['/calls/DP'][0]
        assert numpy.all(old_snp == new_snp)

        # A sample rate of 0 must keep no variation at all.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, '1000snps.hdf5'), mode='r')
        hdf5_2 = VariationsArrays()
        hdf5_2.put_chunks(hdf5.iterate_chunks(random_sample_rate=0))
        assert hdf5_2.num_variations == 0

        # A very low sample rate only has to run without raising.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        hdf5_3 = VariationsArrays()
        hdf5_3.put_chunks(hdf5.iterate_chunks(random_sample_rate=0.01))
Beispiel #6
0
    def test_calc_distrib_for_sample(self):
        """Check per-sample field distributions, with and without masks."""
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(hdf5.iterate_chunks())

        # The same sample, field and bin count are used for all three runs.
        sample_kwargs = {'field': '/calls/DP',
                         'sample': '1_17_1_gbs',
                         'n_bins': 15}
        from_h5, _ = calc_field_distrib_for_a_sample(hdf5, **sample_kwargs)
        assert from_h5.shape == (15,)

        unchunked, _ = calc_field_distrib_for_a_sample(in_memory,
                                                       chunk_size=None,
                                                       **sample_kwargs)
        assert numpy.all(from_h5 == unchunked)

        chunked, _ = calc_field_distrib_for_a_sample(in_memory,
                                                     chunk_size=50,
                                                     **sample_kwargs)
        assert numpy.all(chunked == unchunked)

        # Small hand-written data set: 2 variations x 3 samples.
        toy = VariationsArrays()
        toy['/calls/DP'] = numpy.array([[10, 5, 15],
                                        [0, 15, 10]])
        toy['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                        [[0, 0], [0, 1], [1, 1]]])
        toy.samples = list(range(3))

        # Expected 16-bin histogram rows, one per sample, and an all-zero
        # row for the samples that a mask leaves without counted calls.
        sample0_row = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
        sample1_row = [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
        sample2_row = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
        zero_row = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        all_depths = [10, 5, 15, 0, 15, 10]

        cases = [
            ({}, [sample0_row, sample1_row, sample2_row]),
            ({'mask_field': '/calls/GT', 'mask_func': call_is_het},
             [zero_row, sample1_row, zero_row]),
            ({'mask_field': '/calls/GT', 'mask_func': call_is_hom},
             [sample0_row, zero_row, sample2_row]),
        ]
        for mask_kwargs, expected in cases:
            distrib, _ = calc_field_distribs_per_sample(toy,
                                                        field='/calls/DP',
                                                        n_bins=16,
                                                        **mask_kwargs)
            assert numpy.all(expected == distrib)
            # calc_depth must keep returning the full set of depths.
            assert numpy.all(calc_depth(toy) == all_depths)
Beispiel #7
0
    def test_by_chunks(self):
        """Load a VCF into VariationsArrays with default and 1-var chunks.

        Only checks that loading runs without raising. The VCF is opened in
        ``with`` blocks so the handles are closed even when parsing fails.
        """
        vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')

        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays()
            snps.put_vars(vcf_parser)

        # Same load, storing a single variation per chunk.
        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays(vars_in_chunk=1)
            snps.put_vars(vcf_parser)
Beispiel #8
0
    def test_num_private_alleles(self):
        """Check calc_number_of_private_alleles for several GT matrices."""
        all_missing_row = [[-1], [-1], [-1], [-1], [-1]]
        # Each case: (min_num_genotypes, genotypes, expected per population)
        cases = [
            (0,
             [[[0], [0], [0], [0], [-1]],
              [[0], [0], [1], [1], [-1]],
              [[0], [2], [0], [1], [-1]],
              all_missing_row],
             {1: [0, 1, 1, 0], 2: [0, 1, 1, 0]}),
            # No missing alleles
            (0,
             [[[0], [0], [0], [0], [1]],
              [[0], [0], [1], [1], [1]],
              [[0], [2], [0], [1], [1]],
              [[1], [1], [0], [0], [2]]],
             {1: [0, 1, 1, 1], 2: [1, 1, 1, 2]}),
            # all missing
            (0,
             [[[0], [0], [0], [-1], [-1]],
              [[0], [0], [1], [-1], [-1]],
              [[0], [2], [-1], [-1], [-1]],
              all_missing_row],
             {1: [0, 1, 2, 0], 2: [0, 1, 0, 0]}),
            # min_num_genotypes
            (2,
             [[[0], [0], [0], [0], [1]],
              [[0], [0], [1], [1], [1]],
              [[0], [2], [0], [1], [1]],
              [[1], [-1], [0], [0], [2]]],
             {1: [0, 1, 1, 0], 2: [1, 1, 1, 0]}),
        ]
        for min_num_genotypes, gts, expected in cases:
            stat_funct = partial(calc_number_of_private_alleles,
                                 min_num_genotypes=min_num_genotypes)
            varis = VariationsArrays()
            varis[GT_FIELD] = numpy.array(gts)
            varis.samples = [1, 2, 3, 4, 5]
            pops = {1: [1, 2], 2: [3, 4, 5]}
            self._check_function(stat_funct, varis, pops, expected)
Beispiel #9
0
    def test_num_alleles(self):
        """Check calc_number_of_alleles for several GT matrices and pops."""
        all_missing_row = [[-1], [-1], [-1], [-1], [-1]]
        one_pop_gts = [[[1], [-1], [0], [0], [-1]],
                       [[-1], [-1], [1], [1], [-1]],
                       [[-1], [-1], [0], [1], [-1]],
                       all_missing_row]
        # Each case: (min_num_genotypes, genotypes, populations, expected)
        cases = [
            (0,
             [[[0], [0], [0], [0], [-1]],
              [[0], [0], [1], [1], [-1]],
              [[0], [0], [0], [1], [-1]],
              all_missing_row],
             {1: [1, 2], 2: [3, 4, 5]},
             {1: [1, 1, 1, 0], 2: [1, 1, 2, 0]}),
            # a population empty
            (0,
             [[[-1], [-1], [0], [0], [-1]],
              [[-1], [-1], [1], [1], [-1]],
              [[-1], [-1], [0], [1], [-1]],
              all_missing_row],
             {1: [1, 2], 2: [3, 4, 5]},
             {1: [0, 0, 0, 0], 2: [1, 1, 2, 0]}),
            # only one pop
            (0, one_pop_gts, {1: [1, 2]}, {1: [1, 0, 0, 0]}),
            # min num genotypes
            (3, one_pop_gts, {1: [1, 2, 3, 4, 5]}, {1: [2, 0, 0, 0]}),
        ]
        for min_num_genotypes, gts, pops, expected in cases:
            stat_funct = partial(calc_number_of_alleles,
                                 min_num_genotypes=min_num_genotypes)
            varis = VariationsArrays()
            varis[GT_FIELD] = numpy.array(gts)
            varis.samples = [1, 2, 3, 4, 5]
            self._check_function(stat_funct, varis, pops, expected)
Beispiel #10
0
    def test_empty_pop(self):
        """Check the 'dest' distance when one population lacks genotypes."""
        missing = (-1, -1)
        # Calls for samples 6-11 are the same in every variation used here.
        second_pop_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), missing]
        first_pop_missing = [missing] * 5
        missing_variation = first_pop_missing + second_pop_calls
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

        def dest_distance(gts):
            # Build the variations for these genotypes and run 'dest'.
            snps = VariationsArrays()
            snps['/calls/GT'] = numpy.array(gts)
            snps.samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
            return calc_pop_distance(snps,
                                     populations=pops,
                                     method='dest',
                                     min_num_genotypes=0)

        # Only the third variation is missing in the first population.
        partially_missing = [
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + second_pop_calls,
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + second_pop_calls,
            missing_variation,
        ]
        assert numpy.allclose(dest_distance(partially_missing), [0.65490196])

        # Every variation is missing in the first population.
        fully_missing = [missing_variation, missing_variation,
                         missing_variation]
        assert numpy.isnan(dest_distance(fully_missing)[0])
Beispiel #11
0
    def test_pipeline(self):
        """Compare running a filter inside a Pipeline with calling it."""
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        def run_both_ways(flt):
            # Run the filter through a one-step pipeline and directly, and
            # check that histogram and kept genotypes agree.
            pipeline = Pipeline()
            pipeline.append(flt, id_='filter1')
            vars_out = VariationsArrays()
            piped = pipeline.run(hdf5, vars_out)['filter1']
            direct = flt(hdf5)
            assert numpy.allclose(piped['counts'], direct['counts'])
            assert numpy.allclose(piped['edges'], direct['edges'])
            assert numpy.allclose(vars_out['/calls/GT'],
                                  direct[FLT_VARS]['/calls/GT'])
            return piped, direct

        # check same result with no pipeline
        piped, direct = run_both_ways(
            MinCalledGTsFilter(min_called=0.1, range_=(0, 1)))
        for stat_key in (N_KEPT, TOT, N_FILTERED_OUT):
            assert piped[FLT_STATS][stat_key] == direct[FLT_STATS][stat_key]

        # check with no range set
        run_both_ways(MinCalledGTsFilter(min_called=0.1, do_histogram=True))

        # With rates False
        piped, _ = run_both_ways(
            MinCalledGTsFilter(min_called=20, rates=False,
                               do_histogram=True))
        assert piped['order'] == 0
Beispiel #12
0
 def test_vcf_detect_fields(self):
     """Check the metadata obtained with kept_fields and ignored_fields.

     Both handles are opened in a single ``with`` statement so they are
     closed even when parsing or one of the assertions fails.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     with open(vcf_fpath, 'rb') as vcf_fhand, \
             open(vcf_fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata
         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata
         assert '/calls/HQ' in metadata.keys()
         # The ignored field must not show up in the metadata.
         assert '/variations/qual' not in metadata2.keys()
Beispiel #13
0
    def test_nei_dist(self):
        """Check _calc_pop_pairwise_unbiased_nei_dists on small GT sets."""
        pops = [[1, 2], [3, 4]]
        expected_dist = 0.3726315908494797
        missing_variation = [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]
        informative_gts = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                           [[1, 1], [1, 2], [2, 2], [2, 1]],
                           missing_variation]

        def build_varis(gts):
            # Four samples, numbered 1 to 4, for every data set.
            varis = VariationsArrays()
            varis[GT_FIELD] = numpy.array(gts)
            varis.samples = [1, 2, 3, 4]
            return varis

        # Same distance with the default chunk size and 1-variation chunks.
        varis = build_varis(informative_gts)
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isclose(dists[0], expected_dist)
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1,
                                                      chunk_size=1)
        assert math.isclose(dists[0], expected_dist)

        # all missing
        varis = build_varis([missing_variation])
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isnan(dists[0])

        # min_num_genotypes
        varis = build_varis(informative_gts)
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1)
        assert math.isclose(dists[0], expected_dist)

        # Leaving min_num_genotypes at its default gives NaN for this data.
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      chunk_size=1)
        assert math.isnan(dists[0])
Beispiel #14
0
    def test_parse_bam(self):
        """Load variations from a BAM file and check the stored fields."""
        field_lens = {'alt': 1, 'CALLS': {b'AD': 3}}
        str_lens = {'chrom': 20}
        parser = BAMParser([join(TEST_DATA_DIR, 'example.rg.bam')],
                           kmer_size=4,
                           ploidy=2,
                           min_num_samples=2,
                           max_field_lens=field_lens,
                           max_field_str_lens=str_lens)

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(parser)

        assert snps.ploidy
        assert list(snps.chroms) == ['ref']
        # Four variations are expected, each with a reference of length 4.
        assert snps.num_variations == 4
        refs = snps[REF_FIELD]
        assert len(refs) == 4
        assert len(snps[REF_FIELD][0]) == 4

        assert list(snps[CHROM_FIELD]) == ['ref'] * 4
        assert list(snps[POS_FIELD]) == [15, 16, 17, 36]
        for call_field in (AD_FIELD, GT_FIELD):
            assert call_field in snps
Beispiel #15
0
    def test_calc_obs_het_sample(self):
        """Check calc_obs_het_by_sample on HDF5, array and dict inputs."""
        # HDF5 and in-memory copies of the same data must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het_by_sample(hdf5)
        het_array = calc_obs_het_by_sample(snps)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1]]])

        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        # numpy.nan is used because the numpy.NaN alias was removed in
        # NumPy 2.0.
        assert numpy.allclose(het, [0, 1 / 3, 0, numpy.nan], equal_nan=True)

        # An empty genotype matrix yields an empty result.
        gts = numpy.array([])
        varis = {'/calls/GT': gts}
        het = calc_obs_het_by_sample(varis, chunk_size=None)
        assert het.shape[0] == 0

        # The depth-limited calls only have to run without raising; the
        # default chunk size and chunk_size=None must then agree.
        snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        calc_obs_het_by_sample(snps, min_call_dp=3)
        calc_obs_het_by_sample(snps, min_call_dp=3, max_call_dp=20)
        het_0 = calc_obs_het_by_sample(snps)
        het = calc_obs_het_by_sample(snps, chunk_size=None)
        assert numpy.allclose(het_0, het)
Beispiel #16
0
    def test_calc_obs_het(self):
        """Check calc_obs_het per population with two depth thresholds."""
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array(
            [[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
             [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
             [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis[DP_FIELD] = numpy.array([[20, 15, 20, 20, 20],
                                       [20, 20, 20, 20, 20],
                                       [20, 20, 20, 20, 20]])
        varis.samples = [1, 2, 3, 4, 5]
        pops = {1: [1, 2], 2: [3, 4, 5]}

        # Each case: (min_call_dp, expected result per population).
        # The second case sets a depth threshold of 20.
        cases = (
            (0, {1: [0.5, 0, math.nan], 2: [0, 1., math.nan]}),
            (20, {1: [0, 0, math.nan], 2: [0, 1., math.nan]}),
        )
        for min_call_dp, expected in cases:
            thresholded_stat = partial(calc_obs_het,
                                       min_num_genotypes=1,
                                       min_call_dp=min_call_dp)
            self._check_function(thresholded_stat, varis, pops, expected)
Beispiel #17
0
    def test_report(self):
        """Create a population stats report and check the CSV is written.

        The output directory is managed by a ``with`` block so it is removed
        even when the report creation or the assertion fails.
        """
        gts = numpy.array([[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
                           [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4, 5]
        pops = {1: [1, 2], 2: [3, 4, 5]}
        violin_ylimits = {
            'observed_heterozigosity': {
                'bottom': 0,
                'top': 0.5
            }
        }

        with tempfile.TemporaryDirectory() as out_dir:
            create_pop_stats_report(varis,
                                    pops,
                                    out_dir,
                                    min_num_genotypes=1,
                                    min_call_dp_for_obs_het=0,
                                    violin_ylimits=violin_ylimits)
            stats_csv_fpath = os.path.join(out_dir, 'pop_stats.csv')
            assert os.path.exists(stats_csv_fpath)
Beispiel #18
0
    def test_calc_obs_het(self):
        """Check calc_obs_het on empty, HDF5 and hand-written inputs."""
        # Empty matrices yield an empty result.
        gts = numpy.array([])
        dps = numpy.array([])
        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert het.shape[0] == 0

        # HDF5 and in-memory copies of the same data must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        snps = VariationsArrays()
        snps.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        het_h5 = calc_obs_het(hdf5, min_num_genotypes=0)
        het_array = calc_obs_het(snps, min_num_genotypes=0)
        assert numpy.all(het_array == het_h5)

        gts = numpy.array([[[0, 0], [0, 1], [0, -1], [-1, -1]],
                           [[0, 0], [0, 0], [0, -1], [-1, -1]]])

        dps = numpy.array([[5, 12, 10, 10],
                           [10, 10, 10, 10]])

        varis = {'/calls/GT': gts, '/calls/DP': dps}
        het = calc_obs_het(varis, min_num_genotypes=0)
        assert numpy.allclose(het, [0.5, 0])

        # numpy.nan is used because the numpy.NaN alias was removed in
        # NumPy 2.0.
        het = calc_obs_het(varis, min_num_genotypes=10)
        assert numpy.allclose(het, [numpy.nan, numpy.nan], equal_nan=True)

        # Depth thresholds change which calls are taken into account.
        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=10)
        assert numpy.allclose(het, [1, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, max_call_dp=11)
        assert numpy.allclose(het, [0, 0])

        het = calc_obs_het(varis, min_num_genotypes=0, min_call_dp=5)
        assert numpy.allclose(het, [0.5, 0])
Beispiel #19
0
    def test_calc_missing_gt_rates(self):
        """Check calc_called_gt and calc_missing_gt as counts and rates."""
        # Empty genotypes give empty results, as counts and as rates.
        empty_varis = {'/calls/GT': numpy.array([])}
        for as_rates in (False, True):
            called = calc_called_gt(empty_varis, rates=as_rates)
            assert called.shape[0] == 0

        # HDF5 and in-memory copies of the same data must agree.
        hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        in_memory = VariationsArrays()
        in_memory.put_chunks(hdf5.iterate_chunks(kept_fields=['/calls/GT']))
        array_rates = calc_missing_gt(in_memory)
        h5_rates = calc_missing_gt(hdf5)
        assert array_rates.shape == (943,)
        assert numpy.allclose(array_rates, h5_rates)
        assert numpy.min(array_rates) == 0
        assert numpy.all(array_rates <= 1)

        # Four variations x two samples with a growing number of missing
        # calls.
        toy_varis = {
            '/calls/GT': numpy.array([[[0, 0], [0, 0]],
                                      [[0, 0], [-1, -1]],
                                      [[0, 0], [-1, -1]],
                                      [[-1, -1], [-1, -1]]])
        }
        called_counts = numpy.array([2, 1, 1, 0])
        assert numpy.all(
            calc_called_gt(toy_varis, rates=False) == called_counts)
        assert numpy.all(
            calc_missing_gt(toy_varis, rates=False) == 2 - called_counts)

        missing_rates = numpy.array([0, 0.5, 0.5, 1])
        assert numpy.allclose(calc_called_gt(toy_varis), 1 - missing_rates)
        assert numpy.allclose(calc_missing_gt(toy_varis), missing_rates)
Beispiel #20
0
    def test_dest_jost_distance(self):
        """Check the 'dest' population distance with several settings."""
        # Calls for samples 6-11 are the same in both variations.
        second_pop_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4),
                            (-1, -1)]
        gts = [
            [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + second_pop_calls,
            [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + second_pop_calls,
        ]

        snps = VariationsArrays()
        snps['/calls/GT'] = numpy.array(gts)
        snps.samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

        dest_kwargs = {'populations': [[1, 2, 3, 4, 5],
                                       [6, 7, 8, 9, 10, 11]],
                       'method': 'dest'}

        # Default chunk size.
        dists = calc_pop_distance(snps, min_num_genotypes=0, **dest_kwargs)
        assert numpy.allclose(dists, [0.65490196])

        # One variation per chunk must give the same distance.
        dists = calc_pop_distance(snps, min_num_genotypes=0, chunk_size=1,
                                  **dest_kwargs)
        assert numpy.allclose(dists, [0.65490196])

        # Requiring 6 genotypes leaves only NaN distances.
        dists = calc_pop_distance(snps, min_num_genotypes=6, chunk_size=1,
                                  **dest_kwargs)
        assert numpy.all(numpy.isnan(dists))
Beispiel #21
0
    def test_excel(self):
        """Write variations to Excel files with a growing set of fields.

        Only checks that write_excel runs without raising. The temporary
        files are managed by ``with`` blocks so they are closed even when
        write_excel fails.
        """
        variations = VariationsArrays()
        gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                           [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                           [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                           [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        variations[GT_FIELD] = gts
        variations.samples = list(range(gts.shape[1]))

        # genotypes only
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

        # chrom pos
        variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
        variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
        # As in the original test, the remaining writes share one handle.
        with NamedTemporaryFile(suffix='.xlsx') as fhand:
            write_excel(variations, fhand)

            # REF, ALT
            variations[REF_FIELD] = numpy.array([b'A', b'A', b'A', b'A'])
            variations[ALT_FIELD] = numpy.array([[b'T'], [b'T'], [b'T'],
                                                 [b'T']])
            write_excel(variations, fhand)

            # with classifications
            classes = [1, 1, 1, 2, 2]
            write_excel(variations, fhand, classes)
Beispiel #22
0
    def test_iterate_chroms(self):
        """Rebuild the variations from iterate_chroms and compare positions.

        The comparison is asserted; previously its result was computed and
        discarded, so the test could never fail on a mismatch.
        """
        fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hd5 = VariationsH5(fpath, mode='r')
        wins = hd5.iterate_chroms()

        hd5_2 = VariationsArrays()
        # iterate_chroms yields pairs; only the second item is stored.
        hd5_2.put_chunks([win for _, win in wins])
        assert numpy.all(hd5['/variations/pos'][:] ==
                         hd5_2['/variations/pos'][:])
Beispiel #23
0
def sample_variations(in_vars, sample_rate, out_vars=None, chunk_size=None):
    """Copy a random sample of the chunks of in_vars into out_vars.

    in_vars must provide ``iterate_chunks(chunk_size=...,
    random_sample_rate=...)``; sample_rate is passed on as the
    ``random_sample_rate`` argument and chunk_size as ``chunk_size``.
    When out_vars is None a new VariationsArrays is created to receive
    the chunks. The container that received the chunks is returned.
    """
    destination = VariationsArrays() if out_vars is None else out_vars

    sampled_chunks = in_vars.iterate_chunks(random_sample_rate=sample_rate,
                                            chunk_size=chunk_size)
    destination.put_chunks(sampled_chunks)
    return destination
Beispiel #24
0
 def test_delete_item_from_variationArray(self):
     """Deleting a matrix with ``del`` removes its path from keys()."""
     # The with block closes the VCF file even when the assertion fails;
     # a trailing close() call would be skipped in that case.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
Beispiel #25
0
    def test_iterate_wins(self):
        """Rebuild the variations from iterate_wins and compare positions."""
        fpath = join(TEST_DATA_DIR, 'ril.hdf5')
        hd5 = VariationsH5(fpath, mode='r')
        wins = hd5.iterate_wins(win_size=1000000)

        hd5_2 = VariationsArrays()
        hd5_2.put_chunks(wins)
        # The comparison has to be asserted: a bare numpy.all(...) call is
        # evaluated and thrown away, so the test could never fail.
        assert numpy.array_equal(hd5['/variations/pos'],
                                 hd5_2['/variations/pos'])
Beispiel #26
0
 def test_mat012(self):
     """Check gts_as_mat012 against a hand-written expected matrix.

     As encoded in the expected values below: [0, 0] gives 0, [0, 1]
     gives 1, [1, 1] and [2, 2] give 2, and any genotype containing -1
     gives -1.
     """
     genotypes = numpy.array([
         [[0, 0], [0, 1], [2, 2], [-1, 3]],
         [[0, 0], [0, 0], [1, 1], [2, 2]],
         [[-1, -1], [-1, -1], [-1, -1], [-1, -1]],
     ])
     variations = VariationsArrays()
     variations[GT_FIELD] = genotypes

     mat012 = variations.gts_as_mat012
     expected_mat = [[0, 1, 2, -1],
                     [0, 0, 2, 2],
                     [-1, -1, -1, -1]]
     assert numpy.allclose(mat012, expected_mat, equal_nan=True)
Beispiel #27
0
 def test_put_vars_arrays_from_vcf(self):
     """Load a VCF into VariationsArrays and spot-check the GT and GQ data."""
     # The with block closes the VCF file even when an assertion fails;
     # a trailing close() call would be skipped in that case.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         assert snps['/calls/GT'].shape == (5, 3, 2)
         assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
         # Values in the first row of the GQ matrix.
         expected = numpy.array([48, 48, 43], dtype=numpy.int16)
         assert numpy.all(snps['/calls/GQ'][0, :] == expected)
Beispiel #28
0
    def test_ignore_non_matching(self):
        """Merge two HDF5 files with ignore_non_matching=True.

        The merged result is expected to hold exactly one variation.
        """
        first_path = join(TEST_DATA_DIR, 'csv', 'format.h5')
        second_path = join(TEST_DATA_DIR, 'format_def.h5')
        first_h5 = VariationsH5(first_path, "r")
        second_h5 = VariationsH5(second_path, "r")

        merger = VarMerger(
            first_h5,
            second_h5,
            max_field_lens={'alt': 3},
            ignore_complex_overlaps=True,
            check_ref_matches=False,
            ignore_non_matching=True,
        )

        merged = VariationsArrays(ignore_undefined_fields=True)
        merged.put_vars(merger)
        assert merged.num_variations == 1
Beispiel #29
0
    def test_genome_chunk(self):
        """Request a genome chunk from a small single-chromosome data set.

        Only the call itself is exercised; its return value is not checked.
        """
        positions = numpy.array([5, 7, 8, 10, 11, 12])
        # Every position lies on the same chromosome, 'c1'.
        chrom_names = numpy.array(['c1'] * len(positions))

        varis = VariationsArrays()
        varis[POS_FIELD] = positions
        varis[CHROM_FIELD] = chrom_names

        # empty before: the requested range 1-4 lies below the smallest
        # stored position (5)
        varis.get_genome_chunk('c1', 1, 4)
Beispiel #30
0
    def test_gst_basic(self):
        """Check calc_gst_per_loci for two populations of one sample each."""
        # One row per variation, one entry per sample; -1 appears where
        # the fixture has no depth value.
        allele_depths = numpy.array([
            [[10, 3, -1], [11, 2, -1]],
            [[10, 0, -1], [10, 0, -1]],
            [[10, 10, -1], [11, 11, -1]],
            [[-1, 2, 10], [-1, 10, 2]],
        ])

        snps = VariationsArrays()
        snps.samples = [1, 2]
        snps[AD_FIELD] = allele_depths

        # Each population holds exactly one of the two samples.
        pops = [[1], [2]]
        gst = calc_gst_per_loci(snps, pops)

        expected_gst = numpy.array([0.00952381, 0, 0, 0.44444444])
        numpy.testing.assert_almost_equal(gst, expected_gst)