def test_num_private_alleles(self):
    """Check calc_number_of_private_alleles against hand-written fixtures.

    Every scenario uses five samples split into the populations
    ``{1: [1, 2], 2: [3, 4, 5]}`` and is verified through
    ``self._check_function``.
    """
    # Each entry: (min_num_genotypes, genotype calls, expected per population)
    scenarios = [
        # sample 5 is missing in every variation
        (0,
         [[[0], [0], [0], [0], [-1]],
          [[0], [0], [1], [1], [-1]],
          [[0], [2], [0], [1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [0, 1, 1, 0], 2: [0, 1, 1, 0]}),
        # no missing alleles
        (0,
         [[[0], [0], [0], [0], [1]],
          [[0], [0], [1], [1], [1]],
          [[0], [2], [0], [1], [1]],
          [[1], [1], [0], [0], [2]]],
         {1: [0, 1, 1, 1], 2: [1, 1, 1, 2]}),
        # most calls of population 2 are missing
        (0,
         [[[0], [0], [0], [-1], [-1]],
          [[0], [0], [1], [-1], [-1]],
          [[0], [2], [-1], [-1], [-1]],
          [[-1], [-1], [-1], [-1], [-1]]],
         {1: [0, 1, 2, 0], 2: [0, 1, 0, 0]}),
        # min_num_genotypes
        (2,
         [[[0], [0], [0], [0], [1]],
          [[0], [0], [1], [1], [1]],
          [[0], [2], [0], [1], [1]],
          [[1], [-1], [0], [0], [2]]],
         {1: [0, 1, 1, 0], 2: [1, 1, 1, 0]}),
    ]

    for min_genotypes, calls, expected in scenarios:
        stat_funct = partial(calc_number_of_private_alleles,
                             min_num_genotypes=min_genotypes)
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array(calls)
        varis.samples = [1, 2, 3, 4, 5]
        pops = {1: [1, 2], 2: [3, 4, 5]}
        self._check_function(stat_funct, varis, pops, expected)
def test_empty_pop(self):
    """Run the 'dest' population distance when population 1 has missing calls.

    The first fixture blanks population 1 in one variation only and expects
    the usual distance; the second blanks it in every variation and expects
    NaN.
    """
    no_call = (-1, -1)
    # Calls of samples 6-11, identical in every variation of both fixtures.
    pop2_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)]
    pop1_blank_row = [no_call] * 5 + pop2_calls

    def distance_for(rows):
        # Build the variations and compute the distance between both pops.
        snps = VariationsArrays()
        snps['/calls/GT'] = numpy.array(rows)
        snps.samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
        return calc_pop_distance(snps, populations=pops, method='dest',
                                 min_num_genotypes=0)

    partially_blank = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + pop2_calls,
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + pop2_calls,
        pop1_blank_row,
    ]
    dists = distance_for(partially_blank)
    assert numpy.allclose(dists, [0.65490196])

    fully_blank = [pop1_blank_row, pop1_blank_row, pop1_blank_row]
    dists = distance_for(fully_blank)
    assert numpy.isnan(dists[0])
def test_num_alleles(self):
    """Check calc_number_of_alleles per population on small fixtures.

    Each scenario is verified through ``self._check_function``.
    """
    all_missing = [[-1], [-1], [-1], [-1], [-1]]
    # Calls shared by the "only one pop" and "min num genotypes" scenarios.
    sparse_calls = [[[1], [-1], [0], [0], [-1]],
                    [[-1], [-1], [1], [1], [-1]],
                    [[-1], [-1], [0], [1], [-1]],
                    all_missing]

    # Each entry: (min_num_genotypes, calls, populations, expected)
    scenarios = [
        (0,
         [[[0], [0], [0], [0], [-1]],
          [[0], [0], [1], [1], [-1]],
          [[0], [0], [0], [1], [-1]],
          all_missing],
         {1: [1, 2], 2: [3, 4, 5]},
         {1: [1, 1, 1, 0], 2: [1, 1, 2, 0]}),
        # a population empty
        (0,
         [[[-1], [-1], [0], [0], [-1]],
          [[-1], [-1], [1], [1], [-1]],
          [[-1], [-1], [0], [1], [-1]],
          all_missing],
         {1: [1, 2], 2: [3, 4, 5]},
         {1: [0, 0, 0, 0], 2: [1, 1, 2, 0]}),
        # only one pop
        (0, sparse_calls, {1: [1, 2]}, {1: [1, 0, 0, 0]}),
        # min num genotypes
        (3, sparse_calls, {1: [1, 2, 3, 4, 5]}, {1: [2, 0, 0, 0]}),
    ]

    for min_genotypes, calls, pops, expected in scenarios:
        stat_funct = partial(calc_number_of_alleles,
                             min_num_genotypes=min_genotypes)
        varis = VariationsArrays()
        varis[GT_FIELD] = numpy.array(calls)
        varis.samples = [1, 2, 3, 4, 5]
        self._check_function(stat_funct, varis, pops, expected)
def test_nei_dist(self):
    """Check _calc_pop_pairwise_unbiased_nei_dists on a four-sample fixture.

    Covers the whole-matrix and chunked code paths, an all-missing input and
    the effect of leaving ``min_num_genotypes`` at its default.
    """
    missing_row = [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]
    informative_rows = [[[1, 1], [5, 2], [2, 2], [3, 2]],
                        [[1, 1], [1, 2], [2, 2], [2, 1]],
                        missing_row]
    reference_dist = 0.3726315908494797

    def build(rows):
        # Fresh variations object holding only the genotype calls.
        built = VariationsArrays()
        built[GT_FIELD] = numpy.array(rows)
        built.samples = [1, 2, 3, 4]
        return built

    varis = build(informative_rows)
    pops = [[1, 2], [3, 4]]
    for extra_kwargs in ({}, {'chunk_size': 1}):
        dists = _calc_pop_pairwise_unbiased_nei_dists(varis,
                                                      populations=pops,
                                                      min_num_genotypes=1,
                                                      **extra_kwargs)
        assert math.isclose(dists[0], reference_dist)

    # all missing
    varis = build([missing_row])
    pops = [[1, 2], [3, 4]]
    dists = _calc_pop_pairwise_unbiased_nei_dists(varis, populations=pops,
                                                  min_num_genotypes=1)
    assert math.isnan(dists[0])

    # min_num_genotypes
    varis = build(informative_rows)
    pops = [[1, 2], [3, 4]]
    dists = _calc_pop_pairwise_unbiased_nei_dists(varis, populations=pops,
                                                  min_num_genotypes=1)
    assert math.isclose(dists[0], reference_dist)

    # With the default min_num_genotypes the result is expected to be NaN.
    dists = _calc_pop_pairwise_unbiased_nei_dists(varis, populations=pops,
                                                  chunk_size=1)
    assert math.isnan(dists[0])
def test_dest_jost_distance(self):
    """Check calc_pop_distance with method='dest' on two populations.

    The same distance is expected with and without chunking; requiring six
    genotypes per population is expected to yield only NaN.
    """
    # Calls of samples 6-11, identical in both variations.
    pop2_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)]
    gts = [[(1, 1), (1, 3), (1, 2), (1, 4), (3, 3)] + pop2_calls,
           [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3)] + pop2_calls]

    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(gts)
    snps.samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    pops = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]

    for extra_kwargs in ({}, {'chunk_size': 1}):
        dists = calc_pop_distance(snps, populations=pops, method='dest',
                                  min_num_genotypes=0, **extra_kwargs)
        assert numpy.allclose(dists, [0.65490196])

    dists = calc_pop_distance(snps, populations=pops, method='dest',
                              min_num_genotypes=6, chunk_size=1)
    assert numpy.all(numpy.isnan(dists))
def test_excel(self):
    """Smoke-test write_excel with progressively richer variation fields.

    Only checks that write_excel runs without raising: first with genotypes
    alone, then with chrom/pos, then with ref/alt and finally with a
    per-sample classification. The temporary files are managed with
    context managers so they are closed even when a call raises.
    """
    variations = VariationsArrays()
    gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                       [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                       [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations[GT_FIELD] = gts
    variations.samples = list(range(gts.shape[1]))
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

    # chrom pos
    variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
    variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
    # The remaining writes reuse one handle, as the original test did.
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

        # REF, ALT
        variations[REF_FIELD] = numpy.array([b'A', b'A', b'A', b'A'])
        variations[ALT_FIELD] = numpy.array([[b'T'], [b'T'], [b'T'], [b'T']])
        write_excel(variations, fhand)

        # with classifications
        classes = [1, 1, 1, 2, 2]
        write_excel(variations, fhand, classes)
def test_report(self):
    """Check that create_pop_stats_report writes 'pop_stats.csv'.

    The output directory is a TemporaryDirectory used as a context manager,
    so it is removed even when the assertion fails.
    """
    gts = numpy.array([[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
                       [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4, 5]
    pops = {1: [1, 2], 2: [3, 4, 5]}

    violin_ylimits = {'observed_heterozigosity': {'bottom': 0, 'top': 0.5}}
    with tempfile.TemporaryDirectory() as out_dir:
        create_pop_stats_report(varis, pops, out_dir,
                                min_num_genotypes=1,
                                min_call_dp_for_obs_het=0,
                                violin_ylimits=violin_ylimits)
        stats_csv_fpath = os.path.join(out_dir, 'pop_stats.csv')
        assert os.path.exists(stats_csv_fpath)
        # NOTE(review): the previous version also built the path to
        # 'pop_stats_violin_plots.svg' but never asserted on it; confirm
        # whether that file should be checked as well.
def test_calc_obs_het(self):
    """Check calc_obs_het per population, without and with a depth threshold.

    Both runs share the same variations and populations and are verified
    through ``self._check_function``.
    """
    varis = VariationsArrays()
    varis[GT_FIELD] = numpy.array(
        [[[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
         [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
         [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis[DP_FIELD] = numpy.array([[20, 15, 20, 20, 20],
                                   [20, 20, 20, 20, 20],
                                   [20, 20, 20, 20, 20]])
    varis.samples = [1, 2, 3, 4, 5]
    pops = {1: [1, 2], 2: [3, 4, 5]}

    # Each entry: (min_call_dp, expected values per population)
    runs = [
        (0, {1: [0.5, 0, math.nan], 2: [0, 1., math.nan]}),
        # now setting a depth threshold
        (20, {1: [0, 0, math.nan], 2: [0, 1., math.nan]}),
    ]
    for min_depth, expected in runs:
        partial_stat_funct = partial(calc_obs_het, min_num_genotypes=1,
                                     min_call_dp=min_depth)
        self._check_function(partial_stat_funct, varis, pops, expected)
def test_calc_distrib_for_sample(self):
    """Check the depth distributions computed per sample.

    The first part compares calc_field_distrib_for_a_sample on the HDF5 file
    and on an in-memory copy, with and without chunking. The second part
    checks calc_field_distribs_per_sample on a tiny fixture, unmasked and
    masked by heterozygous / homozygous calls.
    """
    n_bins = 16

    def hist_row(*hit_bins):
        # Histogram row of n_bins counts with a single hit in each given bin.
        return [1 if bin_idx in hit_bins else 0 for bin_idx in range(n_bins)]

    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    snps = VariationsArrays()
    snps.put_chunks(hdf5.iterate_chunks())

    distrib, _ = calc_field_distrib_for_a_sample(hdf5, field='/calls/DP',
                                                 sample='1_17_1_gbs',
                                                 n_bins=15)
    assert distrib.shape == (15,)

    distrib2, _ = calc_field_distrib_for_a_sample(snps, field='/calls/DP',
                                                  n_bins=15,
                                                  sample='1_17_1_gbs',
                                                  chunk_size=None)
    assert numpy.all(distrib == distrib2)

    distrib3, _ = calc_field_distrib_for_a_sample(snps, field='/calls/DP',
                                                  n_bins=15,
                                                  sample='1_17_1_gbs',
                                                  chunk_size=50)
    assert numpy.all(distrib3 == distrib2)

    vars_ = VariationsArrays()
    vars_['/calls/DP'] = numpy.array([[10, 5, 15], [0, 15, 10]])
    vars_['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [1, 1]],
                                      [[0, 0], [0, 1], [1, 1]]])
    vars_.samples = list(range(3))
    all_depths = [10, 5, 15, 0, 15, 10]

    # no mask
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=n_bins)
    expec = [hist_row(0, 10), hist_row(5, 15), hist_row(10, 15)]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)

    # only heterozygous calls
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=n_bins,
                                                mask_field='/calls/GT',
                                                mask_func=call_is_het)
    expec = [hist_row(), hist_row(5, 15), hist_row()]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)

    # only homozygous calls
    distrib, _ = calc_field_distribs_per_sample(vars_, field='/calls/DP',
                                                n_bins=n_bins,
                                                mask_field='/calls/GT',
                                                mask_func=call_is_hom)
    expec = [hist_row(0, 10), hist_row(), hist_row(10, 15)]
    assert numpy.all(expec == distrib)
    assert numpy.all(calc_depth(vars_) == all_depths)
def test_gst_basic(self):
    """Check calc_gst_per_loci on allele depths of two one-sample populations."""
    allele_depths = numpy.array([[[10, 3, -1], [11, 2, -1]],
                                 [[10, 0, -1], [10, 0, -1]],
                                 [[10, 10, -1], [11, 11, -1]],
                                 [[-1, 2, 10], [-1, 10, 2]]])
    snps = VariationsArrays()
    snps.samples = [1, 2]
    snps[AD_FIELD] = allele_depths

    gst = calc_gst_per_loci(snps, [[1], [2]])

    numpy.testing.assert_almost_equal(
        gst, numpy.array([0.00952381, 0, 0, 0.44444444]))
def test_is_variable_func(self):
    """Check is_variable when restricted to samples 1 and 5."""
    calls = numpy.array([
        [[-1, -1], [1, 1], [0, 1], [1, 1], [-1, -1]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
        [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
        [[0, 0], [0, 0], [0, 1], [0, 0], [0, 0]],
    ])
    variations = VariationsArrays()
    variations[GT_FIELD] = calls
    variations.samples = [1, 2, 3, 4, 5]

    observed = is_variable(variations, samples=[1, 5])

    # One flag per variation, in the same order as the rows above.
    wanted = [MISSING_INT, TRUE_INT, TRUE_INT, FALSE_INT]
    assert numpy.all(observed == wanted)
def test_calc_exp_het(self):
    """Check calc_exp_het per population via ``self._check_function``."""
    varis = VariationsArrays()
    varis[GT_FIELD] = numpy.array([
        [[0, 0], [0, 1], [0, 0], [0, 0], [0, -1]],
        [[0, 0], [0, 0], [0, 1], [1, 0], [-1, -1]],
        [[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]],
    ])
    varis.samples = [1, 2, 3, 4, 5]

    populations = {1: [1, 2], 2: [3, 4, 5]}
    wanted = {1: [0.5, 0, math.nan], 2: [0, 0.6, math.nan]}
    self._check_function(partial(calc_exp_het, min_num_genotypes=1),
                         varis, populations, wanted)
def test_calc_major_allele_freq(self):
    """Check calc_major_allele_freq per population via ``self._check_function``."""
    varis = VariationsArrays()
    varis[GT_FIELD] = numpy.array([
        [[0], [0], [0], [0], [-1]],
        [[0], [0], [1], [1], [-1]],
        [[0], [2], [0], [1], [2]],
        [[-1], [-1], [-1], [-1], [-1]],
    ])
    varis.samples = [1, 2, 3, 4, 5]

    populations = {1: [1, 2], 2: [3, 4, 5]}
    wanted = {1: [1., 1., 0.5, math.nan],
              2: [1., 1., 1 / 3, math.nan]}
    self._check_function(partial(calc_major_allele_freq,
                                 min_num_genotypes=1),
                         varis, populations, wanted)
def test_nei_dist(self):
    """Check calc_pop_distance with method='nei', whole and by chunk.

    The comparisons use the absolute difference. The previous form,
    ``dists[0] - expected < 0.001``, was one-sided and accepted any result
    smaller than the expected value.
    """
    gts = [[[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 0]],
           [[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]],
           [[0, 0], [0, 0], [0, 0], [1, 1], [1, 1], [1, 1], [1, 1]]]
    snps = VariationsArrays()
    snps['/calls/GT'] = numpy.array(gts)
    snps.samples = [1, 2, 3, 4, 5, 6, 7]

    tolerance = 0.001
    # Each entry: (populations, expected distance between them)
    cases = [
        ([[1, 2, 3], [4, 5, 6, 7]], 3.14019792),
        ([[1, 2, 3], [1, 2, 3]], 0),
        ([[1, 2, 3], [1, 4, 5, 6, 7]], 1.23732507),
    ]

    # First with the default chunking, then explicitly by chunk.
    for extra_kwargs in ({}, {'chunk_size': 2}):
        for pops, expected in cases:
            dists = calc_pop_distance(snps, populations=pops, method='nei',
                                      min_num_genotypes=0, **extra_kwargs)
            assert abs(dists[0] - expected) < tolerance
def test_pca(self):
    """Run do_pca on the RIL HDF5 file and on a three-sample fixture.

    In the fixture samples 'a' and 'b' have identical calls and 'c' differs,
    so the first two projections must match and the third must not.
    """
    # Only checks that the PCA runs on the real data set.
    do_pca(VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r'))

    variant_calls = [[0, 0], [0, 0], [1, 1]]
    gts = numpy.array([variant_calls] * 4)
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = ['a', 'b', 'c']

    projs = do_pca(varis)['projections']

    assert projs.shape[0] == gts.shape[1]
    assert numpy.allclose(projs[0], projs[1])
    assert not numpy.allclose(projs[0], projs[2])
def test_pca(self):
    """Run do_pca on the RIL HDF5 file and on a three-sample fixture.

    In the fixture samples 'a' and 'b' have identical calls and 'c' differs,
    so the first two projections must match and the third must not.
    """
    # Only checks that the PCA runs on the real data set.
    do_pca(VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r'))

    variant_calls = [[0, 0], [0, 0], [1, 1]]
    gts = numpy.array([variant_calls] * 4)
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = ['a', 'b', 'c']

    projs = do_pca(varis)['projections']

    assert projs.shape[0] == gts.shape[1]
    assert numpy.allclose(projs[0], projs[1])
    assert not numpy.allclose(projs[0], projs[2])
def test_samples(self):
    """Check that the samples can be set on array- and HDF5-backed variations.

    The VCF input is opened with a context manager so the handle is closed
    even if parsing or loading raises.
    """
    gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                       [[0, 0], [0, 0], [1, 1], [2, 2]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4]
    assert varis.samples == [1, 2, 3, 4]

    # With another file
    # Only the unique name is needed: leaving the block closes the temporary
    # file, and the HDF5 file is then created at that path.
    with NamedTemporaryFile() as tmp_fhand:
        path = tmp_fhand.name

    h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
    with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
        h5.put_vars(vcf_parser)

    # Renaming a sample and assigning the list back must not raise.
    samples = h5.samples
    samples[0] = '0'
    h5.samples = samples
def test_samples(self):
    """Check that the samples can be set on array- and HDF5-backed variations.

    The VCF input is opened with a context manager so the handle is closed
    even if parsing or loading raises.
    """
    gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                       [[0, 0], [0, 0], [1, 1], [2, 2]],
                       [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    varis = VariationsArrays()
    varis[GT_FIELD] = gts
    varis.samples = [1, 2, 3, 4]
    assert varis.samples == [1, 2, 3, 4]

    # With another file
    # Only the unique name is needed: leaving the block closes the temporary
    # file, and the HDF5 file is then created at that path.
    with NamedTemporaryFile() as tmp_fhand:
        path = tmp_fhand.name

    h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
    with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
        vcf_parser = VCFParser(fhand=fhand)
        h5.put_vars(vcf_parser)

    # Renaming a sample and assigning the list back must not raise.
    samples = h5.samples
    samples[0] = '0'
    h5.samples = samples
def test_excel(self):
    """Smoke-test write_excel with progressively richer variation fields.

    Only checks that write_excel runs without raising: first with genotypes
    alone, then with chrom/pos and finally with ref/alt. The temporary files
    are managed with context managers so they are closed even when a call
    raises.
    """
    variations = VariationsArrays()
    gts = numpy.array([[[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
                       [[2, 2], [2, 0], [2, 1], [0, 0], [-1, -1]],
                       [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
                       [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    variations[GT_FIELD] = gts
    variations.samples = list(range(gts.shape[1]))
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

    # chrom pos
    variations[CHROM_FIELD] = numpy.array([1, 1, 2, 2])
    variations[POS_FIELD] = numpy.array([10, 20, 10, 20])
    # The remaining writes reuse one handle, as the original test did.
    with NamedTemporaryFile(suffix='.xlsx') as fhand:
        write_excel(variations, fhand)

        # REF, ALT
        variations[REF_FIELD] = numpy.array(['A', 'A', 'A', 'A'])
        variations[ALT_FIELD] = numpy.array([['T'], ['T'], ['T'], ['T']])
        write_excel(variations, fhand)
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Copy *variations* keeping only the selected sample columns.

    Args:
        variations: source object exposing ``samples``, ``metadata``,
            ``keys()`` and item access by path.
        sample_cols: either a boolean mask with one flag per sample or a
            collection of integer column indexes to keep.
        filtered_vars: destination object; a new ``VariationsArrays`` is
            created when omitted.
        reverse: when true the selection is inverted, i.e. the selected
            samples are dropped instead of kept.

    Returns:
        ``filtered_vars`` with every matrix copied over (matrices whose path
        contains ``'calls'`` are sliced along their second axis), plus the
        metadata and the list of kept samples.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples

    # Only the attribute access is guarded, so that an AttributeError coming
    # from numpy itself can never be mistaken for "not an array".
    try:
        dtype = sample_cols.dtype
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, (bool, numpy.bool_))
    else:
        is_bool = numpy.issubdtype(dtype, numpy.bool_)

    if not is_bool:
        # Turn the column indexes into a mask with one flag per sample.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if not hasattr(sample_cols, 'shape'):
        # The builtin bool is used because the numpy.bool alias does not
        # exist in every NumPy release.
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix

    filtered_vars.metadata = variations.metadata
    kept_samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    filtered_vars.samples = kept_samples
    return filtered_vars
def test_kosman_pairwise_between_pops_by_chunk(self):
    """Check calc_pairwise_distances_between_pops on four samples.

    The full 4x4 matrix, a single row of it and the block for samples
    [1, 2] against [3, 4] are all compared with the expected values. The
    last comparison was previously computed but never asserted.
    """
    a = numpy.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                     [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = numpy.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                     [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    c = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    d = numpy.full(shape=(11, 2), fill_value=1, dtype=numpy.int16)
    # Stack per-sample matrices, then move the sample axis to second place.
    gts = numpy.stack((a, b, c, d), axis=0)
    gts = numpy.transpose(gts, axes=(1, 0, 2)).astype(numpy.int16)

    variations = VariationsArrays()
    variations.samples = [1, 2, 3, 4]
    variations['/calls/GT'] = gts

    expected = [[0., 0.33333333, 0.75, 0.75],
                [0.33333333, 0., 0.45, 0.45],
                [0.75, 0.45, 0., 0.],
                [0.75, 0.45, 0., 0.]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1, 2, 3, 4], pop2_samples=[1, 2, 3, 4])
    assert numpy.allclose(distance, expected)

    expected = [[0., 0.33333333, 0.75, 0.75]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1], pop2_samples=[1, 2, 3, 4])
    assert numpy.allclose(distance, expected)

    # Rows of samples 1-2 and columns of samples 3-4 of the full matrix.
    expected = [[0.75, 0.75], [0.45, 0.45]]
    distance = calc_pairwise_distances_between_pops(
        variations, chunk_size=None, min_num_snps=1,
        pop1_samples=[1, 2], pop2_samples=[3, 4])
    assert numpy.allclose(distance, expected)
def test_calc_called_gts_distribution_per_depth(self):
    """Check calc_called_gts_distrib_per_depth on real and synthetic data.

    The HDF5 file must give the same result with and without chunking; the
    one-variation fixture is compared with a hand-written distribution.
    """
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    chunked, _ = calc_called_gts_distrib_per_depth(hdf5, depths=range(30),
                                                   chunk_size=10)
    assert chunked[1, 1] == 1
    whole, _ = calc_called_gts_distrib_per_depth(hdf5, depths=range(30),
                                                 chunk_size=None)
    assert numpy.all(chunked == whole)

    n_samples = 10
    vars_ = VariationsArrays()
    vars_['/calls/GT'] = numpy.array([[[0, 0], [0, 1], [0, 1], [0, 0],
                                       [0, 1], [0, 0], [0, 0], [0, 1],
                                       [1, 1], [0, 0]]])
    vars_['/calls/DP'] = numpy.array([[10, 5, 15, 7, 10, 0, 0, 25, 20, 10]])
    vars_.samples = list(range(n_samples))

    dist, _ = calc_called_gts_distrib_per_depth(vars_,
                                                depths=[0, 5, 10, 30])

    # One row per requested depth; each row has a single count of one,
    # located at columns 9, 7, 5 and 0 respectively.
    expected = [[int(col == hot_col) for col in range(n_samples)]
                for hot_col in (9, 7, 5, 0)]
    assert numpy.all(dist == expected)
def test_calc_maf_depth_distribs_per_sample(self):
    """Check calc_maf_depth_distribs_per_sample on empty, tiny and real data."""
    # Empty observation matrices: both returned values must be None.
    empty = VariationsArrays()
    empty['/calls/AO'] = numpy.array([])
    empty['/calls/RO'] = numpy.array([])
    distribs, bins = calc_maf_depth_distribs_per_sample(empty,
                                                        chunk_size=None)
    assert distribs is None
    assert bins is None

    # One variation, three samples.
    tiny = VariationsArrays()
    tiny['/calls/AO'] = numpy.array([[[0, 0], [0, 0], [15, -1]]])
    tiny['/calls/RO'] = numpy.array([[10, 5, 15]])
    tiny.samples = list(range(3))
    distribs, _ = calc_maf_depth_distribs_per_sample(tiny, n_bins=4,
                                                     min_depth=6,
                                                     chunk_size=None)
    assert numpy.all(distribs == [[0, 0, 0, 1], [0, 0, 0, 0], [0, 0, 1, 0]])

    # The real file must give the same result with and without chunking.
    hdf5 = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
    unchunked, _ = calc_maf_depth_distribs_per_sample(hdf5, min_depth=6,
                                                      chunk_size=None)
    default_chunks, _ = calc_maf_depth_distribs_per_sample(hdf5,
                                                           min_depth=6)
    assert numpy.all(unchunked == default_chunks)
def _filter_samples_by_index(variations, sample_cols, filtered_vars=None,
                             reverse=False):
    """Copy *variations* keeping only the selected sample columns.

    Args:
        variations: source object exposing ``samples``, ``metadata``,
            ``keys()`` and item access by path.
        sample_cols: either a boolean mask with one flag per sample or a
            collection of integer column indexes to keep.
        filtered_vars: destination object; a new ``VariationsArrays`` is
            created when omitted.
        reverse: when true the selection is inverted, i.e. the selected
            samples are dropped instead of kept.

    Returns:
        ``filtered_vars`` with every matrix copied over (matrices whose path
        contains ``'calls'`` are sliced along their second axis), plus the
        metadata and the list of kept samples.
    """
    if filtered_vars is None:
        filtered_vars = VariationsArrays()

    samples = variations.samples

    # Only the attribute access is guarded. Looking up numpy.bool inside the
    # guarded block could raise AttributeError on NumPy releases without that
    # alias, which made a boolean mask be handled as a list of indexes.
    try:
        dtype = sample_cols.dtype
    except AttributeError:
        item = first(iter(sample_cols))
        is_bool = isinstance(item, (bool, numpy.bool_))
    else:
        is_bool = numpy.issubdtype(dtype, numpy.bool_)

    if not is_bool:
        # Turn the column indexes into a mask with one flag per sample.
        sample_cols = [idx in sample_cols for idx in range(len(samples))]

    if not hasattr(sample_cols, 'shape'):
        # The builtin bool is used because the numpy.bool alias does not
        # exist in every NumPy release.
        sample_cols = numpy.array(sample_cols, dtype=bool)

    if reverse:
        sample_cols = numpy.logical_not(sample_cols)

    for path in variations.keys():
        matrix = variations[path]
        if is_dataset(matrix):
            matrix = matrix[:]
        if 'calls' in path:
            filtered_vars[path] = matrix[:, sample_cols]
        else:
            filtered_vars[path] = matrix

    filtered_vars.metadata = variations.metadata
    kept_samples = [
        samples[idx] for idx, keep in enumerate(sample_cols) if keep
    ]
    filtered_vars.samples = kept_samples
    return filtered_vars
def test_diploid_writing(self):
    """Check write_fasta when it writes both haplotypes of each sample.

    Invariant SNPs are removed, indels are kept and aligned, and every
    sample yields a ``<sample>_hap1`` and a ``<sample>_hap2`` record.
    """
    no_call = [-1, -1]
    gts = numpy.array([
        [[0, 0], [2, 2], [1, 1], [0, 0], [0, 0]],
        [[2, 2], [1, 1], [-1, 2], [0, 0], no_call],
        [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
        [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
        [[0, 1], [0, 1], no_call, [-1, 1], [1, 0]],
        [[0, 0]] * 5,
        [no_call] * 5,
    ])

    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations[ALT_FIELD] = numpy.array([[b'CT', b'CTT'], [b'GA', b'GAT'],
                                         [b'C', b''], [b'G', b''],
                                         [b'A', b''], [b'G', b''],
                                         [b'C', b'']])
    variations[REF_FIELD] = numpy.array(
        [b'C', b'G', b'A', b'T', b'T', b'C', b'G'])
    variations[CHROM_FIELD] = numpy.array(
        ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3'])
    variations[POS_FIELD] = numpy.array([10, 20, 30, 40, 50, 10, 15])
    variations.samples = [str(idx) for idx in range(gts.shape[1])]

    fhand = io.BytesIO()
    write_fasta(variations, fhand,
                remove_invariant_snps=True,
                remove_indels=False,
                try_to_align_easy_indels=True,
                write_one_seq_per_sample_setting_hets_to_missing=False)

    # Alleles of the first five variations (ref, then alts), aligned:
    #   C--  C-T  CTT
    #   G--  GA-  GAT
    #   A    C
    #   T    G
    #   T    A
    # Calls of those variations, one column per sample:
    #   00 22 11 00 00
    #   22 11 N2 00 NN
    #   01 00 00 11 00
    #   00 1N 11 1N 11
    #   01 01 NN N1 10
    # Expected records, in output order (header fragment, sequence):
    expected_records = [
        (b'>0_hap1', b'C--GATATT'),
        (b'>0_hap2', b'C--GATCTA'),
        (b'>1_hap1', b'CTTGA-AGT'),
        (b'>1_hap2', b'CTTGA-ANA'),
        (b'>2_hap1', b'C-TNNNAGN'),
        (b'>2_hap2', b'C-TGATAGN'),
        (b'>3_hap1', b'C--G--CGN'),
        (b'>3_hap2', b'C--G--CNA'),
        (b'>4_hap1', b'C--NNNAGA'),
        (b'>4_hap2', b'C--NNNAGT'),
    ]

    result = fhand.getvalue().splitlines()
    for record_idx, (header, sequence) in enumerate(expected_records):
        assert header in result[2 * record_idx]
        assert result[2 * record_idx + 1] == sequence
def test_fasta_writer(self):
    """Check write_fasta when it writes one sequence per sample.

    Heterozygous calls are set to missing. The output is checked twice with
    invariant SNPs removed and once with ``remove_sites_all_N=True``.
    """
    no_call = [-1, -1]
    gts = numpy.array([
        [[0, 0], [0, 1], [1, 1], [0, 0], [0, 0]],
        [[2, 2], [1, 1], [-1, 2], [0, 0], no_call],
        [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
        [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
        [[0, 1], [0, 1], no_call, [-1, 1], [1, 0]],
        [[0, 0]] * 5,
        [no_call] * 5,
    ])

    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations[ALT_FIELD] = numpy.array([['A', 'TT'], ['A', 'T'],
                                         ['C', ''], ['G', ''],
                                         ['A', 'T'], ['G', ''],
                                         ['G', '']])
    variations[REF_FIELD] = numpy.array(['C', 'G', 'A', 'T', 'T', 'C', 'C'])
    variations[CHROM_FIELD] = numpy.array(
        ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3'])
    variations[POS_FIELD] = numpy.array([10, 20, 30, 40, 50, 10, 15])
    variations.samples = [str(idx) for idx in range(gts.shape[1])]

    # Alleles per variation (ref, then alts):
    #   C A TT
    #   G A T
    #   A C
    #   T G
    #   T A T
    #   C G
    # Expected records, in output order (header fragment, sequence):
    expected_records = [(b'>0', b'TNT'), (b'>1', b'AAN'), (b'>2', b'NAG'),
                        (b'>3', b'GCN'), (b'>4', b'NAG')]

    fhand = io.BytesIO()
    write_fasta(variations, fhand, remove_invariant_snps=True,
                write_one_seq_per_sample_setting_hets_to_missing=True)
    result = fhand.getvalue().splitlines()
    for record_idx, (header, sequence) in enumerate(expected_records):
        assert header in result[2 * record_idx]
        assert result[2 * record_idx + 1] == sequence

    # Same call on a fresh buffer: the first record must be unchanged.
    fhand = io.BytesIO()
    write_fasta(variations, fhand, remove_invariant_snps=True,
                write_one_seq_per_sample_setting_hets_to_missing=True)
    result = fhand.getvalue().splitlines()
    assert b'>0' in result[0]
    assert result[1] == b'TNT'

    # Dropping the all-N sites instead of the invariant SNPs.
    fhand = io.BytesIO()
    write_fasta(variations, fhand, remove_sites_all_N=True,
                write_one_seq_per_sample_setting_hets_to_missing=True)
    text_lines = fhand.getvalue().decode().splitlines()
    assert '>0' in text_lines[0]
    assert text_lines[1] == 'TNTC'
def test_fasta_writer_with_indels(self):
    """Check write_fasta with indels, one sequence per sample.

    The output is checked first with the indels aligned
    (``try_to_align_easy_indels=True``) and then with
    ``put_hyphens_in_indels=False``.
    """
    no_call = [-1, -1]
    gts = numpy.array([
        [[0, 0], [2, 2], [1, 1], [0, 0], [0, 0]],
        [[2, 2], [1, 1], [-1, 2], [0, 0], no_call],
        [[0, 1], [0, 0], [0, 0], [1, 1], [0, 0]],
        [[0, 0], [1, -1], [1, 1], [1, -1], [1, 1]],
        [[0, 1], [0, 1], no_call, [-1, 1], [1, 0]],
        [[0, 0]] * 5,
        [no_call] * 5,
    ])

    variations = VariationsArrays()
    variations[GT_FIELD] = gts
    variations[ALT_FIELD] = numpy.array([[b'CT', b'CTT'], [b'GA', b'GAT'],
                                         [b'C', b''], [b'G', b''],
                                         [b'A', b'T'], [b'G', b''],
                                         [b'C', b'']])
    variations[REF_FIELD] = numpy.array(
        [b'C', b'G', b'A', b'T', b'T', b'C', b'G'])
    variations[CHROM_FIELD] = numpy.array(
        ['ch1', 'ch2', 'ch2', 'ch2', 'ch2', 'ch3', 'ch3'])
    variations[POS_FIELD] = numpy.array([10, 20, 30, 40, 50, 10, 15])
    variations.samples = [str(idx) for idx in range(gts.shape[1])]

    def check_records(buffer, expected_records):
        # Compare every (header fragment, sequence) pair with the output.
        lines = buffer.getvalue().splitlines()
        for record_idx, (header, sequence) in enumerate(expected_records):
            assert header in lines[2 * record_idx]
            assert lines[2 * record_idx + 1] == sequence

    # Alleles of the first four variations (ref, then alts), aligned:
    #   C--  C-T  CTT
    #   G--  GA-  GAT
    #   A    C
    #   T    G
    # Calls of those variations, one column per sample (H: het, N: missing):
    #   0 2 1 0 0
    #   2 1 H 0 N
    #   H 0 0 1 0
    #   0 H 1 H 1
    fhand = io.BytesIO()
    write_fasta(variations, fhand,
                remove_invariant_snps=True,
                remove_indels=False,
                try_to_align_easy_indels=True,
                write_one_seq_per_sample_setting_hets_to_missing=True)
    check_records(fhand, [(b'>0', b'C--GATNT'),
                          (b'>1', b'CTTGA-AN'),
                          (b'>2', b'C-TNNNAG'),
                          (b'>3', b'C--G--CN'),
                          (b'>4', b'C--NNNAG')])

    # Same data without hyphens in the indels.
    fhand = io.BytesIO()
    write_fasta(variations, fhand,
                remove_invariant_snps=True,
                remove_indels=False,
                put_hyphens_in_indels=False,
                write_one_seq_per_sample_setting_hets_to_missing=True)
    check_records(fhand, [(b'>0', b'CGATNT'),
                          (b'>1', b'CTTGAAN'),
                          (b'>2', b'CTNNNAG'),
                          (b'>3', b'CGCN'),
                          (b'>4', b'CNNNAG')])