def test_empty_pop(self):
    """Check the population distance when a population has no genotypes.

    Samples 1-5 form the first population and samples 6-11 the second.
    In the first scenario the first population is missing in only one of
    the three variations and a known distance is expected.  In the second
    scenario it is missing in every variation and the result must be NaN.
    """
    no_call = (-1, -1)
    # Row in which the first five samples (first population) are missing.
    second_pop_calls = [(3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)]
    empty_first_pop_gts = [no_call] * 5 + second_pop_calls
    empty_first_pop_dps = [0] * 5 + [20] * 5 + [0]
    full_depth = [20] * 10 + [0]

    def _pop_distance(genotypes, depths):
        # Build a dask-backed Variations and compute the distance for it.
        sample_ids = list(range(1, 12))
        populations = [sample_ids[:5], sample_ids[5:]]
        dask_vars = Variations()
        dask_vars.samples = da.from_array(np.array(sample_ids))
        dask_vars[GT_FIELD] = da.from_array(np.array(genotypes))
        dask_vars[DP_FIELD] = da.from_array(np.array(depths))
        return calc_dset_pop_distance(dask_vars, max_alleles=5,
                                      silence_runtime_warnings=True,
                                      populations=populations,
                                      min_num_genotypes=0)

    # First population missing only in the last variation.
    partially_called_gts = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3),
         (3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3),
         (3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
        empty_first_pop_gts,
    ]
    partially_called_dps = [full_depth, full_depth, empty_first_pop_dps]
    dists = _pop_distance(partially_called_gts, partially_called_dps)
    assert np.allclose(dists, [0.65490196])

    # First population missing in every variation.
    dists = _pop_distance([empty_first_pop_gts] * 3,
                          [empty_first_pop_dps] * 3)
    assert np.isnan(dists[0])
def test_calc_missing_memory(self):
    """Check calc_missing_gt with rates=False on in-memory genotypes."""
    def _missing_per_variation(genotypes):
        # Samples are named after their column index in the GT matrix.
        sample_names = [str(idx) for idx in range(genotypes.shape[1])]
        in_mem_vars = Variations()
        in_mem_vars.samples = np.array(sample_names)
        in_mem_vars[GT_FIELD] = genotypes
        return calc_missing_gt(in_mem_vars, rates=False)

    two_sample_gts = np.array([[[0, 0], [0, 0]],
                               [[0, 0], [-1, -1]],
                               [[0, 0], [-1, -1]],
                               [[-1, -1], [-1, -1]]])
    result = _missing_per_variation(two_sample_gts)
    # The result is compared against 2 (the number of samples) minus
    # these per-variation values.
    expected = np.array([2, 1, 1, 0])
    assert np.all(result == 2 - expected)

    five_sample_gts = np.array(
        [[[0, 0], [0, 0], [0, 0], [0, 0], [0, -1]],
         [[0, 0], [0, 0], [0, 0], [0, 0], [-1, -1]],
         [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1]],
         [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    result = _missing_per_variation(five_sample_gts)
    # The half-missing genotype [0, -1] in the first row is expected to
    # contribute 0.5.
    expected = np.array([0.5, 1, 2, 4])
    assert np.all(result == expected)
def test_kosman_missing_in_memory(self):
    """Check that _kosman gives equal results for two in-memory pairs.

    The pair (c, d) equals (a, b) except that both individuals are missing
    in the first two variations, whereas in (a, b) only one individual is
    missing in each of them.  The per-SNP results must be equal.
    """
    def _kosman_for_pair(indi1, indi2):
        genotypes = np.stack((indi1, indi2), axis=1)
        sample_names = np.array(
            [str(idx) for idx in range(genotypes.shape[1])])
        in_mem_vars = Variations()
        in_mem_vars.samples = sample_names
        in_mem_vars[GT_FIELD] = genotypes
        first = keep_samples(in_mem_vars, ['0'])[FLT_VARS]
        second = keep_samples(in_mem_vars, ['1'])[FLT_VARS]
        return _kosman(first, second)

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_ab = _kosman_for_pair(a, b)

    c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_cd = _kosman_for_pair(c, d)

    assert np.all(distance_ab == distance_cd)
def test_nei_dist_in_memory(self):
    """Check the pairwise unbiased Nei distance on in-memory genotypes.

    Four samples are split in two populations ([1, 2] and [3, 4]).  The
    distance is checked for a dataset with two called variations plus an
    all-missing one, and for a dataset in which everything is missing
    (NaN expected).
    """
    def _called_plus_missing_gts():
        # Fresh array on every call so the sections do not share data.
        return np.array([[[1, 1], [5, 2], [2, 2], [3, 2]],
                         [[1, 1], [1, 2], [2, 2], [2, 1]],
                         [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])

    def _nei_dists(genotypes, max_alleles):
        in_mem_vars = Variations()
        in_mem_vars.samples = np.array([1, 2, 3, 4])
        in_mem_vars[GT_FIELD] = genotypes
        return calc_pop_pairwise_unbiased_nei_dists(
            in_mem_vars, max_alleles=max_alleles,
            populations=[[1, 2], [3, 4]],
            silence_runtime_warnings=True, min_num_genotypes=1)

    dists = _nei_dists(_called_plus_missing_gts(), max_alleles=6)
    assert math.isclose(dists[0], 0.3726315908494797)

    # all missing
    all_missing_gts = np.array([[[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
    dists = _nei_dists(all_missing_gts, max_alleles=1)
    assert math.isnan(dists[0])

    # min_num_genotypes
    # NOTE(review): this section uses the same genotypes and the same
    # min_num_genotypes=1 as the first one -- confirm whether a different
    # threshold was meant to be exercised here.
    dists = _nei_dists(_called_plus_missing_gts(), max_alleles=6)
    assert math.isclose(dists[0], 0.3726315908494797)
def test_kosman_2_indis(self):
    """Check the mean Kosman distance between two dask-backed individuals.

    Three pairs are compared: two different individuals with some missing
    genotypes, two identical individuals, and one individual with missing
    data against an all-ones individual.

    The expected values are computed floats, so they are compared with
    np.isclose instead of exact equality; an exact ``==`` would break with
    any harmless change in the order of the floating point operations.
    """
    def _mean_kosman(indi1, indi2):
        # Per-SNP comparison of the two individuals, averaged over the
        # number of entries that _kosman returns.
        gts = np.stack((indi1, indi2), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison = compute(_kosman(vars1, vars2),
                                        silence_runtime_warnings=True)
        return snp_by_snp_comparison.sum() / snp_by_snp_comparison.shape[0]

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    assert np.isclose(_mean_kosman(a, b), 1 / 3)

    # Two identical individuals: no distance at all.
    c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    assert np.isclose(_mean_kosman(c, d), 0)

    assert np.isclose(_mean_kosman(b, d), 0.45)
def test_dest_jost_distance_in_memory(self):
    """Check calc_dset_pop_distance on in-memory (numpy) variations.

    With min_num_genotypes=0 a known distance is expected; raising the
    threshold to 6 must turn every distance into NaN.
    """
    genotypes = [
        [(1, 1), (1, 3), (1, 2), (1, 4), (3, 3),
         (3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
        [(1, 3), (1, 1), (1, 1), (1, 3), (3, 3),
         (3, 2), (3, 4), (2, 2), (2, 4), (4, 4), (-1, -1)],
    ]
    sample_ids = list(range(1, 12))
    populations = [sample_ids[:5], sample_ids[5:]]
    depths = [[20] * 11, [20] * 11]

    in_mem_vars = Variations()
    in_mem_vars.samples = np.array(sample_ids)
    in_mem_vars[GT_FIELD] = np.array(genotypes)
    in_mem_vars[DP_FIELD] = np.array(depths)

    def _distance(min_num_genotypes):
        return calc_dset_pop_distance(in_mem_vars, max_alleles=5,
                                      silence_runtime_warnings=True,
                                      populations=populations,
                                      min_num_genotypes=min_num_genotypes)

    assert np.allclose(_distance(0), [0.65490196])
    assert np.all(np.isnan(_distance(6)))
def test_kosman_2_indis_in_memory(self):
    """Check the mean Kosman distance between two in-memory individuals.

    Three pairs are compared: two different individuals with some missing
    genotypes, two identical individuals, and one individual with missing
    data against an all-ones individual.

    The expected values are computed floats, so they are compared with
    np.isclose instead of exact equality; an exact ``==`` would break with
    any harmless change in the order of the floating point operations.
    """
    def _kosman_for_pair(indi1, indi2):
        # Per-SNP comparison between the two individuals.
        gts = np.stack((indi1, indi2), axis=1)
        variations = Variations()
        variations.samples = np.array(
            [str(i) for i in range(gts.shape[1])])
        variations[GT_FIELD] = gts
        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        return _kosman(vars1, vars2)

    def _mean(per_snp):
        return per_snp.sum() / per_snp.shape[0]

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_ab = _kosman_for_pair(a, b)
    # The result is checked with the project helper for in-memory arrays.
    va.make_sure_array_is_in_memory(distance_ab)
    assert np.isclose(_mean(distance_ab), 1 / 3)

    # Two identical individuals: no distance at all.
    c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    assert np.isclose(_mean(_kosman_for_pair(c, d)), 0)

    assert np.isclose(_mean(_kosman_for_pair(b, d)), 0.45)
def test_unavailable_shape(self):
    """Check num_variations for in-memory and for empty dask variations.

    In-memory variations report their number of rows, whereas asking the
    variations built by _create_empty_dask_variations for it must raise
    NotMaterializedError.
    """
    in_mem_vars = Variations()
    in_mem_vars.samples = ['1', '2', '3']
    in_mem_vars[GT_FIELD] = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    assert in_mem_vars.num_variations == 3

    lazy_vars = _create_empty_dask_variations()
    # assertRaises replaces the manual try/fail/except dance and keeps the
    # same failure message.
    with self.assertRaises(NotMaterializedError,
                           msg='NotMaterializedError expected'):
        lazy_vars.num_variations
def xtest_do_pca(self):
    """Run do_pca on the test zarr data and on a small dask dataset.

    NOTE(review): the name does not start with ``test``, so the default
    unittest discovery presumably skips it -- confirm that it is disabled
    on purpose.
    """
    # Only checks that the zarr test data can be processed.
    do_pca(load_zarr(TEST_DATA_DIR / 'test.zarr'))

    # Four identical variations: samples a and b share the genotype and c
    # carries a different one.
    genotypes = np.array([[[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]]])
    toy_vars = Variations()
    toy_vars.samples = da.from_array(np.array(['a', 'b', 'c']))
    toy_vars[GT_FIELD] = da.from_array(genotypes)

    projections = do_pca(toy_vars)['projections']
    # One projection per sample; equal samples must project together.
    assert projections.shape[0] == genotypes.shape[1]
    assert np.allclose(projections[0], projections[1])
    assert not np.allclose(projections[0], projections[2])
def test_kosman_missing(self):
    """Check that _kosman gives equal results for two dask-backed pairs.

    The pair (c, d) equals (a, b) except that both individuals are missing
    in the first two variations, whereas in (a, b) only one individual is
    missing in each of them.  The computed per-SNP results must be equal.
    """
    def _computed_kosman(indi1, indi2):
        genotypes = np.stack((indi1, indi2), axis=1)
        sample_names = np.array(
            [str(idx) for idx in range(genotypes.shape[1])])
        dask_vars = Variations()
        dask_vars.samples = da.from_array(sample_names)
        dask_vars[GT_FIELD] = da.from_array(genotypes)
        first = keep_samples(dask_vars, ['0'])[FLT_VARS]
        second = keep_samples(dask_vars, ['1'])[FLT_VARS]
        lazy_comparison = _kosman(first, second)
        return compute(lazy_comparison, silence_runtime_warnings=True)

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_ab = _computed_kosman(a, b)

    c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_cd = _computed_kosman(c, d)

    assert np.all(distance_ab == distance_cd)
def test_iterate_chunks(self):
    """Check iterate_chunks on in-memory and on zarr-loaded variations."""
    # in memory
    in_mem_vars = Variations()
    in_mem_vars.samples = ['1', '2', '3']
    in_mem_vars[GT_FIELD] = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    one_row_chunks = in_mem_vars.iterate_chunks(chunk_size=1)
    for row_idx, chunk in enumerate(one_row_chunks):
        # With chunk_size=1 every chunk must match one row of the matrix
        # and carry the complete sample list.
        expected_row = in_mem_vars[GT_FIELD][row_idx, :]
        assert np.all(chunk[GT_FIELD] == expected_row)
        assert np.all(chunk.samples == in_mem_vars.samples)

    # in disk
    on_disk_vars = load_zarr((TEST_DATA_DIR / 'test.zarr'),
                             num_vars_per_chunk=1)
    num_chunks = len(list(on_disk_vars.iterate_chunks()))
    self.assertEqual(num_chunks, 7)
def test_basic_operations(self):
    """Walk through the basic operations of the Variations container.

    Covers the empty container, the ordering requirement between samples
    and call data, the protection against resetting the samples, and the
    shape validation of the stored matrices.
    """
    variations = Variations()
    self.assertEqual(variations.num_variations, 0)
    self.assertEqual(variations.num_samples, 0)

    # trying to add call data without samples fails
    with self.assertRaises(ValueError):
        variations[GT_FIELD] = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])

    # set samples
    variations.samples = ['1', '2', '3']
    self.assertEqual(variations.num_samples, 3)

    # adding again samples fails
    with self.assertRaises(RuntimeError):
        variations.samples = ['1', '2', '3']

    # add variationData
    variations[CHROM_FIELD] = np.array(['chr1', 'chr2', 'chr3'])

    # add data with wrong shape
    single_row = np.array([[1, 2, 3]])
    with self.assertRaises(ValueError) as wrong_rows:
        variations[GT_FIELD] = single_row
    self.assertIn('Introduced matrix shape', str(wrong_rows.exception))

    two_columns = np.array([[1, 2], [1, 2], [1, 2]])
    with self.assertRaises(ValueError) as wrong_samples:
        variations[GT_FIELD] = two_columns
    self.assertIn('not fit with num samples',
                  str(wrong_samples.exception))

    # set gt array
    valid_gts = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    variations[GT_FIELD] = valid_gts
    self.assertTrue(np.array_equal(valid_gts, variations[GT_FIELD]))
    self.assertEqual(variations.num_variations, 3)
def test_do_pca_in_memory(self):
    """Run do_pca on the zarr test data loaded into memory and on a toy set.

    NOTE(review): the toy dataset below is built with dask arrays although
    the test name says "in memory" -- confirm that this is intended.
    """
    lazy_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')
    computed = compute({'vars': lazy_vars}, store_variation_to_memory=True)
    # Only checks that the materialized test data can be processed.
    do_pca(computed['vars'])

    # Four identical variations: samples a and b share the genotype and c
    # carries a different one.
    genotypes = np.array([[[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]],
                          [[0, 0], [0, 0], [1, 1]]])
    toy_vars = Variations()
    toy_vars.samples = da.from_array(np.array(['a', 'b', 'c']))
    toy_vars[GT_FIELD] = da.from_array(genotypes)

    projections = do_pca(toy_vars)['projections']
    # One projection per sample; equal samples must project together.
    assert projections.shape[0] == genotypes.shape[1]
    assert np.allclose(projections[0], projections[1])
    assert not np.allclose(projections[0], projections[2])
def test_kosman_pairwise_in_memory(self):
    """Check the pairwise Kosman distances among four in-memory samples."""
    indi_a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                       [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    indi_b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                       [1, 0], [1, 0], [1, 0], [0, 1], [1, 2]])
    indi_c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    indi_d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)

    # Stack one individual per row and then swap the first two axes so
    # that the first axis indexes the variations.
    by_individual = np.stack((indi_a, indi_b, indi_c, indi_d), axis=0)
    genotypes = by_individual.transpose((1, 0, 2)).astype(np.int16)

    in_mem_vars = Variations()
    in_mem_vars.samples = np.array(
        [str(idx) for idx in range(genotypes.shape[1])])
    in_mem_vars[GT_FIELD] = genotypes

    distances, _ = calc_kosman_dist(in_mem_vars)
    # One value per pair of samples (six pairs for four samples).
    expected = [0.33333333, 0.75, 0.75, 0.5, 0.5, 0.]
    assert np.allclose(distances, expected)
def test_filter_obs_het_in_mem(self):
    """Check filter_by_obs_heterocigosis on variations held in memory.

    The variations are created with dask arrays and materialized with
    compute(..., store_variation_to_memory=True) before being filtered
    with different heterozygosity limits and min_num_genotypes values.
    """
    gts = np.array([[[0, 0], [1, 1], [0, 1], [1, 1], [0, 0]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [1, 1]],
                    [[0, 0], [0, 0], [0, 0], [0, 0], [0, 1]],
                    [[0, 0], [0, 0], [0, 1], [0, 0], [1, 1]]])
    lazy_vars = Variations()
    lazy_vars.samples = da.from_array([1, 2, 3, 4, 5])
    lazy_vars[GT_FIELD] = da.from_array(gts)
    computed = compute({'vars': lazy_vars}, store_variation_to_memory=True)
    variations = computed['vars']

    # Without limits nothing is removed.
    result = filter_by_obs_heterocigosis(variations, min_num_genotypes=0)
    stats = result[FLT_STATS]
    assert np.all(result[FLT_VARS][GT_FIELD] == gts)
    assert stats[N_KEPT] == 4
    assert stats[N_FILTERED_OUT] == 0

    # A lower limit drops the second variation.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         min_num_genotypes=0)
    stats = result[FLT_STATS]
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
    assert stats[N_KEPT] == 3
    assert stats[N_FILTERED_OUT] == 1

    # Asking for more genotypes than there are samples removes everything.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         min_num_genotypes=10)
    stats = result[FLT_STATS]
    assert stats[N_KEPT] == 0
    assert stats[N_FILTERED_OUT] == 4

    # An upper limit keeps only the second variation.
    result = filter_by_obs_heterocigosis(variations,
                                         max_allowable_het=0.1,
                                         min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[1]])

    # Both limits at once.
    result = filter_by_obs_heterocigosis(variations,
                                         min_allowable_het=0.2,
                                         max_allowable_het=0.3,
                                         min_num_genotypes=0)
    assert np.all(result[FLT_VARS][GT_FIELD] == gts[[0, 2, 3]])
def compute(data, store_variation_to_memory=False,
            silence_runtime_warnings=False):
    """Compute the dask objects held by *data* and return the results.

    If *data* is itself a Delayed or a dask array, the value returned by
    its ``compute()`` method is returned.

    Otherwise *data* is handed to _collect_cargo_to_compute, everything it
    collects is computed with a single ``dask.compute`` call and each
    result is stored back, under its original key, in the container it
    came from.  Results that came from a Variations are, when
    store_variation_to_memory is True, gathered into a new Variations
    (created with the collected metadata) instead.  Finally, if the
    collected info names a key for the variations, ``data[key]`` is
    replaced by that new Variations when store_variation_to_memory is
    True and deleted from *data* otherwise.

    Args:
        data: a Delayed, a dask array or a container accepted by
            _collect_cargo_to_compute; containers are modified in place.
        store_variation_to_memory: whether the computed variation fields
            are gathered into a new Variations.
        silence_runtime_warnings: whether RuntimeWarning is ignored while
            the computation runs.

    Returns:
        The computed value for a Delayed or dask array, otherwise *data*.
    """
    def _run(task):
        # The warning filter is limited to this single computation.
        with warnings.catch_warnings():
            if silence_runtime_warnings:
                warnings.filterwarnings("ignore", category=RuntimeWarning)
            return task()

    if isinstance(data, (Delayed, da.Array)):
        return _run(data.compute)

    cargo = _collect_cargo_to_compute(
        data, store_variation_to_memory=store_variation_to_memory)
    pending, result_keys, result_holders, variation_info = cargo

    results = _run(lambda: dask.compute(*pending))

    materialized = None
    for position, value in enumerate(results):
        key = result_keys[position]
        holder = result_holders[position]
        rebuild = (store_variation_to_memory
                   and isinstance(holder, Variations))
        if not rebuild:
            # Plain case: the result replaces the lazy object in place.
            holder[key] = value
            continue

        if materialized is None:
            materialized = Variations(metadata=variation_info['metadata'])
        # The samples are set through their attribute, any other field
        # through item assignment.
        if key == 'samples':
            materialized.samples = value
        else:
            materialized[key] = value

    variations_key = variation_info['key']
    if variations_key:
        if store_variation_to_memory:
            data[variations_key] = materialized
        else:
            del data[variations_key]
    return data