def test_kosman_missing_in_memory(self):
    """Kosman distance must ignore SNPs with missing calls.

    Two pairs that differ only in where the missing (-1) genotypes sit
    must yield the same per-SNP comparison array.
    """
    def _per_snp_kosman(gts_one, gts_two):
        # Build an in-memory Variations holding the two individuals and
        # return their per-SNP Kosman comparison.
        stacked = np.stack((gts_one, gts_two), axis=1)
        snps = Variations()
        snps.samples = np.array([str(idx) for idx in range(stacked.shape[1])])
        snps[GT_FIELD] = stacked
        indi1 = keep_samples(snps, ['0'])[FLT_VARS]
        indi2 = keep_samples(snps, ['1'])[FLT_VARS]
        return _kosman(indi1, indi2)

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_ab = _per_snp_kosman(a, b)

    c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_cd = _per_snp_kosman(c, d)

    assert np.all(distance_ab == distance_cd)
def test_calc_missing(self):
    """Missing-genotype stats, per variation and per sample.

    Checks calc_missing_gt counts and rates on a two-sample subset, and
    that calc_missing_gt_per_sample refuses non-materialized variations.
    """
    snps = create_non_materialized_snp_filtered_variations()
    snps = keep_samples(snps, samples=['pepo', 'upv196'])[FLT_VARS]
    counts_task = calc_missing_gt(snps, rates=False)
    computed = compute({'num_missing_gts': counts_task})
    self.assertTrue(np.array_equal(computed['num_missing_gts'],
                                   [1, 1, 1, 0, 2, 2, 1]))

    snps = create_non_materialized_snp_filtered_variations()
    snps = keep_samples(snps, samples=['pepo', 'upv196'])[FLT_VARS]
    rates_task = calc_missing_gt(snps, rates=True)
    computed = compute({'num_missing_gts': rates_task})
    expected_rates = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
    for observed, expected in zip(computed['num_missing_gts'],
                                  expected_rates):
        self.assertAlmostEqual(observed, expected, places=2)

    # Per-sample rates work on fully dask-backed variations.
    snps = create_dask_variations()
    per_sample_task = calc_missing_gt_per_sample(snps, rates=True)
    compute({'num_missing_gts': per_sample_task})

    # ...but must raise on non-materialized variations.
    snps = create_non_materialized_snp_filtered_variations()
    with self.assertRaises(NotMaterializedError):
        calc_missing_gt_per_sample(snps, rates=True)

    snps = create_dask_variations()
    per_sample_task = calc_missing_gt_per_sample(snps, rates=False)
    compute({'num_missing_gts': per_sample_task})
def test_calc_missing(self):
    """calc_missing_gt counts and rates on a dask-backed two-sample subset."""
    snps = _create_dask_variations()
    snps = keep_samples(snps, samples=['pepo', 'upv196'])[FLT_VARS]
    counts_task = calc_missing_gt(snps, rates=False)
    computed = compute({'num_missing_gts': counts_task})
    self.assertTrue(np.array_equal(computed['num_missing_gts'],
                                   [1, 1, 1, 0, 2, 2, 1]))

    snps = _create_dask_variations()
    snps = keep_samples(snps, samples=['pepo', 'upv196'])[FLT_VARS]
    rates_task = calc_missing_gt(snps, rates=True)
    computed = compute({'num_missing_gts': rates_task})
    expected_rates = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
    for observed, expected in zip(computed['num_missing_gts'],
                                  expected_rates):
        self.assertAlmostEqual(observed, expected, places=2)
def test_keep_samples_in_memory(self):
    """keep_samples on in-memory variations loaded from a zarr store."""
    snps = load_zarr(TEST_DATA_DIR / 'test.zarr')
    # print(variations.samples.compute())
    # print(variations[DP_FIELD].compute())
    snps = compute({'vars': snps},
                   store_variation_to_memory=True)['vars']
    kept = keep_samples(snps, samples=['upv196', 'pepo'])[FLT_VARS]
    depths = kept[DP_FIELD]
    # NOTE(review): samples come back as ['pepo', 'upv196'] here even
    # though ['upv196', 'pepo'] was requested — order apparently follows
    # the store, not the request; confirm against keep_samples.
    self.assertTrue(np.all(kept.samples == ['pepo', 'upv196']))
    expected_depths = [[-1, 9], [-1, 8], [-1, 8], [14, 6],
                       [-1, -1], [-1, -1], [-1, 6]]
    self.assertTrue(np.all(depths == expected_depths))
def test_keep_samples_in_memory(self):
    """keep_samples on in-memory variations built from dask variations."""
    snps = create_dask_variations()
    # print(variations.samples.compute())
    # print(variations[DP_FIELD].compute())
    snps = compute({'vars': snps},
                   store_variation_to_memory=True)['vars']
    kept = keep_samples(snps, samples=['upv196', 'pepo'])[FLT_VARS]
    depths = kept[DP_FIELD]
    # The requested sample order is preserved here.
    self.assertTrue(np.all(kept.samples == ['upv196', 'pepo']))
    expected_depths = [[9, -1], [8, -1], [8, -1], [6, 14],
                       [-1, -1], [-1, -1], [6, -1]]
    self.assertTrue(np.all(depths == expected_depths))
def test_kosman_missing(self):
    """Kosman distance must ignore SNPs with missing calls (dask-backed).

    Two pairs that differ only in where the missing (-1) genotypes sit
    must yield the same computed per-SNP comparison array.
    """
    def _per_snp_kosman(gts_one, gts_two):
        # Build a dask-backed Variations for the two individuals and
        # return the computed per-SNP Kosman comparison.
        stacked = np.stack((gts_one, gts_two), axis=1)
        snps = Variations()
        names = np.array([str(idx) for idx in range(stacked.shape[1])])
        snps.samples = da.from_array(names)
        snps[GT_FIELD] = da.from_array(stacked)
        indi1 = keep_samples(snps, ['0'])[FLT_VARS]
        indi2 = keep_samples(snps, ['1'])[FLT_VARS]
        comparison_task = _kosman(indi1, indi2)
        return compute(comparison_task, silence_runtime_warnings=True)

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_ab = _per_snp_kosman(a, b)

    c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    distance_cd = _per_snp_kosman(c, d)

    assert np.all(distance_ab == distance_cd)
def test_kosman_2_indis(self):
    """Kosman distance between two individuals (dask-backed).

    Checks three pairs: partially different (1/3), identical (0) and a
    mixed pair (0.45).
    """
    def _kosman_distance(gts_one, gts_two):
        # Build a dask-backed Variations for the two individuals and
        # return their mean per-SNP Kosman distance.
        stacked = np.stack((gts_one, gts_two), axis=1)
        snps = Variations()
        names = np.array([str(idx) for idx in range(stacked.shape[1])])
        snps.samples = da.from_array(names)
        snps[GT_FIELD] = da.from_array(stacked)
        indi1 = keep_samples(snps, ['0'])[FLT_VARS]
        indi2 = keep_samples(snps, ['1'])[FLT_VARS]
        per_snp = compute(_kosman(indi1, indi2),
                          silence_runtime_warnings=True)
        return per_snp.sum() / per_snp.shape[0]

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    assert _kosman_distance(a, b) == 1 / 3

    c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    assert _kosman_distance(c, d) == 0

    assert _kosman_distance(b, d) == 0.45
def calc_kosman_dist(variations, min_num_snps=None,
                     silence_runtime_warning=False):
    """Calculate the pairwise Kosman distance between all samples.

    Builds one filtered Variations per sample, submits a _kosman task for
    every sample pair, computes them all at once, and averages each
    per-SNP comparison into a single distance.

    Pairs with fewer SNPs than min_num_snps (when given) get distance 0.0.

    Returns a tuple (distances, samples): distances is a condensed list in
    combinations(samples, 2) order.
    """
    samples = va.make_sure_array_is_in_memory(
        variations.samples,
        silence_runtime_warnings=silence_runtime_warning)

    vars_by_sample = OrderedDict(
        (sample, keep_samples(variations, [sample])[FLT_VARS])
        for sample in samples)

    # One lazy per-SNP comparison task per sample pair.
    pairwise_tasks = OrderedDict()
    for sample1, sample2 in combinations(samples, 2):
        pairwise_tasks[(sample1, sample2)] = _kosman(
            vars_by_sample[sample1], vars_by_sample[sample2])

    computed_pairs = compute(
        pairwise_tasks, silence_runtime_warnings=silence_runtime_warning)

    distances = []
    for pair in combinations(samples, 2):
        per_snp_result = computed_pairs[pair]
        num_snps = per_snp_result.shape[0]
        if min_num_snps is not None and num_snps < min_num_snps:
            distances.append(0.0)
        else:
            # The mean may be nan when no SNPs were comparable.
            with np.errstate(invalid='ignore'):
                distances.append(np.sum(per_snp_result) / num_snps)
    return distances, samples
def test_kosman_2_indis_in_memory(self):
    """Kosman distance between two individuals with in-memory arrays.

    Checks three pairs: partially different (1/3), identical (0) and a
    mixed pair (0.45).
    """
    def _per_snp_kosman(gts_one, gts_two):
        # Build an in-memory Variations for the two individuals and
        # return their per-SNP Kosman comparison.
        stacked = np.stack((gts_one, gts_two), axis=1)
        snps = Variations()
        snps.samples = np.array([str(idx) for idx in range(stacked.shape[1])])
        snps[GT_FIELD] = stacked
        indi1 = keep_samples(snps, ['0'])[FLT_VARS]
        indi2 = keep_samples(snps, ['1'])[FLT_VARS]
        return _kosman(indi1, indi2)

    a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1],
                  [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
    b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                  [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
    per_snp = _per_snp_kosman(a, b)
    va.make_sure_array_is_in_memory(per_snp)
    assert per_snp.sum() / per_snp.shape[0] == 1 / 3

    c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
    per_snp = _per_snp_kosman(c, d)
    assert per_snp.sum() / per_snp.shape[0] == 0

    per_snp = _per_snp_kosman(b, d)
    assert per_snp.sum() / per_snp.shape[0] == 0.45
def calc_pop_pairwise_unbiased_nei_dists(
        variations, max_alleles, populations, silence_runtime_warnings=False,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Calculate the unbiased Nei distance between every pair of populations.

    Returns a condensed distance vector (length n*(n-1)/2) in
    combinations order; pairs without accumulated data get nan.
    """
    pop_ids = list(range(len(populations)))
    variations_per_pop = [keep_samples(variations, pop_samples)[FLT_VARS]
                          for pop_samples in populations]

    # Per-pair accumulators; None marks a pair with nothing accumulated.
    Jxy = {}
    uJx = {}
    uJy = {}
    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        Jxy.setdefault(pop_id1, {})[pop_id2] = None
        uJx.setdefault(pop_id1, {})[pop_id2] = None
        uJy.setdefault(pop_id1, {})[pop_id2] = None

    # _accumulate_j_stats fills the accumulator dicts in place.
    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        _accumulate_j_stats(variations_per_pop[pop_id1],
                            variations_per_pop[pop_id2],
                            max_alleles, Jxy, uJx, uJy,
                            pop_id1, pop_id2,
                            min_num_genotypes=min_num_genotypes)

    computed_result = compute(
        {
            'Jxy': Jxy,
            'uJx': uJx,
            'uJy': uJy
        },
        silence_runtime_warnings=silence_runtime_warnings)
    computedJxy = computed_result['Jxy']
    computeduJx = computed_result['uJx']
    computeduJy = computed_result['uJy']

    n_pops = len(populations)
    dists = np.full(int((n_pops**2 - n_pops) / 2), np.nan)
    for idx, (pop_id1, pop_id2) in enumerate(combinations(pop_ids, 2)):
        if Jxy[pop_id1][pop_id2] is None:
            # Nothing was accumulated for this pair.
            unbiased_nei_identity = math.nan
        else:
            with np.errstate(invalid='ignore'):
                unbiased_nei_identity = (
                    computedJxy[pop_id1][pop_id2] /
                    math.sqrt(computeduJx[pop_id1][pop_id2] *
                              computeduJy[pop_id1][pop_id2]))
        nei_unbiased_distance = -math.log(unbiased_nei_identity)
        # Rounding may push the identity slightly above 1; clamp at 0.
        if nei_unbiased_distance < 0:
            nei_unbiased_distance = 0
        dists[idx] = nei_unbiased_distance
    return dists
def calc_dset_pop_distance(variations, max_alleles, populations,
                           min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                           min_call_dp_for_het=0,
                           silence_runtime_warnings=False):
    '''This is an implementation of the formulas proposed in GenAlex

    Calculates the pairwise Dest distance between populations and returns
    a condensed distance vector (length n*(n-1)/2) in combinations order;
    pairs with no accumulated data get nan.
    '''
    pop_ids = list(range(len(populations)))
    variations_per_pop = [keep_samples(variations, pop_samples)[FLT_VARS]
                          for pop_samples in populations]

    accumulated_hs = {}
    accumulated_ht = {}
    num_vars = {}
    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        vars_for_pop1 = variations_per_pop[pop_id1]
        vars_for_pop2 = variations_per_pop[pop_id2]
        res = _calc_pairwise_dest(vars_for_pop1, vars_for_pop2,
                                  max_alleles=max_alleles,
                                  min_call_dp_for_het=min_call_dp_for_het,
                                  min_num_genotypes=min_num_genotypes)
        # Only variations with a defined corrected_hs count towards the mean.
        num_vars_in_chunk = va.count_nonzero(~va.isnan(res['corrected_hs']))
        hs_in_chunk = va.nansum(res['corrected_hs'])
        ht_in_chunk = va.nansum(res['corrected_ht'])
        key = (pop_id1, pop_id2)
        # FIX: the guard used to test membership in a never-populated
        # `accumulated_dists` dict, so the accumulation branch was
        # unreachable; test the dict that is actually filled.
        if key in accumulated_hs:
            accumulated_hs[key] += hs_in_chunk
            accumulated_ht[key] += ht_in_chunk
            num_vars[key] += num_vars_in_chunk
        else:
            accumulated_hs[key] = hs_in_chunk
            accumulated_ht[key] = ht_in_chunk
            num_vars[key] = num_vars_in_chunk

    task = {
        'accumulated_hs': accumulated_hs,
        'accumulated_ht': accumulated_ht,
        'num_vars': num_vars
    }
    result = compute(task, silence_runtime_warnings=silence_runtime_warnings)
    computed_accumulated_hs = result['accumulated_hs']
    computed_accumulated_ht = result['accumulated_ht']
    computed_num_vars = result['num_vars']

    tot_n_pops = len(populations)
    dists = np.empty(int((tot_n_pops**2 - tot_n_pops) / 2))
    dists[:] = np.nan
    # Dest compares two populations at a time.
    num_pops = 2
    for idx, (pop_id1, pop_id2) in enumerate(combinations(pop_ids, 2)):
        key = pop_id1, pop_id2
        if key in accumulated_hs:
            with np.errstate(invalid='ignore'):
                corrected_hs = (computed_accumulated_hs[key] /
                                computed_num_vars[key])
                corrected_ht = (computed_accumulated_ht[key] /
                                computed_num_vars[key])
                dest = (num_pops / (num_pops - 1)) * (
                    (corrected_ht - corrected_hs) / (1 - corrected_hs))
        else:
            dest = np.nan
        dists[idx] = dest
    return dists
def calc_pop_pairwise_nei_dists_by_depth(variations, populations,
                                         silence_runtime_warnings=False):
    """Calculate the Nei distance between population pairs using
    depth-based allele frequencies.

    Returns a condensed distance vector (length n*(n-1)/2) in
    combinations order; a pair whose identity is undefined (log of a
    non-positive value) gets inf.
    """
    variations_per_pop = [keep_samples(variations, pop_samples)[FLT_VARS]
                          for pop_samples in populations]

    jxy = {}
    jxx = {}
    jyy = {}
    for pop_i, pop_j in combinations(range(len(populations)), 2):
        pop_i_vars = variations_per_pop[pop_i]
        pop_j_vars = variations_per_pop[pop_j]
        freq_al_i = calc_allele_freq_by_depth(pop_i_vars)
        freq_al_j = calc_allele_freq_by_depth(pop_j_vars)
        chunk_jxy = va.nansum(freq_al_i * freq_al_j)
        chunk_jxx = va.nansum(freq_al_i**2)
        chunk_jyy = va.nansum(freq_al_j**2)
        pop_idx = pop_i, pop_j
        if pop_idx not in jxy:
            jxy[pop_idx] = 0
            jxx[pop_idx] = 0
            jyy[pop_idx] = 0
        # The real Jxy is usually divided by num_snps, but it does not
        # not matter for the calculation
        jxy[pop_idx] += chunk_jxy
        jxx[pop_idx] += chunk_jxx
        jyy[pop_idx] += chunk_jyy

    # FIX: the jyy tasks were submitted under the key 'uJy', so the
    # lookup of computed_result['jyy'] below raised a KeyError.
    computed_result = compute(
        {
            'jxy': jxy,
            'jxx': jxx,
            'jyy': jyy
        },
        silence_runtime_warnings=silence_runtime_warnings)
    computedjxy = computed_result['jxy']
    computedjxx = computed_result['jxx']
    computedjyy = computed_result['jyy']

    n_pops = len(populations)
    dists = np.zeros(int((n_pops**2 - n_pops) / 2))
    for index, pop_idx in enumerate(combinations(range(len(populations)), 2)):
        pjxy = computedjxy[pop_idx]
        pjxx = computedjxx[pop_idx]
        pjyy = computedjyy[pop_idx]
        try:
            nei = math.log(pjxy / math.sqrt(pjxx * pjyy))
            if nei != 0:
                nei = -nei
        except ValueError:
            # log of a non-positive identity: populations share nothing.
            nei = float('inf')
        dists[index] = nei
    return dists