def test_sum(self):
    """Check that ``va.sum`` adds up 1..5 to 15 for two kinds of input.

    The same values are passed once as a numpy array and once wrapped with
    ``da.from_array``; in the second case the result is obtained by calling
    ``compute()`` on what ``va.sum`` returns.
    """
    values = np.array([1, 2, 3, 4, 5])
    expected_total = 15

    # numpy input: the returned value is compared directly.
    self.assertEqual(va.sum(values), expected_total)

    # Input built with da.from_array: the result has to be computed first.
    pending_total = va.sum(da.from_array(values))
    self.assertEqual(pending_total.compute(), expected_total)
def calc_diversities(variations, max_alleles, min_num_genotypes,
                     min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                     polymorphic_threshold=0.95):
    """Gather several diversity summaries for ``variations`` in one dict.

    Args:
        variations: object passed on to ``calc_maf_by_gt``,
            ``calc_expected_het`` and ``calc_obs_het``; it must also expose
            ``num_variations``.
        max_alleles: forwarded to the MAF and expected-het calculations.
        min_num_genotypes: forwarded to all three per-variation statistics.
        min_call_dp_for_het_call: forwarded to ``calc_obs_het``.
        polymorphic_threshold: a variation is counted as polymorphic when
            its MAF value is less than or equal to this number.

    Returns:
        dict with the keys ``num_variable_vars``, ``num_polymorphic_vars``,
        ``exp_het``, ``obs_het`` and ``num_total_variations``.
    """
    mafs = calc_maf_by_gt(variations, max_alleles,
                          min_num_genotypes=min_num_genotypes)
    # NaN entries are dropped before any counting is done.
    valid_mafs = mafs[va.logical_not(va.isnan(mafs))]

    # The entries are evaluated top to bottom, so the helper calls happen in
    # the order in which the keys are listed.
    return {
        # Anything strictly below (almost) 1 is counted as variable.
        'num_variable_vars': va.sum(valid_mafs < 0.9999999999),
        'num_polymorphic_vars': va.sum(valid_mafs <= polymorphic_threshold),
        'exp_het': va.nanmean(
            calc_expected_het(variations,
                              max_alleles=max_alleles,
                              min_num_genotypes=min_num_genotypes)),
        'obs_het': va.nanmean(
            calc_obs_het(variations,
                         min_call_dp_for_het_call=min_call_dp_for_het_call,
                         min_num_genotypes=min_num_genotypes)),
        'num_total_variations': variations.num_variations,
    }
def calc_allele_freq_by_depth(variations):
    """Return allele frequencies derived from the ``AD_FIELD`` counts.

    The counts are summed over axis 1, and each resulting row is divided by
    its own total.

    Args:
        variations: mapping-like object indexed with ``AD_FIELD``.

    Returns:
        The axis-1 sums divided row-wise by their totals.
    """
    depths = variations[AD_FIELD]
    # Entries equal to -1 are zeroed so they do not contribute to the sums.
    # NOTE(review): this assignment is done in place on whatever
    # variations[AD_FIELD] returns; confirm that callers do not rely on the
    # -1 values afterwards.
    depths[depths == -1] = 0

    counts_per_allele = va.sum(depths, axis=1)
    row_totals = va.sum(counts_per_allele, axis=1)
    # [:, None] turns the totals into a column so the division is row-wise.
    return counts_per_allele / row_totals[:, None]
def keep_variable_variations(variations, max_alleles,
                             filter_id='variable_variations'):
    """Keep the variations that have called genotypes and several alleles.

    A variation is selected when both conditions hold:
      * at least one genotype entry differs from ``MISSING_INT``;
      * more than one allele has a count above zero according to
        ``count_alleles`` (missing values are not counted).

    Args:
        variations: object indexed with ``GT_FIELD`` and offering
            ``get_vars(mask)``.
        max_alleles: forwarded to ``count_alleles``.
        filter_id: identifier stored in the result under ``FLT_ID``.

    Returns:
        dict with ``FLT_VARS`` (the selected variations), ``FLT_ID`` and
        ``FLT_STATS`` (kept / filtered-out counts).
    """
    gts = variations[GT_FIELD]

    # Reduce over axis 2 first, then over axis 1, to get one flag per row.
    has_called_gt = va.any(va.any(gts != MISSING_INT, axis=2), axis=1)

    counts = count_alleles(gts, max_alleles=max_alleles, count_missing=False)
    has_several_alleles = va.sum(counts > 0, axis=1) > 1

    keep_mask = va.logical_and(has_called_gt, has_several_alleles)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }
    return {FLT_VARS: kept_variations, FLT_ID: filter_id, FLT_STATS: stats}
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    """Count heterozygous calls and usable calls along ``axis``.

    A call is treated as missing when any of its genotype entries equals
    ``MISSING_INT`` or, if depth limits are given, when its ``DP_FIELD``
    value falls below the minimum or above the maximum.

    Args:
        variations: object indexed with ``GT_FIELD`` and ``DP_FIELD``.
        axis: axis along which both counts are summed.
        min_call_dp_for_het_call: lower depth limit, or None to skip it.
        max_call_dp_for_het_call: upper depth limit, or None to skip it.

    Returns:
        tuple ``(het_counts, called_counts)``.
    """
    not_usable = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    apply_min = min_call_dp_for_het_call is not None
    apply_max = max_call_dp_for_het_call is not None

    # The depth field is only read when at least one limit is requested.
    if apply_min or apply_max:
        depths = variations[DP_FIELD]
        if apply_min:
            not_usable = va.logical_or(not_usable,
                                       depths < min_call_dp_for_het_call)
        if apply_max:
            not_usable = va.logical_or(not_usable,
                                       depths > max_call_dp_for_het_call)

    het_calls = _call_is_het(variations, is_missing=not_usable)

    het_counts = va.sum(het_calls, axis=axis)
    called_counts = va.sum(va.logical_not(not_usable), axis=axis)
    return het_counts, called_counts
def gts_as_mat012(gts):
    """Recode a genotype array as 0 / 1 / 2 per call.

    Each call is reduced over axis 2 and coded as:
      * 0 when the allele codes sum to 0;
      * 1 when the sum is at least 1 and one of the alleles is 0;
      * 2 when the sum is at least 1 and none of the alleles is 0;
      * ``MISSING_INT`` when any allele of the call is ``MISSING_INT``
        (this assumes ``MISSING_INT`` itself is below 1, otherwise the later
        steps would overwrite it -- TODO confirm).

    Args:
        gts: genotype array whose axis 2 holds the alleles of one call.

    Returns:
        Array with one code per call (axis 2 removed).
    """
    # Both masks depend only on the input, so they can be built up front.
    call_is_missing = va.any(gts == MISSING_INT, axis=2)
    call_has_allele_0 = va.any(gts == 0, axis=2)

    coded = va.sum(gts, axis=2)

    # The order of the three assignments matters: each one reads the codes
    # left by the previous one.
    coded[call_is_missing] = MISSING_INT
    coded[coded >= 1] = 2
    coded[va.logical_and(coded == 2, call_has_allele_0)] = 1
    return coded
def calc_maf_by_gt(variations, max_alleles,
                   min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return, per variation, the share of the most frequent allele.

    Alleles are counted from the ``GT_FIELD`` genotypes with
    ``count_alleles`` (missing values excluded); the largest count of each
    row is divided by the row total.

    Args:
        variations: object indexed with ``GT_FIELD``.
        max_alleles: forwarded to ``count_alleles``.
        min_num_genotypes: forwarded to ``_mask_stats_with_few_samples``.

    Returns:
        Whatever ``_mask_stats_with_few_samples`` returns for the ratios.
    """
    counts = count_alleles(variations[GT_FIELD], max_alleles,
                           count_missing=False)
    top_count = va.max(counts, axis=1)
    total_count = va.sum(counts, axis=1)

    # Rows without any counted allele give 0 / 0; silence that warning.
    with numpy.errstate(invalid='ignore'):
        ratios = top_count / total_count

    return _mask_stats_with_few_samples(ratios, variations, min_num_genotypes)
def hmean(array, axis=0, dtype=None):
    """Harmonic mean of ``array`` along ``axis``.

    Computed as ``size / sum(1 / array)``. Positions where the sum of the
    inverses is not finite are set to NaN in the result.

    Args:
        array: input array.
        axis: axis to reduce; with None the array is flattened first.
        dtype: forwarded to ``va.sum``.

    Returns:
        The harmonic mean, with NaN where it could not be computed.
    """
    if axis is None:
        array = array.ravel()
        num_items = array.shape[0]
    else:
        num_items = array.shape[axis]

    # Zeros in the input give a division by zero; the warning is silenced
    # because those positions are turned into NaN below.
    with np.errstate(divide='ignore'):
        sum_of_inverses = va.sum(1.0 / array, axis=axis, dtype=dtype)

    not_finite = va.logical_not(va.isfinite(sum_of_inverses))
    result = num_items / sum_of_inverses
    result[not_finite] = np.nan
    return result
def calc_maf_by_allele_count(variations,
                             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return, per variation, the share of the most observed allele.

    The statistic is based on the ``RO_FIELD`` and ``AO_FIELD`` counts: both
    are summed over axis 1, the largest of the resulting counts is taken for
    each row and divided by the sum of all of them.

    Args:
        variations: object indexed with ``RO_FIELD`` and ``AO_FIELD``. The
            ``AO_FIELD`` data is reduced over axis 1 twice, so it needs at
            least three dimensions.
        min_num_genotypes: forwarded to ``_mask_stats_with_few_samples``.

    Returns:
        Whatever ``_mask_stats_with_few_samples`` returns for the ratios.
    """
    ro = variations[RO_FIELD]
    ao = variations[AO_FIELD]

    # Missing counts must not contribute to the sums.
    # NOTE(review): these assignments are done in place on whatever the
    # variations object returns; confirm callers do not need the
    # MISSING_INT markers afterwards.
    ro[ro == MISSING_INT] = 0
    ao[ao == MISSING_INT] = 0

    ro_sum = va.sum(ro, axis=1)
    ao_sum = va.sum(ao, axis=1)

    # Reuse ao_sum instead of reducing ``ao`` a second time: ``ao`` is not
    # modified between the two reductions, so the result is the same.
    max_ = ao_sum.max(axis=1)
    sum_ = ao_sum.sum(axis=1) + ro_sum

    # we modify the max_ to update the values that are bigger in ro
    # here we have a setter that works different in numpy and dask
    va.assign_with_mask(array=max_, using=ro_sum, mask=ro_sum > max_)

    # Rows without any observation give 0 / 0; silence that warning.
    with numpy.errstate(invalid='ignore'):
        mafs = max_ / sum_

    return _mask_stats_with_few_samples(mafs, variations, min_num_genotypes)
def calc_allele_freq(variations, max_alleles,
                     min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the allele frequencies of each variation.

    Alleles are counted from the ``GT_FIELD`` genotypes with
    ``count_alleles`` (missing values excluded) and every row of counts is
    divided by its own total.

    Args:
        variations: object indexed with ``GT_FIELD``.
        max_alleles: forwarded to ``count_alleles``.
        min_num_genotypes: forwarded to ``_mask_stats_with_few_samples``.

    Returns:
        ``va.empty_array(variations)`` when the genotype array has no rows,
        otherwise the result of ``_mask_stats_with_few_samples`` applied to
        the frequencies.

    Raises:
        ValueError: if ``count_alleles`` returns None.
    """
    gts = variations[GT_FIELD]
    if gts.shape[0] == 0:
        return va.empty_array(variations)

    counts = count_alleles(gts, max_alleles, count_missing=False)
    if counts is None:
        raise ValueError('No alleles, everything is missing data')

    row_totals = va.sum(counts, axis=1)
    # Rows without any counted allele give 0 / 0; silence that warning.
    with numpy.errstate(invalid='ignore'):
        freqs = counts / row_totals[:, None]

    return _mask_stats_with_few_samples(freqs, variations, min_num_genotypes)
def _calc_allele_freq_and_unbiased_J_per_locus(variations, max_alleles,
                                               min_num_genotypes):
    """Return the allele frequencies and a per-locus corrected J value.

    With ``n`` taken from ``shape[1]`` of the ``GT_FIELD`` array, the second
    value is ``(2 * n * sum(freq ** 2) - 1) / (2 * n - 1)``, where the sum
    runs over axis 1 of the frequencies.

    Args:
        variations: object indexed with ``GT_FIELD``.
        max_alleles: forwarded to ``calc_allele_freq``.
        min_num_genotypes: forwarded to ``calc_allele_freq``.

    Returns:
        tuple ``(allele_freq, xUb_per_locus)``; both are None when
        ``calc_allele_freq`` raises ValueError or returns None.
    """
    try:
        allele_freq = calc_allele_freq(variations,
                                       max_alleles=max_alleles,
                                       min_num_genotypes=min_num_genotypes)
    except ValueError:
        return None, None

    if allele_freq is None:
        return None, None

    n_indi = variations[GT_FIELD].shape[1]
    sum_sq_freqs = va.sum(allele_freq**2, axis=1)
    xUb_per_locus = ((2 * n_indi * sum_sq_freqs) - 1) / (2 * n_indi - 1)
    return allele_freq, xUb_per_locus
def calc_expected_het(variations, max_alleles,
                      min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the expected heterozygosity of each variation.

    Computed as ``1 - sum(freq ** ploidy)`` over axis 1 of the allele
    frequencies, with the ploidy read from ``shape[2]`` of the ``GT_FIELD``
    array.

    Args:
        variations: object indexed with ``GT_FIELD`` that also exposes
            ``num_variations``.
        max_alleles: forwarded to ``calc_allele_freq``.
        min_num_genotypes: forwarded to ``calc_allele_freq``.

    Returns:
        * an array of ``num_variations`` NaNs when ``calc_allele_freq``
          raises ValueError;
        * ``va.empty_array(variations)`` when there are no frequency rows;
        * otherwise the expected heterozygosity per variation.
    """
    try:
        allele_freq = calc_allele_freq(variations,
                                       max_alleles=max_alleles,
                                       min_num_genotypes=min_num_genotypes)
    except ValueError:
        # Nothing could be counted: answer with NaN for every variation.
        all_nan = va.create_not_initialized_array_in_memory(
            (variations.num_variations,))
        all_nan[:] = numpy.nan
        return all_nan

    if allele_freq.shape[0] == 0:
        return va.empty_array(variations)

    ploidy = variations[GT_FIELD].shape[2]
    return 1 - va.sum(allele_freq ** ploidy, axis=1)
def _calc_j_stats_per_locus(variations1, variations2, max_alleles,
                            min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the per-locus J statistics for two sets of variations.

    The allele frequencies and the corrected per-locus value are obtained
    for each set with ``_calc_allele_freq_and_unbiased_J_per_locus``; the
    cross term is the axis-1 sum of the product of both frequency arrays.

    Args:
        variations1: first set of variations.
        variations2: second set of variations.
        max_alleles: forwarded to the per-set helper.
        min_num_genotypes: forwarded to the per-set helper.

    Returns:
        tuple ``(xUb_per_locus, yUb_per_locus, Jxy_per_locus)``, or
        ``(None, None, None)`` when the frequencies of either set are None.
    """
    allele_freq1, xUb_per_locus = _calc_allele_freq_and_unbiased_J_per_locus(
        variations1,
        max_alleles=max_alleles,
        min_num_genotypes=min_num_genotypes)
    allele_freq2, yUb_per_locus = _calc_allele_freq_and_unbiased_J_per_locus(
        variations2,
        max_alleles=max_alleles,
        min_num_genotypes=min_num_genotypes)

    # Without frequencies for both sets there is nothing to compare.
    if allele_freq1 is None or allele_freq2 is None:
        return None, None, None

    Jxy_per_locus = va.sum(allele_freq1 * allele_freq2, axis=1)
    return xUb_per_locus, yUb_per_locus, Jxy_per_locus
def _calc_pairwise_dest(vars_for_pop1, vars_for_pop2, max_alleles,
                        min_call_dp_for_het, min_num_genotypes):
    """Return the corrected Hs and Ht per variation for two populations.

    Judging by the function name these are the terms needed for a pairwise
    Dest calculation; this function only returns the two corrected
    heterozygosities.

    Args:
        vars_for_pop1: variations of the first population; must expose
            ``ploidy`` and ``num_variations`` and be indexable with
            ``GT_FIELD``.
        vars_for_pop2: variations of the second population.
        max_alleles: forwarded to ``calc_allele_freq``.
        min_call_dp_for_het: forwarded to ``_calc_obs_het_counts`` as the
            minimum depth.
        min_num_genotypes: variations with fewer called genotypes than this
            in either population get NaN in both results.

    Returns:
        dict with the keys ``corrected_hs`` and ``corrected_ht``.
    """
    num_pops = 2
    ploidy = vars_for_pop1.ploidy

    # --- expected heterozygosities (no sample-size masking: min is 0) ---
    freqs1 = calc_allele_freq(vars_for_pop1,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)
    freqs2 = calc_allele_freq(vars_for_pop2,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)

    exp_het1 = 1 - va.sum(freqs1**ploidy, axis=1)
    exp_het2 = 1 - va.sum(freqs2**ploidy, axis=1)
    hs_per_var = (exp_het1 + exp_het2) / 2

    # Ht is the expected heterozygosity of the averaged frequencies.
    mean_freqs = (freqs1 + freqs2) / 2
    ht_per_var = 1 - va.sum(mean_freqs**ploidy, axis=1)

    # --- observed heterozygosities ---
    het_counts1, called_gts1 = _calc_obs_het_counts(
        vars_for_pop1, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het1 = het_counts1 / called_gts1
    het_counts2, called_gts2 = _calc_obs_het_counts(
        vars_for_pop2, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het2 = het_counts2 / called_gts2

    # --- harmonic mean of the number of called genotypes ---
    stacked_called = va.stack([called_gts1, called_gts2],
                              as_type_of=called_gts1)
    try:
        n_hmean = hmean(stacked_called, axis=0)
    except ValueError:
        n_hmean = None

    if n_hmean is None:
        # No usable sample sizes: both results are NaN everywhere.
        num_vars = vars_for_pop1.num_variations
        corrected_hs = va.full((num_vars, ), np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])
        corrected_ht = va.full((num_vars, ), np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])
    else:
        mean_obs_het = va.nanmean(
            va.stack([obs_het1, obs_het2], as_type_of=obs_het1), axis=0)
        corrected_hs = (n_hmean / (n_hmean - 1)) * (
            hs_per_var - (mean_obs_het / (2 * n_hmean)))
        corrected_ht = ht_per_var + (corrected_hs / (n_hmean * num_pops)) - (
            mean_obs_het / (2 * n_hmean * num_pops))

    # Variations with too few called genotypes in either population are
    # blanked out in both results.
    too_few_gts = va.logical_or(called_gts1 < min_num_genotypes,
                                called_gts2 < min_num_genotypes)
    corrected_hs[too_few_gts] = np.nan
    corrected_ht[too_few_gts] = np.nan

    return {'corrected_hs': corrected_hs, 'corrected_ht': corrected_ht}