Ejemplo n.º 1
0
    def test_sum(self):
        """va.sum must give the same total for numpy and dask inputs."""
        values = np.array([1, 2, 3, 4, 5])
        expected_total = 15

        # A numpy array is reduced directly to a comparable value.
        self.assertEqual(va.sum(values), expected_total)

        # For a dask array the result has to be computed before comparing.
        pending_total = va.sum(da.from_array(values))
        self.assertEqual(pending_total.compute(), expected_total)
Ejemplo n.º 2
0
def calc_diversities(variations,
                     max_alleles,
                     min_num_genotypes,
                     min_call_dp_for_het_call=MIN_DP_FOR_CALL_HET,
                     polymorphic_threshold=0.95):
    """Collect several diversity statistics for ``variations`` in a dict.

    The returned dict has these keys:

    * ``'num_variable_vars'``: number of non-NaN MAFs below 0.9999999999.
    * ``'num_polymorphic_vars'``: number of non-NaN MAFs that are
      ``<= polymorphic_threshold``.
    * ``'exp_het'``: NaN-ignoring mean of ``calc_expected_het``.
    * ``'obs_het'``: NaN-ignoring mean of ``calc_obs_het``.
    * ``'num_total_variations'``: ``variations.num_variations``.
    """
    maf_per_var = calc_maf_by_gt(variations,
                                 max_alleles,
                                 min_num_genotypes=min_num_genotypes)

    # NaN MAFs are left out of both counts below.
    known_mafs = maf_per_var[va.logical_not(va.isnan(maf_per_var))]

    num_variable = va.sum(known_mafs < 0.9999999999)
    num_polymorphic = va.sum(known_mafs <= polymorphic_threshold)

    mean_exp_het = va.nanmean(
        calc_expected_het(variations,
                          max_alleles=max_alleles,
                          min_num_genotypes=min_num_genotypes))

    mean_obs_het = va.nanmean(
        calc_obs_het(variations,
                     min_call_dp_for_het_call=min_call_dp_for_het_call,
                     min_num_genotypes=min_num_genotypes))

    return {
        'num_variable_vars': num_variable,
        'num_polymorphic_vars': num_polymorphic,
        'exp_het': mean_exp_het,
        'obs_het': mean_obs_het,
        'num_total_variations': variations.num_variations,
    }
Ejemplo n.º 3
0
def calc_allele_freq_by_depth(variations):
    """Return per-variation allele frequencies derived from the AD field.

    Entries equal to -1 are set to 0 before summing.  The AD array is
    summed along axis 1, and each resulting row is divided by its own total.

    NOTE(review): the -1 entries are overwritten in place on the array
    returned by ``variations[AD_FIELD]``; whether that also changes the data
    stored in ``variations`` depends on what the lookup returns -- confirm.
    """
    depths = variations[AD_FIELD]
    depths[depths == -1] = 0

    depth_per_allele = va.sum(depths, axis=1)
    depth_per_var = va.sum(depth_per_allele, axis=1)

    # The extra axis lets every row be divided by its own total.
    return depth_per_allele / depth_per_var[:, None]
Ejemplo n.º 4
0
def keep_variable_variations(variations,
                             max_alleles,
                             filter_id='variable_variations'):
    """Keep the variations in which more than one allele was counted.

    A variation is kept when it has at least one genotype with a
    non-missing allele and ``count_alleles`` reports a positive count for
    more than one allele.

    Returns a dict holding the selected variations (``FLT_VARS``), the
    given ``filter_id`` (``FLT_ID``) and the kept / filtered-out counts
    (``FLT_STATS``).
    """
    gts = variations[GT_FIELD]

    # First reduce over axis 2 (any non-missing allele in the call), then
    # over axis 1 (any such call in the variation).
    has_called_gt = va.any(va.any(gts != MISSING_INT, axis=2), axis=1)

    counts_per_allele = count_alleles(gts,
                                      max_alleles=max_alleles,
                                      count_missing=False)
    has_several_alleles = va.sum(counts_per_allele > 0, axis=1) > 1

    keep_mask = va.logical_and(has_called_gt, has_several_alleles)
    kept_variations = variations.get_vars(keep_mask)

    stats = {
        N_KEPT: va.count_nonzero(keep_mask),
        N_FILTERED_OUT: va.count_nonzero(va.logical_not(keep_mask)),
    }

    return {
        FLT_VARS: kept_variations,
        FLT_ID: filter_id,
        FLT_STATS: stats
    }
Ejemplo n.º 5
0
def _calc_obs_het_counts(variations, axis, min_call_dp_for_het_call,
                         max_call_dp_for_het_call=None):
    """Count heterozygous calls and usable calls along ``axis``.

    A call is treated as missing when any of its alleles equals
    ``MISSING_INT``, or when its depth is below
    ``min_call_dp_for_het_call`` / above ``max_call_dp_for_het_call``
    (each limit is skipped when it is None).

    Returns a tuple ``(het_counts, non_missing_counts)``.
    """
    missing = va.any(variations[GT_FIELD] == MISSING_INT, axis=2)

    check_min = min_call_dp_for_het_call is not None
    check_max = max_call_dp_for_het_call is not None

    # The DP field is only read when at least one depth limit is given.
    if check_min or check_max:
        depths = variations[DP_FIELD]
        if check_min:
            missing = va.logical_or(missing,
                                    depths < min_call_dp_for_het_call)
        if check_max:
            missing = va.logical_or(missing,
                                    depths > max_call_dp_for_het_call)

    het_calls = _call_is_het(variations, is_missing=missing)

    num_het = va.sum(het_calls, axis=axis)
    num_called = va.sum(va.logical_not(missing), axis=axis)
    return (num_het, num_called)
Ejemplo n.º 6
0
def gts_as_mat012(gts):
    '''Recode a genotype array into one code per call.

    The alleles of each call are summed along axis 2 and the result is
    recoded in this order:

    1. calls with any allele equal to MISSING_INT become MISSING_INT,
    2. every value >= 1 becomes 2,
    3. calls coded 2 that contain at least one 0 allele become 1.

    For non-missing calls this leaves 0 when all alleles are 0, 1 when
    allele 0 is mixed with another allele, and 2 otherwise.
    '''
    # Both masks depend only on the untouched input array.
    any_missing = va.any(gts == MISSING_INT, axis=2)
    any_zero_allele = va.any(gts == 0, axis=2)

    coded = va.sum(gts, axis=2)
    coded[any_missing] = MISSING_INT
    coded[coded >= 1] = 2
    coded[va.logical_and(coded == 2, any_zero_allele)] = 1

    return coded
Ejemplo n.º 7
0
def calc_maf_by_gt(variations, max_alleles,
                   min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return, per variation, the frequency of its most counted allele.

    Allele counts come from ``count_alleles`` over the GT field with
    missing alleles left out.  The result is passed through
    ``_mask_stats_with_few_samples`` together with ``min_num_genotypes``.
    """
    counts = count_alleles(variations[GT_FIELD], max_alleles,
                           count_missing=False)

    top_count = va.max(counts, axis=1)
    total_count = va.sum(counts, axis=1)

    # Silence the "invalid value" warning raised by 0 / 0 divisions.
    with numpy.errstate(invalid='ignore'):
        maf = top_count / total_count

    return _mask_stats_with_few_samples(maf, variations, min_num_genotypes)
Ejemplo n.º 8
0
def hmean(array, axis=0, dtype=None):
    """Return the harmonic mean of ``array`` along ``axis``.

    With ``axis=None`` the array is flattened first.  Positions whose sum
    of inverses is not finite are set to NaN in the result.

    NOTE(review): with ``axis=None`` the reduction presumably yields a
    scalar; confirm that the masked assignment below supports that case.
    """
    if axis is None:
        array = array.ravel()
    num_items = array.shape[0 if axis is None else axis]

    # 1 / 0 gives inf here; those positions are masked out afterwards.
    with np.errstate(divide='ignore'):
        sum_of_inverses = va.sum(1.0 / array, axis=axis, dtype=dtype)

    not_finite = va.logical_not(va.isfinite(sum_of_inverses))
    result = num_items / sum_of_inverses
    result[not_finite] = np.nan

    return result
Ejemplo n.º 9
0
def calc_maf_by_allele_count(variations,
                             min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return, per variation, the frequency of the most observed allele.

    The RO and AO fields are summed along axis 1; the largest of those
    sums is divided by the total of all of them.  The result is passed
    through ``_mask_stats_with_few_samples`` together with
    ``min_num_genotypes``.

    NOTE(review): MISSING_INT entries are zeroed in place on the arrays
    returned by ``variations[RO_FIELD]`` / ``variations[AO_FIELD]``;
    whether that also changes the data stored in ``variations`` depends on
    what the lookup returns -- confirm.
    """
    ro = variations[RO_FIELD]
    ao = variations[AO_FIELD]

    # Missing entries must not contribute to the sums.
    ro[ro == MISSING_INT] = 0
    ao[ao == MISSING_INT] = 0

    ro_sum = va.sum(ro, axis=1)
    ao_sum = va.sum(ao, axis=1)

    # Reuse ao_sum instead of summing the AO array a second time.
    max_ = ao_sum.max(axis=1)

    sum_ = ao_sum.sum(axis=1) + ro_sum

    # we modify the max_ to update the values that are bigger in ro
    # here we have a setter that works different in numpy and dask
    va.assign_with_mask(array=max_, using=ro_sum, mask=ro_sum > max_)

    # Silence the "invalid value" warning raised by 0 / 0 divisions.
    with numpy.errstate(invalid='ignore'):
        mafs = max_ / sum_

    return _mask_stats_with_few_samples(mafs, variations, min_num_genotypes)
Ejemplo n.º 10
0
def calc_allele_freq(variations, max_alleles,
                     min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the allele frequencies of every variation.

    Each row of allele counts (missing alleles left out) is divided by its
    total, and the result is passed through
    ``_mask_stats_with_few_samples`` together with ``min_num_genotypes``.
    When the GT field has no rows, ``va.empty_array(variations)`` is
    returned instead.

    Raises:
        ValueError: if ``count_alleles`` returns None.
    """
    gts = variations[GT_FIELD]
    if gts.shape[0] == 0:
        return va.empty_array(variations)

    counts = count_alleles(gts, max_alleles, count_missing=False)
    if counts is None:
        raise ValueError('No alleles, everything is missing data')

    counts_per_var = va.sum(counts, axis=1)

    # Silence the "invalid value" warning raised by 0 / 0 divisions.
    with numpy.errstate(invalid='ignore'):
        freqs = counts / counts_per_var[:, None]

    return _mask_stats_with_few_samples(freqs, variations, min_num_genotypes)
Ejemplo n.º 11
0
def _calc_allele_freq_and_unbiased_J_per_locus(variations, max_alleles,
                                               min_num_genotypes):
    """Return the allele frequencies and the per-locus unbiased J value.

    The second item is computed, per variation, as
    ``(2 * n * sum(freq ** 2) - 1) / (2 * n - 1)``, where ``n`` is the size
    of axis 1 of the GT field.

    Returns:
        A tuple ``(allele_freq, xUb_per_locus)``.  Both items are None when
        ``calc_allele_freq`` raises ValueError or returns None.
    """
    try:
        allele_freq = calc_allele_freq(variations,
                                       max_alleles=max_alleles,
                                       min_num_genotypes=min_num_genotypes)
    except ValueError:
        return None, None

    # Guard explicitly: without it a None result would reach the return
    # statement with the second value never assigned.
    if allele_freq is None:
        return None, None

    n_indi = variations[GT_FIELD].shape[1]
    sum_sq_freqs = va.sum(allele_freq**2, axis=1)
    xUb_per_locus = ((2 * n_indi * sum_sq_freqs) - 1) / (2 * n_indi - 1)

    return allele_freq, xUb_per_locus
Ejemplo n.º 12
0
def calc_expected_het(variations, max_alleles,
                      min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the expected heterozygosity of every variation.

    It is computed as ``1 - sum(freq ** ploidy)`` per variation, taking the
    ploidy from the size of axis 2 of the GT field.  When
    ``calc_allele_freq`` raises ValueError an all-NaN array with one entry
    per variation is returned; when there are no allele-frequency rows,
    ``va.empty_array(variations)`` is returned.
    """
    try:
        freqs = calc_allele_freq(variations, max_alleles=max_alleles,
                                 min_num_genotypes=min_num_genotypes)
    except ValueError:
        all_nan = va.create_not_initialized_array_in_memory(
            (variations.num_variations,))
        all_nan[:] = numpy.nan
        return all_nan

    if freqs.shape[0] == 0:
        return va.empty_array(variations)

    ploidy = variations[GT_FIELD].shape[2]
    return 1 - va.sum(freqs ** ploidy, axis=1)
Ejemplo n.º 13
0
def _calc_j_stats_per_locus(variations1,
                            variations2,
                            max_alleles,
                            min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    """Return the per-locus J statistics of two sets of variations.

    Returns a tuple ``(xUb_per_locus, yUb_per_locus, Jxy_per_locus)``: the
    unbiased J of each set, followed by the per-variation sum of the
    products of their allele frequencies.  All three items are None when
    the allele frequencies of either set are None.
    """
    # Same computation for both sets, first set first.
    (allele_freq1, xUb_per_locus), (allele_freq2, yUb_per_locus) = [
        _calc_allele_freq_and_unbiased_J_per_locus(
            variations,
            max_alleles=max_alleles,
            min_num_genotypes=min_num_genotypes)
        for variations in (variations1, variations2)
    ]

    if allele_freq1 is None or allele_freq2 is None:
        return None, None, None

    return (xUb_per_locus,
            yUb_per_locus,
            va.sum(allele_freq1 * allele_freq2, axis=1))
Ejemplo n.º 14
0
def _calc_pairwise_dest(vars_for_pop1, vars_for_pop2, max_alleles,
                        min_call_dp_for_het, min_num_genotypes):
    """Compute the corrected Hs and Ht per variation for two populations.

    Returns a dict with the keys ``'corrected_hs'`` and ``'corrected_ht'``.
    Both arrays are filled with NaN when the harmonic mean of the called
    genotype counts is unavailable; otherwise they are NaN only for the
    variations in which either population has fewer than
    ``min_num_genotypes`` called genotypes.
    """
    num_pops = 2
    ploidy = vars_for_pop1.ploidy

    # min_num_genotypes=0 here; the filter is applied at the end instead.
    freqs1 = calc_allele_freq(vars_for_pop1,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)
    freqs2 = calc_allele_freq(vars_for_pop2,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)

    exp_het1 = 1 - va.sum(freqs1**ploidy, axis=1)
    exp_het2 = 1 - va.sum(freqs2**ploidy, axis=1)
    hs_per_var = (exp_het1 + exp_het2) / 2

    # Ht uses the unweighted mean of both populations' allele frequencies.
    pooled_freqs = (freqs1 + freqs2) / 2
    ht_per_var = 1 - va.sum(pooled_freqs**ploidy, axis=1)

    het_counts1, called_gts1 = _calc_obs_het_counts(
        vars_for_pop1, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het1 = het_counts1 / called_gts1
    het_counts2, called_gts2 = _calc_obs_het_counts(
        vars_for_pop2, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het2 = het_counts2 / called_gts2

    called_gts = va.stack([called_gts1, called_gts2], as_type_of=called_gts1)

    try:
        called_gts_hmean = hmean(called_gts, axis=0)
    except ValueError:
        called_gts_hmean = None

    if called_gts_hmean is None:
        # No harmonic mean available: report NaN for every variation.
        nan_shape = (vars_for_pop1.num_variations, )
        corrected_hs = va.full(nan_shape,
                               np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])
        corrected_ht = va.full(nan_shape,
                               np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])
        return {'corrected_hs': corrected_hs, 'corrected_ht': corrected_ht}

    mean_obs_het_per_var = va.nanmean(va.stack([obs_het1, obs_het2],
                                               as_type_of=obs_het1),
                                      axis=0)

    sample_size_factor = called_gts_hmean / (called_gts_hmean - 1)
    obs_het_term = mean_obs_het_per_var / (2 * called_gts_hmean)
    corrected_hs = sample_size_factor * (hs_per_var - obs_het_term)

    corrected_ht = (ht_per_var
                    + (corrected_hs / (called_gts_hmean * num_pops))
                    - (mean_obs_het_per_var /
                       (2 * called_gts_hmean * num_pops)))

    too_few_gts = va.logical_or(called_gts1 < min_num_genotypes,
                                called_gts2 < min_num_genotypes)
    corrected_hs[too_few_gts] = np.nan
    corrected_ht[too_few_gts] = np.nan

    return {'corrected_hs': corrected_hs, 'corrected_ht': corrected_ht}