Example #1
    def test_kosman_missing_in_memory(self):
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_ab = _kosman(vars1, vars2)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_cd = _kosman(vars1, vars2)

        assert np.all(distance_ab == distance_cd)
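
Both genotype pairs above lose the same two leading SNPs to missing calls, so the per-SNP comparison arrays come out identical. Below is a minimal sketch of that idea for the biallelic genotypes used in this test; it is an illustrative re-implementation, not the library's _kosman, and kosman_per_snp_sketch is a hypothetical helper:

import numpy as np

def kosman_per_snp_sketch(gts_x, gts_y, missing=-1):
    # Drop SNPs with any missing allele in either individual.
    keep = ~(np.any(gts_x == missing, axis=1) | np.any(gts_y == missing, axis=1))
    x = np.sort(gts_x[keep], axis=1)
    y = np.sort(gts_y[keep], axis=1)
    # 0.0 same unordered genotype, 0.5 one shared allele, 1.0 none shared
    return np.mean(x != y, axis=1)

# np.array_equal(kosman_per_snp_sketch(a, b), kosman_per_snp_sketch(c, d)) -> True
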
Example #2
    def test_calc_missing(self):
        variations = create_non_materialized_snp_filtered_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=False)
        result = compute({'num_missing_gts': task})
        self.assertTrue(
            np.array_equal(result['num_missing_gts'], [1, 1, 1, 0, 2, 2, 1]))
        variations = create_non_materialized_snp_filtered_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=True)
        result = compute({'num_missing_gts': task})
        expected = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
        for a, b in zip(result['num_missing_gts'], expected):
            self.assertAlmostEqual(a, b, places=2)

        variations = create_dask_variations()
        task = calc_missing_gt_per_sample(variations, rates=True)
        result = compute({'num_missing_gts': task})

        variations = create_non_materialized_snp_filtered_variations()
        try:
            task = calc_missing_gt_per_sample(variations, rates=True)
            self.fail('NotMaterializedError expected')
        except NotMaterializedError:
            pass

        variations = create_dask_variations()
        task = calc_missing_gt_per_sample(variations, rates=False)
        result = compute({'num_missing_gts': task})
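
The two expected arrays in this test are consistent with rates being the per-variant missing counts divided by the number of kept samples (two here). A quick check of that arithmetic, assuming that relation holds:

import numpy as np

counts = np.array([1, 1, 1, 0, 2, 2, 1])   # expected with rates=False
n_kept_samples = 2                          # 'pepo' and 'upv196'
rates = counts / n_kept_samples
# rates -> [0.5, 0.5, 0.5, 0.0, 1.0, 1.0, 0.5], the expected values for rates=True
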
Example #3
    def test_calc_missing(self):
        variations = _create_dask_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=False)
        result = compute({'num_missing_gts': task})
        self.assertTrue(
            np.array_equal(result['num_missing_gts'], [1, 1, 1, 0, 2, 2, 1]))
        variations = _create_dask_variations()
        variations = keep_samples(variations, samples=['pepo',
                                                       'upv196'])[FLT_VARS]
        task = calc_missing_gt(variations, rates=True)
        result = compute({'num_missing_gts': task})
        expected = [0.5, 0.5, 0.5, 0, 1, 1, 0.5]
        for a, b in zip(result['num_missing_gts'], expected):
            self.assertAlmostEqual(a, b, places=2)
Example #4
    def test_keep_samples_in_memory(self):
        variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
        #         print(variations.samples.compute())
        #         print(variations[DP_FIELD].compute())
        variations = compute({'vars': variations},
                             store_variation_to_memory=True)['vars']
        samples = ['upv196', 'pepo']
        processed = keep_samples(variations, samples=samples)
        dps = processed[FLT_VARS][DP_FIELD]

        self.assertTrue(
            np.all(processed[FLT_VARS].samples == ['pepo', 'upv196']))
        expected = [[-1, 9], [-1, 8], [-1, 8], [14, 6], [-1, -1], [-1, -1],
                    [-1, 6]]
        self.assertTrue(np.all(dps == expected))
Example #5
    def test_keep_samples_in_memory(self):
        variations = create_dask_variations()
        #         print(variations.samples.compute())
        #         print(variations[DP_FIELD].compute())
        variations = compute({'vars': variations},
                             store_variation_to_memory=True)['vars']
        samples = ['upv196', 'pepo']
        processed = keep_samples(variations, samples=samples)
        dps = processed[FLT_VARS][DP_FIELD]

        self.assertTrue(
            np.all(processed[FLT_VARS].samples == ['upv196', 'pepo']))
        expected = [[9, -1], [8, -1], [8, -1], [6, 14], [-1, -1], [-1, -1],
                    [6, -1]]
        self.assertTrue(np.all(dps == expected))
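
Examples #4 and #5 report the same kept depths but in a different sample order ('pepo', 'upv196' versus 'upv196', 'pepo'), so they presumably come from library versions that order the filtered samples differently. A version-agnostic check could align the depth columns by sample name before comparing. This sketch continues from the snippet above; expected_by_sample is hypothetical test data and the sample names are assumed to be plain strings:

import numpy as np

expected_by_sample = {'pepo': [-1, -1, -1, 14, -1, -1, -1],
                      'upv196': [9, 8, 8, 6, -1, -1, 6]}
kept = list(processed[FLT_VARS].samples)
order = [kept.index(name) for name in ['pepo', 'upv196']]
expected = np.stack([expected_by_sample['pepo'],
                     expected_by_sample['upv196']], axis=1)
assert np.array_equal(np.asarray(dps)[:, order], expected)
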
Example #6
    def test_kosman_missing(self):
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)

        c = np.array([[-1, -1], [-1, -1], [0, 1], [0, 0], [0, 0], [0, 1],
                      [0, 1], [0, 1], [0, 0], [0, 0], [0, 1]])
        d = np.array([[-1, -1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1],
                      [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_cd = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)

        assert np.all(distance_ab == distance_cd)
Example #7
    def test_kosman_2_indis(self):
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]

        assert distance == 1 / 3

        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0

        variations = Variations()
        gts = np.stack((b, d), axis=1)
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = da.from_array(samples)
        variations[GT_FIELD] = da.from_array(gts)

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distance_ab = compute(snp_by_snp_comparison_array,
                              silence_runtime_warnings=True)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0.45
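
The asserted distances can be reproduced by hand, assuming the usual Kosman scoring of diploid genotypes (0 for an identical unordered genotype, 0.5 for one shared allele, 1 for none) with missing SNPs dropped before averaging:

# a vs b: SNPs 0 and 1 are dropped (missing calls); the remaining nine score
ab_scores = [0.5, 0, 1, 0, 0, 0, 0.5, 0.5, 0.5]
assert sum(ab_scores) / len(ab_scores) == 1 / 3

# b vs d: only SNP 1 is dropped (missing in b); the remaining ten score
bd_scores = [0, 1, 1, 0, 0.5, 0.5, 0.5, 0.5, 0.5, 0]
assert sum(bd_scores) / len(bd_scores) == 0.45
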
Example #8
def calc_kosman_dist(variations,
                     min_num_snps=None,
                     silence_runtime_warning=False):
    variations_by_sample = OrderedDict()

    samples = va.make_sure_array_is_in_memory(
        variations.samples, silence_runtime_warnings=silence_runtime_warning)
    for sample in samples:
        variations_by_sample[sample] = keep_samples(variations,
                                                    [sample])[FLT_VARS]

    sample_combinations = combinations(samples, 2)

    distances_by_pair = OrderedDict()
    for sample1, sample2 in sample_combinations:
        vars1 = variations_by_sample[sample1]
        vars2 = variations_by_sample[sample2]

        snp_by_snp_comparison_array = _kosman(vars1, vars2)
        distances_by_pair[(sample1, sample2)] = snp_by_snp_comparison_array

    computed_distances_by_pair = compute(
        distances_by_pair, silence_runtime_warnings=silence_runtime_warning)

    distances = []
    for sample_index, sample in enumerate(samples):
        starting_index2 = sample_index + 1
        if starting_index2 >= len(samples):
            break
        for sample2 in samples[starting_index2:]:
            result = computed_distances_by_pair[(sample, sample2)]
            n_snps = result.shape[0]

            if min_num_snps is not None and n_snps < min_num_snps:
                value = 0.0
            else:
                with np.errstate(invalid='ignore'):
                    value = np.sum(result) / result.shape[0]
            distances.append(value)
    return distances, samples
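
One possible way to consume the result: the distances come back in the same pair order as itertools.combinations, i.e. in condensed form, so they can be expanded into a labelled square matrix. This is a usage sketch, assuming scipy and pandas are available and `variations` is a Variations object as in the examples above:

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform

distances, samples = calc_kosman_dist(variations)
dist_matrix = pd.DataFrame(squareform(np.asarray(distances)),
                           index=samples, columns=samples)
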
Example #9
    def test_kosman_2_indis_in_memory(self):
        a = np.array([[-1, -1], [0, 0], [0, 1], [0, 0], [0, 0], [0, 1], [0, 1],
                      [0, 1], [0, 0], [0, 0], [0, 1]])
        b = np.array([[1, 1], [-1, -1], [0, 0], [0, 0], [1, 1], [0, 1], [1, 0],
                      [1, 0], [1, 0], [0, 1], [1, 1]])
        gts = np.stack((a, b), axis=1)
        variations = Variations()

        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]

        distance_ab = _kosman(vars1, vars2)

        va.make_sure_array_is_in_memory(distance_ab)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 1 / 3

        c = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        d = np.full(shape=(11, 2), fill_value=1, dtype=np.int16)
        gts = np.stack((c, d), axis=1)
        variations = Variations()
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        distance_ab = _kosman(vars1, vars2)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0

        variations = Variations()
        gts = np.stack((b, d), axis=1)
        samples = np.array([str(i) for i in range(gts.shape[1])])
        variations.samples = samples
        variations[GT_FIELD] = gts

        vars1 = keep_samples(variations, ['0'])[FLT_VARS]
        vars2 = keep_samples(variations, ['1'])[FLT_VARS]
        distance_ab = _kosman(vars1, vars2)
        distance = distance_ab.sum() / distance_ab.shape[0]
        assert distance == 0.45
Example #10
def calc_pop_pairwise_unbiased_nei_dists(
        variations,
        max_alleles,
        populations,
        silence_runtime_warnings=False,
        min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT):
    pop_ids = list(range(len(populations)))
    variations_per_pop = [
        keep_samples(variations, pop_samples)[FLT_VARS]
        for pop_samples in populations
    ]

    Jxy = {}
    uJx = {}
    uJy = {}
    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        if pop_id1 not in Jxy:
            Jxy[pop_id1] = {}
        if pop_id1 not in uJx:
            uJx[pop_id1] = {}
        if pop_id1 not in uJy:
            uJy[pop_id1] = {}

        Jxy[pop_id1][pop_id2] = None
        uJx[pop_id1][pop_id2] = None
        uJy[pop_id1][pop_id2] = None

    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        vars_for_pop1 = variations_per_pop[pop_id1]
        vars_for_pop2 = variations_per_pop[pop_id2]
        _accumulate_j_stats(vars_for_pop1,
                            vars_for_pop2,
                            max_alleles,
                            Jxy,
                            uJx,
                            uJy,
                            pop_id1,
                            pop_id2,
                            min_num_genotypes=min_num_genotypes)
    computed_result = compute(
        {
            'Jxy': Jxy,
            'uJx': uJx,
            'uJy': uJy
        },
        silence_runtime_warnings=silence_runtime_warnings)

    computedJxy = computed_result['Jxy']
    computeduJx = computed_result['uJx']
    computeduJy = computed_result['uJy']

    n_pops = len(populations)
    dists = np.empty(int((n_pops**2 - n_pops) / 2))
    dists[:] = np.nan
    for idx, (pop_id1, pop_id2) in enumerate(combinations(pop_ids, 2)):
        if Jxy[pop_id1][pop_id2] is None:
            unbiased_nei_identity = math.nan
        else:
            with np.errstate(invalid='ignore'):
                unbiased_nei_identity = computedJxy[pop_id1][
                    pop_id2] / math.sqrt(computeduJx[pop_id1][pop_id2] *
                                         computeduJy[pop_id1][pop_id2])
        nei_unbiased_distance = -math.log(unbiased_nei_identity)
        if nei_unbiased_distance < 0:
            nei_unbiased_distance = 0
        dists[idx] = nei_unbiased_distance
    return dists
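
The final loop is the standard unbiased Nei combination of the three accumulated J statistics. The same per-pair step, written as a small stand-alone sketch (clipping at zero as above; the nan handling for pairs without data is omitted):

import math

def nei_unbiased_distance_sketch(jxy, ujx, ujy):
    identity = jxy / math.sqrt(ujx * ujy)
    return max(0.0, -math.log(identity))
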
Example #11
def calc_dset_pop_distance(variations,
                           max_alleles,
                           populations,
                           min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                           min_call_dp_for_het=0,
                           silence_runtime_warnings=False):
    '''This is an implementation of the formulas proposed in GenAlex'''
    pop_ids = list(range(len(populations)))
    variations_per_pop = [
        keep_samples(variations, pop_samples)[FLT_VARS]
        for pop_samples in populations
    ]

    accumulated_hs = {}
    accumulated_ht = {}
    num_vars = {}

    for pop_id1, pop_id2 in combinations(pop_ids, 2):
        vars_for_pop1 = variations_per_pop[pop_id1]
        vars_for_pop2 = variations_per_pop[pop_id2]

        res = _calc_pairwise_dest(vars_for_pop1,
                                  vars_for_pop2,
                                  max_alleles=max_alleles,
                                  min_call_dp_for_het=min_call_dp_for_het,
                                  min_num_genotypes=min_num_genotypes)

        num_vars_in_chunk = va.count_nonzero(~va.isnan(res['corrected_hs']))

        hs_in_chunk = va.nansum(res['corrected_hs'])
        ht_in_chunk = va.nansum(res['corrected_ht'])

        key = (pop_id1, pop_id2)
        if key in accumulated_hs:
            accumulated_hs[key] += hs_in_chunk
            accumulated_ht[key] += ht_in_chunk
            num_vars[key] += num_vars_in_chunk
        else:
            accumulated_hs[key] = hs_in_chunk
            accumulated_ht[key] = ht_in_chunk
            num_vars[key] = num_vars_in_chunk

    task = {
        'accumulated_hs': accumulated_hs,
        'accumulated_ht': accumulated_ht,
        'num_vars': num_vars
    }

    result = compute(task, silence_runtime_warnings=silence_runtime_warnings)
    computed_accumulated_hs = result['accumulated_hs']
    computed_accumulated_ht = result['accumulated_ht']
    computed_num_vars = result['num_vars']

    tot_n_pops = len(populations)
    dists = np.empty(int((tot_n_pops**2 - tot_n_pops) / 2))
    dists[:] = np.nan
    num_pops = 2
    for idx, (pop_id1, pop_id2) in enumerate(combinations(pop_ids, 2)):
        key = pop_id1, pop_id2
        if key in accumulated_hs:
            with np.errstate(invalid='ignore'):
                corrected_hs = computed_accumulated_hs[
                    key] / computed_num_vars[key]
                corrected_ht = computed_accumulated_ht[
                    key] / computed_num_vars[key]
            dest = (num_pops /
                    (num_pops - 1)) * ((corrected_ht - corrected_hs) /
                                       (1 - corrected_hs))
        else:
            dest = np.nan
        dists[idx] = dest
    return dists
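
The per-pair combination step above averages the accumulated Hs and Ht over the variants with data and then applies the two-population Dest-style formula referenced in the docstring. As a small stand-alone sketch (nan handling omitted):

def pairwise_dest_sketch(acc_hs, acc_ht, n_vars, num_pops=2):
    hs = acc_hs / n_vars
    ht = acc_ht / n_vars
    return (num_pops / (num_pops - 1)) * ((ht - hs) / (1 - hs))
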
Example #12
def calc_pop_pairwise_nei_dists_by_depth(variations,
                                         populations,
                                         silence_runtime_warnings=False):

    variations_per_pop = [
        keep_samples(variations, pop_samples)[FLT_VARS]
        for pop_samples in populations
    ]

    jxy = {}
    jxx = {}
    jyy = {}
    for pop_i, pop_j in combinations(range(len(populations)), 2):
        pop_i_vars = variations_per_pop[pop_i]
        pop_j_vars = variations_per_pop[pop_j]

        freq_al_i = calc_allele_freq_by_depth(pop_i_vars)
        freq_al_j = calc_allele_freq_by_depth(pop_j_vars)

        chunk_jxy = va.nansum(freq_al_i * freq_al_j)
        chunk_jxx = va.nansum(freq_al_i**2)
        chunk_jyy = va.nansum(freq_al_j**2)

        pop_idx = pop_i, pop_j
        if pop_idx not in jxy:
            jxy[pop_idx] = 0
            jxx[pop_idx] = 0
            jyy[pop_idx] = 0

        # The real Jxy is usually divided by num_snps, but that does not
        # matter for this calculation
        jxy[pop_idx] += chunk_jxy
        jxx[pop_idx] += chunk_jxx
        jyy[pop_idx] += chunk_jyy

    computed_result = compute(
        {
            'jxy': jxy,
            'jxx': jxx,
            'jyy': jyy
        },
        silence_runtime_warnings=silence_runtime_warnings)

    computedjxy = computed_result['jxy']
    computedjxx = computed_result['jxx']
    computedjyy = computed_result['jyy']

    n_pops = len(populations)
    dists = np.zeros(int((n_pops**2 - n_pops) / 2))
    index = 0
    for pop_idx in combinations(range(len(populations)), 2):
        pjxy = computedjxy[pop_idx]
        pjxx = computedjxx[pop_idx]
        pjyy = computedjyy[pop_idx]

        try:
            nei = math.log(pjxy / math.sqrt(pjxx * pjyy))
            if nei != 0:
                nei = -nei
        except ValueError:
            nei = float('inf')

        dists[index] = nei
        index += 1

    return dists