Ejemplo n.º 1
0
def _calc_ld_between_variations(variations1, variations2, min_num_gts=10,
                                max_maf=0.95):
    """Yield the LD of every variation in one set against every one in another.

    The MAFs of both sets are computed first with ``calc_maf_by_gt`` and the
    calculation is refused when any of them is NaN or exceeds ``max_maf``.

    Args:
        variations1: first set of variations; must provide the ALT, GT, POS
            and CHROM fields.
        variations2: second set of variations; must provide the GT, POS and
            CHROM fields.
        min_num_gts: minimum number of genotypes, forwarded both to the MAF
            calculation and to ``calc_rogers_huff_r``.
        max_maf: highest MAF accepted for any variation.

    Yields:
        A single ``zip`` iterator of ``(ld, physical_distance, position)``
        tuples, one per (variation1, variation2) pair in row-major order.
        ``position`` is ``(chrom1, pos1, chrom2, pos2)`` and
        ``physical_distance`` is NaN when the chromosomes differ.

    Raises:
        RuntimeError: if any MAF is NaN or greater than ``max_maf``.
    """
    # The number of alleles is taken from the first set and reused for both.
    max_alleles = variations1[ALT_FIELD].shape[1]
    maf1 = calc_maf_by_gt(variations1, max_alleles=max_alleles,
                          min_num_genotypes=min_num_gts)
    maf2 = calc_maf_by_gt(variations2, max_alleles=max_alleles,
                          min_num_genotypes=min_num_gts)

    for mafs in (maf1, maf2):
        if np.any(np.isnan(mafs)) or np.any(mafs > max_maf):
            # The check rejects MAFs *above* max_maf, so say so.
            msg = 'Not enough genotypes or MAF above allowed maximum, Rogers Huff calculations known to go wrong for very high maf'
            raise RuntimeError(msg)

    lds_for_pair = calc_rogers_huff_r(va.gts_as_mat012(variations1[GT_FIELD]),
                                      va.gts_as_mat012(variations2[GT_FIELD]),
                                      min_num_gts=min_num_gts)
    pos1 = variations1[POS_FIELD]
    pos2 = variations2[POS_FIELD]

    # Expand both position vectors into (n1, n2) grids: rows follow
    # variations1, columns follow variations2.
    grid_shape = (pos1.size, pos2.size)
    pos1_repeated = np.repeat(pos1, pos2.size).reshape(grid_shape)
    pos2_repeated = np.tile(pos2, pos1.size).reshape(grid_shape)
    physical_dist = np.abs(pos1_repeated - pos2_repeated).astype(float)
    assert lds_for_pair.shape == physical_dist.shape

    chrom1 = variations1[CHROM_FIELD]
    chrom2 = variations2[CHROM_FIELD]
    chrom_grid_shape = (chrom1.size, chrom2.size)
    chrom1_repeated = np.repeat(chrom1, chrom2.size).reshape(chrom_grid_shape)
    chrom2_repeated = np.tile(chrom2, chrom1.size).reshape(chrom_grid_shape)

    # A distance between positions on different chromosomes is meaningless.
    physical_dist[chrom1_repeated != chrom2_repeated] = np.nan

    positions = list(zip(chrom1_repeated.flat, pos1_repeated.flat,
                         chrom2_repeated.flat, pos2_repeated.flat))
    yield zip(lds_for_pair.flat, physical_dist.flat, positions)
Ejemplo n.º 2
0
def filter_by_maf(variations,
                  max_alleles,
                  max_allowable_maf=None,
                  min_allowable_maf=None,
                  filter_id='filter_by_maf',
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False,
                  n_bins=DEF_NUM_BINS,
                  limits=None):
    """Filter variations by their MAF and optionally report a MAF histogram.

    Args:
        variations: variations to filter.
        max_alleles: forwarded to ``calc_maf_by_gt``.
        max_allowable_maf: upper MAF threshold, or None for no upper bound.
        min_allowable_maf: lower MAF threshold, or None for no lower bound.
        filter_id: identifier stored under ``FLT_ID`` in the result.
        min_num_genotypes: forwarded to ``calc_maf_by_gt``.
        calc_histogram: when true, histogram counts, bin edges and the
            thresholds that were set are added to the filter stats.
        n_bins: number of histogram bins.
        limits: histogram range; ``(0, 1)`` is used when None.

    Returns:
        A dict with the ``FLT_VARS``, ``FLT_ID`` and ``FLT_STATS`` keys.
    """
    mafs = calc_maf_by_gt(variations,
                          max_alleles=max_alleles,
                          min_num_genotypes=min_num_genotypes)
    selection = _select_vars(variations, mafs, min_allowable_maf,
                             max_allowable_maf)

    if calc_histogram:
        hist_range = (0, 1) if limits is None else limits
        counts, bin_edges = va.histogram(mafs, n_bins=n_bins,
                                         limits=hist_range)
        stats = selection[FLT_STATS]
        stats[COUNT] = counts
        stats[BIN_EDGES] = bin_edges
        # Only the thresholds that were actually given are reported,
        # lower bound first.
        stats['limits'] = [
            threshold
            for threshold in (min_allowable_maf, max_allowable_maf)
            if threshold is not None
        ]

    filtered_vars = selection[FLT_VARS]
    return {
        FLT_VARS: filtered_vars,
        FLT_ID: filter_id,
        FLT_STATS: selection[FLT_STATS]
    }
Ejemplo n.º 3
0
    def test_calc_maf_by_gt2(self):
        """Check the MAF histogram obtained from the test zarr dataset."""
        zarr_path = TEST_DATA_DIR / 'test.zarr'
        variations = load_zarr(zarr_path)
        mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)

        counts, edges = va.histogram(mafs, n_bins=5, limits=(0, 1))
        pending = {'counts': counts, 'edges': edges}
        computed = compute(pending, silence_runtime_warnings=True)

        expected_counts = [0, 0, 4, 2, 0]
        expected_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
        self.assertTrue(np.all(computed['counts'] == expected_counts))
        self.assertTrue(np.allclose(computed['edges'], expected_edges))
Ejemplo n.º 4
0
    def test_calc_maf_by_gt_in_memory(self):
        """Check calc_maf_by_gt on genotypes held in a plain numpy array."""
        variations = Variations(samples=np.array(['aa', 'bb']))

        # Four variations for the two samples above.
        gts = np.array([
            [[0, 2], [-1, -1]],
            [[0, 2], [1, -1]],
            [[0, 0], [1, 1]],
            [[-1, -1], [-1, -1]],
        ])
        variations[GT_FIELD] = gts

        mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)

        expected = [0.5, 0.33333333, 0.5, math.nan]
        # NOTE(review): zip stops at the shorter sequence, so a length
        # mismatch between mafs and expected would go unnoticed here.
        for observed, wanted in zip(mafs, expected):
            if not math.isnan(observed):
                self.assertAlmostEqual(observed, wanted, places=2)
            else:
                # NaN never compares equal, so it is matched explicitly.
                self.assertTrue(math.isnan(wanted))
Ejemplo n.º 5
0
def calc_ld_random_pairs_from_different_chroms(variations, num_pairs,
                                               max_maf=0.95, min_num_gts=10,
                                               silence_runtime_warnings=False):
    """Yield the LD of randomly drawn variation pairs on different chroms.

    Pairs of variation indexes are drawn with ``random.randrange``; pairs
    located on the same chromosome, and pairs whose LD is NaN, are discarded
    and do not count towards ``num_pairs``.

    Args:
        variations: variations providing the CHROM, ALT and GT fields.
        num_pairs: number of pairs to yield; nothing is yielded when it is
            zero or negative.
        max_maf: highest MAF accepted for any variation.
        min_num_gts: minimum number of genotypes, forwarded to the MAF and
            to the LD calculations.
        silence_runtime_warnings: forwarded to
            ``va.make_sure_array_is_in_memory``.

    Yields:
        ``(chrom1, snp_idx1, chrom2, snp_idx2, ld)`` tuples.

    Raises:
        ValueError: if the variations cover fewer than two chromosomes.
        RuntimeError: if any MAF is NaN or greater than ``max_maf``.
    """
    chroms = va.make_sure_array_is_in_memory(variations[CHROM_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)

    different_chroms = np.unique(chroms)
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')
    max_alleles = variations[ALT_FIELD].shape[1]

    mafs = calc_maf_by_gt(variations, max_alleles, min_num_gts)
    mafs = va.make_sure_array_is_in_memory(mafs,
        silence_runtime_warnings=silence_runtime_warnings)

    if va.any(va.isnan(mafs)) or va.any(mafs > max_maf):
        # The check rejects MAFs *above* max_maf, so say so.
        msg = 'Not enough genotypes or MAF above allowed maximum, Rogers Huff calculations known to go wrong for very high maf'
        raise RuntimeError(msg)

    gts = va.make_sure_array_is_in_memory(variations[GT_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)

    num_variations = gts.shape[0]

    # Testing the counter before drawing guarantees that num_pairs <= 0
    # yields nothing.
    # NOTE: the loop only ends once enough valid pairs are found; if no
    # cross-chromosome pair ever gives a non-NaN LD it will not terminate.
    pairs_computed = 0
    while pairs_computed < num_pairs:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            continue

        ld = _calc_rogers_huff_r_for_snp_pair(gts[snp_idx1], gts[snp_idx2],
                                              min_num_gts=min_num_gts)
        if math.isnan(ld):
            continue

        yield chrom1, snp_idx1, chrom2, snp_idx2, ld
        pairs_computed += 1
Ejemplo n.º 6
0
    def test_calc_maf_by_gt(self):
        """Check calc_maf_by_gt on genotypes stored as dask arrays."""
        variations = Variations(samples=da.array(['aa', 'bb']))

        # Four variations for the two samples above.
        gts = np.array([
            [[0, 2], [-1, -1]],
            [[0, 2], [1, -1]],
            [[0, 0], [1, 1]],
            [[-1, -1], [-1, -1]],
        ])
        variations[GT_FIELD] = da.from_array(gts)
        # This filtering step leaves the variations holding dask arrays of
        # unknown shapes.
        variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]

        mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)
        result = compute(mafs, silence_runtime_warnings=True)

        expected = [0.5, 0.33333333, 0.5, math.nan]
        # NOTE(review): zip stops at the shorter sequence, so a length
        # mismatch between result and expected would go unnoticed here.
        for observed, wanted in zip(result, expected):
            if not math.isnan(observed):
                self.assertAlmostEqual(observed, wanted, places=2)
            else:
                # NaN never compares equal, so it is matched explicitly.
                self.assertTrue(math.isnan(wanted))