def _calc_ld_between_variations(variations1, variations2, min_num_gts=10,
                                max_maf=0.95):
    """Yield LD and physical distance for every SNP pair across two sets.

    Computes the Rogers & Huff r LD statistic between each variation in
    variations1 and each in variations2, together with the physical
    distance between their positions (NaN when the SNPs sit on different
    chromosomes).

    Yields a single zip of (ld, physical_dist, (chrom1, pos1, chrom2, pos2))
    tuples, one entry per SNP pair.

    Raises RuntimeError when any MAF is NaN (too few genotypes) or exceeds
    max_maf, because Rogers Huff estimates are unreliable for very high MAF.
    """
    max_alleles = variations1[ALT_FIELD].shape[1]
    maf1 = calc_maf_by_gt(variations1, max_alleles=max_alleles,
                          min_num_genotypes=min_num_gts)
    maf2 = calc_maf_by_gt(variations2, max_alleles=max_alleles,
                          min_num_genotypes=min_num_gts)
    # NaN MAFs mean not enough genotypes; MAFs above max_maf make the
    # Rogers Huff estimate unreliable, so both cases are rejected up front.
    if (np.any(np.isnan(maf1)) or np.any(maf1 > max_maf) or
            np.any(np.isnan(maf2)) or np.any(maf2 > max_maf)):
        # Fixed wording: the guard fires when MAF is ABOVE the allowed
        # maximum, not below it.
        msg = ('Not enough genotypes or MAF above allowed maximum, Rogers '
               'Huff calculations known to go wrong for very high maf')
        raise RuntimeError(msg)
    lds_for_pair = calc_rogers_huff_r(va.gts_as_mat012(variations1[GT_FIELD]),
                                      va.gts_as_mat012(variations2[GT_FIELD]),
                                      min_num_gts=min_num_gts)
    pos1 = variations1[POS_FIELD]
    pos2 = variations2[POS_FIELD]
    # Build (n1, n2) grids of positions so every snp1/snp2 combination lines
    # up element-wise with the LD matrix.
    pos1_repeated = np.repeat(pos1, pos2.size).reshape((pos1.size, pos2.size))
    pos2_repeated = np.tile(pos2, pos1.size).reshape((pos1.size, pos2.size))
    physical_dist = np.abs(pos1_repeated - pos2_repeated).astype(float)
    assert lds_for_pair.shape == physical_dist.shape
    chrom1 = variations1[CHROM_FIELD]
    chrom2 = variations2[CHROM_FIELD]
    chrom1_repeated = np.repeat(chrom1, chrom2.size).reshape((chrom1.size,
                                                              chrom2.size))
    chrom2_repeated = np.tile(chrom2, chrom1.size).reshape((chrom1.size,
                                                            chrom2.size))
    # Physical distance is meaningless across chromosomes.
    physical_dist[chrom1_repeated != chrom2_repeated] = np.nan
    positions = list(zip(chrom1_repeated.flat, pos1_repeated.flat,
                         chrom2_repeated.flat, pos2_repeated.flat))
    yield zip(lds_for_pair.flat, physical_dist.flat, positions)
def filter_by_maf(variations, max_alleles, max_allowable_maf=None,
                  min_allowable_maf=None, filter_id='filter_by_maf',
                  filter_id='filter_by_maf' if False else filter_id,
                  min_num_genotypes=MIN_NUM_GENOTYPES_FOR_POP_STAT,
                  calc_histogram=False, n_bins=DEF_NUM_BINS, limits=None):
    """Filter variations whose MAF falls outside the allowed range.

    Computes per-variation MAFs with calc_maf_by_gt and delegates the
    actual selection to _select_vars with the min/max allowable MAFs.

    When calc_histogram is true a MAF histogram is also computed (over the
    range given by `limits`, defaulting to (0, 1)) and stored in the
    filter stats.

    Returns a dict with FLT_VARS (the kept variations), FLT_ID and
    FLT_STATS.
    """
    mafs = calc_maf_by_gt(variations, max_alleles=max_alleles,
                          min_num_genotypes=min_num_genotypes)
    result = _select_vars(variations, mafs, min_allowable_maf,
                          max_allowable_maf)
    if calc_histogram:
        if limits is None:
            limits = (0, 1)
        counts, bin_edges = va.histogram(mafs, n_bins=n_bins, limits=limits)
        result[FLT_STATS][COUNT] = counts
        result[FLT_STATS][BIN_EDGES] = bin_edges
        # NOTE(review): 'limits' is rebound here, shadowing the histogram
        # limits parameter, to record the filtering thresholds used.
        limits = []
        if min_allowable_maf is not None:
            limits.append(min_allowable_maf)
        if max_allowable_maf is not None:
            limits.append(max_allowable_maf)
        result[FLT_STATS]['limits'] = limits
    return {
        FLT_VARS: result[FLT_VARS],
        FLT_ID: filter_id,
        FLT_STATS: result[FLT_STATS]
    }
def test_calc_maf_by_gt2(self):
    """MAF histogram of the zarr test data matches the expected counts."""
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')
    mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)
    counts, edges = va.histogram(mafs, n_bins=5, limits=(0, 1))
    computed = compute({'counts': counts, 'edges': edges},
                       silence_runtime_warnings=True)
    expected_counts = [0, 0, 4, 2, 0]
    expected_edges = [0, 0.2, 0.4, 0.6, 0.8, 1]
    self.assertTrue(np.all(computed['counts'] == expected_counts))
    self.assertTrue(np.all(np.isclose(computed['edges'], expected_edges)))
def test_calc_maf_by_gt_in_memory(self):
    """MAFs computed from in-memory numpy genotypes match known values."""
    gts = np.array([[[0, 2], [-1, -1]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations = Variations(samples=np.array(['aa', 'bb']))
    variations[GT_FIELD] = gts
    mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)
    expected = [0.5, 0.33333333, 0.5, math.nan]
    for observed, wanted in zip(mafs, expected):
        if math.isnan(observed):
            # a NaN MAF is only correct when a NaN was expected
            self.assertTrue(math.isnan(wanted))
        else:
            self.assertAlmostEqual(observed, wanted, places=2)
def calc_ld_random_pairs_from_different_chroms(variations, num_pairs,
                                               max_maf=0.95, min_num_gts=10,
                                               silence_runtime_warnings=False):
    """Yield LD for up to num_pairs random SNP pairs on different chroms.

    Repeatedly draws two random variation indexes, skips pairs on the same
    chromosome, and computes the Rogers & Huff r2 for the rest. Each yielded
    tuple is (chrom1, snp_idx1, chrom2, snp_idx2, r2_ld); NaN r2 values are
    discarded and do not count towards num_pairs.

    Raises ValueError when all variations lie on one chromosome, and
    RuntimeError when any MAF is NaN (too few genotypes) or above max_maf,
    since Rogers Huff estimates are unreliable for very high MAF.
    """
    chroms = va.make_sure_array_is_in_memory(
        variations[CHROM_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)
    different_chroms = np.unique(chroms)
    if different_chroms.size < 2:
        raise ValueError('Only one chrom in variations')
    max_alleles = variations[ALT_FIELD].shape[1]
    # Keyword arguments for consistency with the other calc_maf_by_gt call
    # sites in this file.
    mafs = calc_maf_by_gt(variations, max_alleles=max_alleles,
                          min_num_genotypes=min_num_gts)
    mafs = va.make_sure_array_is_in_memory(
        mafs, silence_runtime_warnings=silence_runtime_warnings)
    if va.any(va.isnan(mafs)) or va.any(mafs > max_maf):
        # Fixed wording: the guard fires when MAF is ABOVE the allowed
        # maximum, not below it.
        msg = ('Not enough genotypes or MAF above allowed maximum, Rogers '
               'Huff calculations known to go wrong for very high maf')
        raise RuntimeError(msg)
    gts = va.make_sure_array_is_in_memory(
        variations[GT_FIELD],
        silence_runtime_warnings=silence_runtime_warnings)
    num_variations = gts.shape[0]
    pairs_computed = 0
    while True:
        snp_idx1 = random.randrange(num_variations)
        snp_idx2 = random.randrange(num_variations)
        chrom1 = chroms[snp_idx1]
        chrom2 = chroms[snp_idx2]
        if chrom1 == chrom2:
            # same chromosome (this also skips snp paired with itself)
            continue
        gts_snp1 = gts[snp_idx1]
        gts_snp2 = gts[snp_idx2]
        r2_ld = _calc_rogers_huff_r_for_snp_pair(gts_snp1, gts_snp2,
                                                 min_num_gts=min_num_gts)
        if not math.isnan(r2_ld):
            yield chrom1, snp_idx1, chrom2, snp_idx2, r2_ld
            pairs_computed += 1
            if pairs_computed >= num_pairs:
                break
def test_calc_maf_by_gt(self):
    """MAFs computed from dask-backed genotypes match known values."""
    gts = np.array([[[0, 2], [-1, -1]],
                    [[0, 2], [1, -1]],
                    [[0, 0], [1, 1]],
                    [[-1, -1], [-1, -1]]])
    variations = Variations(samples=da.array(['aa', 'bb']))
    variations[GT_FIELD] = da.from_array(gts)
    # Running the call-rate filter (threshold 0) produces a variation whose
    # dask arrays have unknown shapes, which is what we want to exercise.
    variations = remove_low_call_rate_vars(variations, 0)[FLT_VARS]
    mafs = calc_maf_by_gt(variations, max_alleles=3, min_num_genotypes=0)
    result = compute(mafs, silence_runtime_warnings=True)
    expected = [0.5, 0.33333333, 0.5, math.nan]
    for observed, wanted in zip(result, expected):
        if math.isnan(observed):
            # a NaN MAF is only correct when a NaN was expected
            self.assertTrue(math.isnan(wanted))
        else:
            self.assertAlmostEqual(observed, wanted, places=2)