def verify_max_distance(self, ts):
    """
    Check that ``max_distance`` limits ``get_r2_array`` as expected.

    Working outwards from the middle mutation, the distance to the k-th
    neighbour is used as ``max_distance`` and the result (forward, then
    reverse) must contain exactly ``k`` values matching the corresponding
    slice of the full r^2 matrix.  Finally, using the whole sequence length
    as ``max_distance`` from the first and last mutation must return every
    other mutation.
    """
    muts = list(ts.mutations())
    calc = msprime.LdCalculator(ts)
    full = calc.get_r2_matrix()
    mid = len(muts) // 2

    for span in range(mid):
        # Forward: distance from the focal mutation to the one `span` ahead.
        dist = muts[mid + span].position - muts[mid].position
        got = calc.get_r2_array(mid, max_distance=dist)
        self.assertEqual(got.shape[0], span)
        self.assertTrue(np.allclose(full[mid, mid + 1:mid + 1 + span], got))

        # Reverse: distance back to the mutation `span` behind.  Reverse
        # results are compared against the matrix row after flipping.
        dist = muts[mid].position - muts[mid - span].position
        got = calc.get_r2_array(
            mid, max_distance=dist, direction=msprime.REVERSE)
        self.assertEqual(got.shape[0], span)
        self.assertTrue(np.allclose(full[mid, mid - span:mid], got[::-1]))

    seq_len = ts.get_sequence_length()
    num_muts = len(muts)
    last = num_muts - 1

    got = calc.get_r2_array(0, max_distance=seq_len)
    self.assertEqual(got.shape[0], num_muts - 1)
    self.assertTrue(np.allclose(full[0, 1:], got))

    got = calc.get_r2_array(
        last, max_distance=seq_len, direction=msprime.REVERSE)
    self.assertEqual(got.shape[0], num_muts - 1)
    self.assertTrue(np.allclose(full[last, :-1], got[::-1]))
def get_msprime_ld(sim_data):
    """Show the pairwise r^2 matrix of ``sim_data`` as a heat map.

    The matrix comes from ``msprime.LdCalculator(sim_data).get_r2_matrix()``
    and is drawn with the "Blues" colour map on a fixed 0..1 scale, without
    axis ticks.  The figure is displayed with ``plt.show()``; nothing is
    returned.
    """
    r2_matrix = msprime.LdCalculator(sim_data).get_r2_matrix()
    plt.imshow(r2_matrix, cmap="Blues", vmin=0, vmax=1, interpolation="none")
    # Remove tick marks on both axes.
    for clear_ticks in (plt.xticks, plt.yticks):
        clear_ticks([])
    plt.show()
def verify_matrix(self, ts):
    """
    Cross-check the three LdCalculator entry points on ``ts``.

    The full matrix must be square in the number of sites and agree with
    the reference ``get_r2_matrix(ts)``; every forward/reverse row array
    must agree with the matching slice of the matrix; and every single
    ``get_r2`` value must agree with the matching matrix cell.
    """
    num_sites = ts.get_num_sites()
    calc = msprime.LdCalculator(ts)
    matrix = calc.get_r2_matrix()
    self.assertEqual(matrix.shape, (num_sites, num_sites))
    reference = get_r2_matrix(ts)
    self.assertTrue(np.allclose(matrix, reference))

    # Row by row: the array API must reproduce the matrix on each side of
    # the diagonal.
    for site in range(num_sites):
        ahead = num_sites - site - 1
        forward = calc.get_r2_array(site, direction=msprime.FORWARD)
        expected = matrix[site, site + 1:]
        self.assertEqual(forward.shape[0], ahead)
        self.assertEqual(expected.shape[0], ahead)
        self.assertTrue(np.allclose(forward, expected))

        backward = calc.get_r2_array(site, direction=msprime.REVERSE)
        expected = matrix[site, :site]
        self.assertEqual(backward.shape[0], site)
        self.assertEqual(expected.shape[0], site)
        # Reverse results are flipped before comparing with the matrix row.
        self.assertTrue(np.allclose(backward[::-1], expected))

    # Cell by cell: the pairwise API must reproduce the matrix.
    for row_index, row in enumerate(matrix):
        for col_index, cell in enumerate(row):
            self.assertAlmostEqual(calc.get_r2(row_index, col_index), cell)
def clump_variants(simulation, summary_stats, nhaps, r2_threshold, window_size):
    """
    Perform variant clumping in a greedy fashion with p-value and r2
    threshold in windows; return only the variants that survive.

    1: make a dict of pos -> variant for the sites of ``simulation`` whose
       position is a key of ``summary_stats``
    2: visit the ``summary_stats`` positions sorted by the last element of
       their entry (the p-value, per the original comments); for each
       position still retained, drop every neighbouring site (forward and
       reverse, within ``max_distance``) whose r2 with it exceeds
       ``r2_threshold``

    LD is computed on the sample subset
    ``range(nhaps[0], nhaps[0] + nhaps[1])`` of ``simulation``.

    NOTE(review): ``window_size`` is accepted but never read; the LD window
    is the fixed ``max_distance`` below.  Confirm whether that is intended.

    Returns a tuple ``(clumped_snps, usable_positions)``: the set of
    retained positions and the dict mapping each of them to its variant.
    """
    # Fixed LD window passed to get_r2_array in both directions.
    max_distance = 125e3

    eprint('Subsetting variants to usable list' + current_time())
    usable_positions = {}  # position -> variant (simulation indices)
    for variant in tqdm(simulation.variants(),
                        total=simulation.get_num_mutations()):
        if variant.position in summary_stats:
            usable_positions[variant.position] = variant

    # order all snps by p-value
    ordered_positions = sorted(summary_stats.keys(),
                               key=lambda pos: summary_stats[pos][-1])

    # Index <-> position maps in the coordinate space of the subset, which
    # is what the LD calculator below is built on.
    eur_subset = simulation.subset(range(nhaps[0], nhaps[0] + nhaps[1]))
    eur_index_pos = {}
    eur_pos_index = {}
    for mutation in tqdm(eur_subset.mutations(),
                         total=eur_subset.get_num_mutations()):
        eur_index_pos[mutation.index] = mutation.position
        eur_pos_index[mutation.position] = mutation.index
    ld_calc = msprime.LdCalculator(eur_subset)

    # compute LD and prune in order of significance (popping index of SNPs)
    for position in ordered_positions:
        if position not in usable_positions:
            # Already clumped away by a more significant SNP.
            continue
        focal = eur_pos_index[position]

        r2_forward = ld_calc.get_r2_array(
            focal, direction=msprime.FORWARD, max_distance=max_distance)
        for i in np.where(r2_forward > r2_threshold)[0]:
            # Entry i of the forward array is the site i + 1 after the focal.
            usable_positions.pop(eur_index_pos[focal + i + 1], None)

        r2_reverse = ld_calc.get_r2_array(
            focal, direction=msprime.REVERSE, max_distance=max_distance)
        for i in np.where(r2_reverse > r2_threshold)[0]:
            # Entry i of the reverse array is the site i + 1 before the focal.
            usable_positions.pop(eur_index_pos[focal - i - 1], None)

    clumped_snps = set(usable_positions.keys())
    eprint('Starting SNPs: ' + str(len(ordered_positions)) +
           '; SNPs after clumping: ' + str(len(clumped_snps)) + current_time())
    return (clumped_snps, usable_positions)
def pos_r2(ts):
    """Obtain vectors of position differences and r^2 per pair of sites.

    Arguments
    ---------
    ts : msprime.TreeSequence
        tree sequence object

    Returns
    -------
    pos_diff : np.array
        position difference for pairs of snps
    r2 : np.array
        r^2 as computed between the different sites

    Notes
    -----
    Positions are held as float32.  A pair whose float32 positions compare
    equal has a zero difference and is therefore left out of both outputs.
    """
    ld_calc = msp.LdCalculator(ts)
    r2_est = ld_calc.r2_matrix()

    # Computing positions and indices
    pos = np.array([s.position for s in ts.sites()], dtype=np.float32)

    # Strict lower triangle of |pos[i] - pos[j]|: each unordered pair appears
    # once, and the diagonal and upper triangle stay zero.
    pos_diff_mat = np.tril(np.abs(pos[:, None] - pos[None, :]), k=-1)

    # Extract entries that matter (and are matched)
    matched = pos_diff_mat > 0
    r2 = r2_est[matched]
    pos_diff = pos_diff_mat[matched]
    return (pos_diff, r2)
def _pos_r2(ts):
    """Obtain vectors of position differences and r^2 per pair of sites.

    Arguments
    ---------
    ts : msprime.TreeSequence
        tree sequence object

    Returns
    -------
    pos_diff : np.array
        position difference for pairs of snps
    r2 : np.array
        r^2 as computed between the different sites, with NaN entries
        replaced by 1.0

    Notes
    -----
    Positions are held as float32.  A pair whose float32 positions compare
    equal has a zero difference and is therefore left out of both outputs.
    """
    ld_calc = msp.LdCalculator(ts)
    r2_est = ld_calc.r2_matrix()

    # Computing positions and indices
    pos = np.array([s.position for s in ts.sites()], dtype=np.float32)

    # Strict lower triangle of |pos[i] - pos[j]|: each unordered pair appears
    # once, and the diagonal and upper triangle stay zero.
    pos_diff_mat = np.tril(np.abs(pos[:, None] - pos[None, :]), k=-1)

    # Extract entries that matter (and are matched)
    matched = pos_diff_mat > 0
    r2 = r2_est[matched]
    pos_diff = pos_diff_mat[matched]

    # Set undefined values to be 1 (due to non-segregating issues...)
    # Boolean indexing above made a copy, so r2_est itself is not modified.
    r2[np.isnan(r2)] = 1.0
    return (pos_diff, r2)
def test_deprecated_aliases(self):
    """
    The ``get_``-prefixed LdCalculator methods and their unprefixed
    counterparts must return identical results.
    """
    tree_seq = tsutil.subsample_sites(
        msprime.simulate(20, mutation_rate=10, random_seed=15),
        self.num_test_sites)
    calc = msprime.LdCalculator(tree_seq)

    # Full matrix.
    matrix_old = calc.get_r2_matrix()
    matrix_new = calc.r2_matrix()
    self.assertTrue(np.array_equal(matrix_old, matrix_new))

    # Row array for the first site.
    array_old = calc.get_r2_array(0)
    array_new = calc.r2_array(0)
    self.assertTrue(np.array_equal(array_old, array_new))

    # Single pair.
    self.assertEqual(calc.get_r2(0, 1), calc.r2(0, 1))
def thread_worker(thread_index):
    """Mark a chain of sites in ``mask`` starting from this worker's chunk.

    The sites are split into ``num_threads`` chunks of
    ``ceil(len(mask) / num_threads)`` entries.  Starting at the first site
    of chunk ``thread_index``, the worker sets ``mask[site] = True`` and then
    jumps to the first site in ``ld.r2_array(site)`` whose r^2 is
    ``<= thresh``.  It stops when no such site exists, when the array is
    empty, or once the current site has moved past the end of the chunk.

    ``ts``, ``mask``, ``num_threads`` and ``thresh`` come from the enclosing
    scope.  Each worker builds its own ``LdCalculator``.
    """
    ld = msp.LdCalculator(ts)
    chunk_size = int(math.ceil(len(mask) / num_threads))
    next_site = thread_index * chunk_size
    if next_site >= len(mask):
        # Ceiling-sized chunks can leave trailing workers (or every worker,
        # for an empty mask) without any site; indexing mask here would
        # raise IndexError.
        return
    stop = next_site + chunk_size
    while True:
        mask[next_site] = True
        r2 = ld.r2_array(next_site) <= thresh
        if next_site > stop or len(r2) == 0 or not np.any(r2):
            break
        # argmax on a boolean array gives the index of the first True, i.e.
        # the nearest site whose r^2 is at or below the threshold.
        next_site += 1 + np.argmax(r2)
def verify_max_mutations(self, ts):
    """
    Check that ``max_mutations`` limits ``get_r2_array`` as expected.

    For the middle mutation and every ``k`` below its index, a request for
    at most ``k`` mutations (forward, then reverse) must return exactly
    ``k`` values matching the corresponding slice of the full r^2 matrix.
    """
    calc = msprime.LdCalculator(ts)
    full = calc.get_r2_matrix()
    mid = len(list(ts.mutations())) // 2

    for limit in range(mid):
        got = calc.get_r2_array(mid, max_mutations=limit)
        self.assertEqual(got.shape[0], limit)
        self.assertTrue(np.allclose(full[mid, mid + 1:mid + 1 + limit], got))

        got = calc.get_r2_array(
            mid, max_mutations=limit, direction=msprime.REVERSE)
        self.assertEqual(got.shape[0], limit)
        # Reverse results are flipped before comparing with the matrix row.
        self.assertTrue(np.allclose(full[mid, mid - limit:mid], got[::-1]))
def thread_worker(thread_index):
    """Find, for each focal mutation in this worker's chunk, the mutations
    in LD with it.

    ``focal_mutations`` is split into ``num_threads`` chunks of
    ``ceil(len(focal_mutations) / num_threads)`` entries.  For every focal
    mutation in chunk ``thread_index`` the indexes of all mutations within
    ``max_distance`` on either side whose r^2 is ``>= r2_threshold`` are
    stored, in ascending order, in ``results[focal_mutation]``, and
    ``progress_bar`` is advanced once.

    ``tree_sequence``, ``focal_mutations``, ``num_threads``,
    ``max_distance``, ``r2_threshold``, ``results`` and ``progress_bar``
    come from the enclosing scope.  Each worker builds its own
    ``LdCalculator``.
    """
    calc = msprime.LdCalculator(tree_sequence)
    per_thread = int(math.ceil(len(focal_mutations) / num_threads))
    first = thread_index * per_thread
    last = first + per_thread

    for focal in focal_mutations[first:last]:
        # Entry i of the reverse array is the mutation i + 1 before the focal.
        r2_values = calc.get_r2_array(
            focal, max_distance=max_distance, direction=msprime.REVERSE)
        behind = focal - np.flatnonzero(r2_values >= r2_threshold) - 1

        # Entry i of the forward array is the mutation i + 1 after the focal.
        r2_values = calc.get_r2_array(
            focal, max_distance=max_distance, direction=msprime.FORWARD)
        ahead = focal + np.flatnonzero(r2_values >= r2_threshold) + 1

        # Flip the reverse hits so the combined result is ascending.
        results[focal] = np.concatenate((behind[::-1], ahead))
        progress_bar.update()
def ld_matrix_example():
    """Simulate a tree sequence and save its r^2 matrix as an SVG heat map.

    A fixed-seed simulation of 100 samples is run, the full pairwise r^2
    matrix is computed with ``msprime.LdCalculator`` and drawn with the
    "Blues" colour map on a 0..1 scale, without ticks or spines and with a
    colour bar.  The figure is written to ``_static/ld.svg``.
    """
    ts = msprime.simulate(
        100, recombination_rate=10, mutation_rate=20, random_seed=1)
    ld_calc = msprime.LdCalculator(ts)
    A = ld_calc.get_r2_matrix()

    # Now plot this matrix.
    # Size the figure so there is roughly one output pixel per matrix cell,
    # but never smaller than the default figure width.
    dpi = pyplot.rcParams['savefig.dpi']
    if dpi == 'figure':
        # 'savefig.dpi' may be the string 'figure', meaning "use the figure's
        # own dpi"; dividing by that string would raise TypeError.
        dpi = pyplot.rcParams['figure.dpi']
    x = A.shape[0] / dpi
    x = max(x, pyplot.rcParams['figure.figsize'][0])

    fig, ax = pyplot.subplots(figsize=(x, x))
    fig.tight_layout(pad=0)
    im = ax.imshow(A, interpolation="none", vmin=0, vmax=1, cmap="Blues")
    ax.set_xticks([])
    ax.set_yticks([])
    for s in 'top', 'bottom', 'left', 'right':
        ax.spines[s].set_visible(False)
    pyplot.gcf().colorbar(im, shrink=.5, pad=0)
    pyplot.savefig("_static/ld.svg")
def two_bins(NA, N1, N2, Ts, M1, M2):
    """Simulate a two-population model and summarise segregating sites.

    Parameters
    ----------
    NA : passed to ``msprime.simulate`` as ``Ne``.
    N1 : initial size of population 0 (which has no samples).
    N2 : initial size of population 1 (which has 50 samples).
    Ts : time of the mass migration moving all of population 1 into
        population 0; the migration rate change happens at ``Ts / 2``.
    M1 : migration rate set for matrix entry (0, 1) at time ``Ts / 2``.
    M2 : initial migration rate for matrix entry (0, 1).

    Returns
    -------
    np.ndarray
        One-element array holding the variance, across 500000 replicates,
        of the number of mutations per replicate.
    """
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=0, initial_size=N1),
        msprime.PopulationConfiguration(sample_size=50, initial_size=N2),
    ]
    migration_matrix = [[0, M2], [0, 0]]
    demographic_events = [
        msprime.MigrationRateChange(time=Ts / 2, rate=M1, matrix_index=(0, 1)),
        msprime.MassMigration(time=Ts, source=1, destination=0, proportion=1.0),
    ]

    replicates = 500000
    sim = msprime.simulate(
        Ne=NA,
        population_configurations=population_configurations,
        migration_matrix=migration_matrix,
        demographic_events=demographic_events,
        mutation_rate=1e-7,
        recombination_rate=1e-8,
        length=100000,
        num_replicates=replicates)

    # Only the mutation count feeds the returned statistic, so no other
    # per-replicate summary (pairwise diversity, LD matrix) is computed.
    seg = np.zeros(replicates)
    for j, s in enumerate(sim):
        seg[j] = s.get_num_mutations()
    return (np.array([np.var(seg)]))
def test_get_r2_array_multiple_instances(self):
    """
    Nominal threaded use: every thread owns a separate LdCalculator, and
    each thread's forward r^2 array must match the reference matrix row.
    """
    ts = self.get_tree_sequence()
    # Build the reference matrix, then drop the calculator that produced it
    # so that only the per-thread instances remain.
    reference_calc = msprime.LdCalculator(ts)
    expected = reference_calc.get_r2_matrix()
    num_rows = expected.shape[0]
    del reference_calc

    def worker(thread_index, results):
        own_calc = msprime.LdCalculator(ts)
        row = own_calc.get_r2_array(thread_index)
        results[thread_index] = np.array(row)

    results = run_threads(worker, num_rows)
    for index in range(num_rows):
        self.assertTrue(
            np.allclose(results[index], expected[index, index + 1:]))
def test_get_r2_array_single_instance(self):
    """
    Degenerate threaded use: one LdCalculator shared by all threads.

    Only one thread should actually be executing get_r2_array() at any one
    time.  Because the buffer is shared, nothing can be asserted about the
    returned values --- they are essentially gibberish --- so only the
    shapes are checked.  The point of the test is that sharing the instance
    does not crash.
    """
    ts = self.get_tree_sequence()
    shared_calc = msprime.LdCalculator(ts)
    num_mutations = ts.get_num_mutations()

    def worker(thread_index, results):
        values = shared_calc.get_r2_array(thread_index)
        results[thread_index] = values.shape

    results = run_threads(worker, num_mutations)
    for index in range(num_mutations):
        # The array for a mutation has one entry per later mutation.
        remaining = num_mutations - index - 1
        self.assertEqual(results[index][0], remaining)
def test_get_r2_single_instance(self):
    """
    Degenerate threaded use: one LdCalculator shared by all threads, each
    thread filling one matrix row through get_r2().  Only one thread should
    actually be executing get_r2() at any one time, and every row must
    still match the reference matrix.
    """
    ts = self.get_tree_sequence()
    shared_calc = msprime.LdCalculator(ts)
    expected = shared_calc.get_r2_matrix()
    size = expected.shape[0]

    def worker(thread_index, results):
        # Publish the row first, then fill it in place.
        values = np.zeros(size)
        results[thread_index] = values
        for col in range(size):
            values[col] = shared_calc.get_r2(thread_index, col)

    results = run_threads(worker, size)
    for index in range(size):
        self.assertTrue(np.allclose(results[index], expected[index]))
def worker(thread_index, results):
    """Store a copy of the r^2 array for ``thread_index`` in ``results``.

    A fresh ``LdCalculator`` is built on ``ts`` (taken from the enclosing
    scope) for each call.
    """
    own_calc = msprime.LdCalculator(ts)
    row = own_calc.get_r2_array(thread_index)
    # np.array() copies the values before they are stored.
    results[thread_index] = np.array(row)
def worker(thread_index, results):
    """Fill ``results[thread_index]`` with r^2 between ``thread_index`` and
    every index in ``range(m)``.

    A fresh ``LdCalculator`` is built on ``ts`` for each call; ``ts`` and
    ``m`` come from the enclosing scope.
    """
    own_calc = msprime.LdCalculator(ts)
    # Publish the zero-initialised row first, then fill it in place.
    values = np.zeros(m)
    results[thread_index] = values
    for col in range(m):
        values[col] = own_calc.get_r2(thread_index, col)