def verify_matrix(self, ts):
    """
    Checks the matrix returned by LdCalculator.get_r2_matrix against the
    reference get_r2_matrix function, and then against the row-wise
    (get_r2_array) and cell-wise (get_r2) accessors of the same calculator.
    """
    num_sites = ts.get_num_sites()
    calculator = tskit.LdCalculator(ts)
    r2 = calculator.get_r2_matrix()
    self.assertEqual(r2.shape, (num_sites, num_sites))
    reference = get_r2_matrix(ts)
    self.assertTrue(np.allclose(r2, reference))

    # Split every row at the diagonal and compare each half with the
    # array returned for the matching direction.
    for site in range(num_sites):
        num_after = num_sites - site - 1
        forward = calculator.get_r2_array(site, direction=tskit.FORWARD)
        upper = r2[site, site + 1:]
        self.assertEqual(forward.shape[0], num_after)
        self.assertEqual(upper.shape[0], num_after)
        self.assertTrue(np.allclose(forward, upper))

        backward = calculator.get_r2_array(site, direction=tskit.REVERSE)
        lower = r2[site, :site]
        self.assertEqual(backward.shape[0], site)
        self.assertEqual(lower.shape[0], site)
        # The reverse array is compared flipped against the matrix slice.
        self.assertTrue(np.allclose(backward[::-1], lower))

    # Finally compare every single cell with the pairwise accessor.
    for row in range(num_sites):
        for col in range(num_sites):
            self.assertAlmostEqual(calculator.get_r2(row, col), r2[row, col])
def verify_max_distance(self, ts):
    """
    Verifies that the max_distance parameter works as expected.

    Starting from the middle mutation, the distance to the k-th neighbour
    on either side is used as max_distance, and the returned array is
    expected to hold exactly k values matching the full r2 matrix.
    """
    all_mutations = list(ts.mutations())
    calculator = tskit.LdCalculator(ts)
    r2 = calculator.get_r2_matrix()
    focal = len(all_mutations) // 2
    for k in range(focal):
        # Forward direction: distance up to the k-th mutation after focal.
        span = all_mutations[focal + k].position - all_mutations[focal].position
        forward = calculator.get_r2_array(focal, max_distance=span)
        self.assertEqual(forward.shape[0], k)
        self.assertTrue(np.allclose(r2[focal, focal + 1:focal + 1 + k], forward))

        # Reverse direction: distance back to the k-th mutation before focal.
        span = all_mutations[focal].position - all_mutations[focal - k].position
        backward = calculator.get_r2_array(
            focal, max_distance=span, direction=tskit.REVERSE)
        self.assertEqual(backward.shape[0], k)
        self.assertTrue(np.allclose(r2[focal, focal - k:focal], backward[::-1]))

    # A max_distance of the whole sequence length must return every other
    # site, from either end of the sequence.
    seq_length = ts.get_sequence_length()
    num_mutations = len(all_mutations)
    from_first = calculator.get_r2_array(0, max_distance=seq_length)
    self.assertEqual(from_first.shape[0], num_mutations - 1)
    self.assertTrue(np.allclose(r2[0, 1:], from_first))
    from_last = calculator.get_r2_array(
        num_mutations - 1, max_distance=seq_length, direction=tskit.REVERSE)
    self.assertEqual(from_last.shape[0], num_mutations - 1)
    self.assertTrue(np.allclose(r2[num_mutations - 1, :-1], from_last[::-1]))
def test_deprecated_aliases(self):
    """
    Checks that each get_* method of LdCalculator returns the same values
    as its un-prefixed counterpart (r2_matrix, r2_array, r2).
    """
    simulated = msprime.simulate(20, mutation_rate=10, random_seed=15)
    ts = tsutil.subsample_sites(simulated, self.num_test_sites)
    calculator = tskit.LdCalculator(ts)

    matrix_via_get = calculator.get_r2_matrix()
    matrix_via_alias = calculator.r2_matrix()
    self.assertTrue(np.array_equal(matrix_via_get, matrix_via_alias))

    array_via_get = calculator.get_r2_array(0)
    array_via_alias = calculator.r2_array(0)
    self.assertTrue(np.array_equal(array_via_get, array_via_alias))

    self.assertEqual(calculator.get_r2(0, 1), calculator.r2(0, 1))
def ld_matrix_example(ts):
    """
    Computes the r2 matrix of ``ts`` with tskit.LdCalculator and draws it
    as a heat map on a new square matplotlib figure with a colour bar.
    """
    r2 = tskit.LdCalculator(ts).r2_matrix()

    # Figure side: matrix size divided by the configured dpi, but never
    # smaller than the default figure width.
    default_width = pyplot.rcParams["figure.figsize"][0]
    side = max(r2.shape[0] / pyplot.rcParams["figure.dpi"], default_width)

    figure, axes = pyplot.subplots(figsize=(side, side))
    figure.tight_layout(pad=0)
    image = axes.imshow(
        r2, interpolation="none", vmin=0, vmax=1, cmap="Blues")

    # Strip ticks and the frame so that only the matrix is visible.
    axes.set_xticks([])
    axes.set_yticks([])
    for edge in ("top", "bottom", "left", "right"):
        axes.spines[edge].set_visible(False)

    pyplot.gcf().colorbar(image, shrink=0.5, pad=0)
def verify_max_mutations(self, ts):
    """
    Verifies that the max mutations parameter works as expected.

    For the middle mutation and each k below its index, the arrays
    returned in both directions must hold exactly k values that match
    the corresponding slice of the full r2 matrix.
    """
    all_mutations = list(ts.mutations())
    calculator = tskit.LdCalculator(ts)
    r2 = calculator.get_r2_matrix()
    focal = len(all_mutations) // 2
    for k in range(focal):
        forward = calculator.get_r2_array(focal, max_mutations=k)
        self.assertEqual(forward.shape[0], k)
        self.assertTrue(np.allclose(r2[focal, focal + 1:focal + 1 + k], forward))

        backward = calculator.get_r2_array(
            focal, max_mutations=k, direction=tskit.REVERSE)
        self.assertEqual(backward.shape[0], k)
        # The reverse array is compared flipped against the matrix slice.
        self.assertTrue(np.allclose(r2[focal, focal - k:focal], backward[::-1]))
def test_get_r2_array_multiple_instances(self):
    """
    Nominal threading case: every thread builds its own LdCalculator and
    computes one forward r2 array, which must match the reference matrix.
    """
    ts = self.get_tree_sequence()
    # The reference calculator is a temporary, so it is gone before any
    # worker thread starts.
    expected = tskit.LdCalculator(ts).get_r2_matrix()
    num_sites = expected.shape[0]

    def worker(thread_index, results):
        calculator = tskit.LdCalculator(ts)
        row = calculator.get_r2_array(thread_index)
        results[thread_index] = np.array(row)

    results = run_threads(worker, num_sites)
    for site in range(num_sites):
        assert np.allclose(results[site], expected[site, site + 1:])
def test_get_r2_single_instance(self):
    """
    Degenerate threading case: one LdCalculator is shared by all threads.
    Only one thread should be executing get_r2() at any one time, and
    each thread must still reproduce its row of the reference matrix.
    """
    ts = self.get_tree_sequence()
    shared_calculator = tskit.LdCalculator(ts)
    expected = shared_calculator.get_r2_matrix()
    num_sites = expected.shape[0]

    def worker(thread_index, results):
        # The row is stored in results first and then filled in place.
        r2_row = np.zeros(num_sites)
        results[thread_index] = r2_row
        for other in range(num_sites):
            r2_row[other] = shared_calculator.get_r2(thread_index, other)

    results = run_threads(worker, num_sites)
    for site in range(num_sites):
        assert np.allclose(results[site], expected[site])
def test_get_r2_array_single_instance(self):
    """
    Degenerate threading case: one LdCalculator is shared by all threads,
    so only one thread should be executing get_r2_array() at a time.

    The buffer is shared between the callers, so the returned values are
    essentially gibberish and nothing is asserted about them. The test
    only checks that nothing crashes and that each result has the
    expected length.
    """
    ts = self.get_tree_sequence()
    shared_calculator = tskit.LdCalculator(ts)
    num_mutations = ts.get_num_mutations()

    def worker(thread_index, results):
        r2_values = shared_calculator.get_r2_array(thread_index)
        results[thread_index] = r2_values.shape

    results = run_threads(worker, num_mutations)
    for index in range(num_mutations):
        assert results[index][0] == num_mutations - index - 1
def test_get_r2_multiple_instances(self):
    """
    Nominal threading case: every thread builds its own LdCalculator and
    fills one row of r2 values with get_r2(), which must match the
    reference matrix.
    """
    ts = self.get_tree_sequence()
    # The reference calculator is a temporary, so it is gone before any
    # worker thread starts.
    expected = tskit.LdCalculator(ts).get_r2_matrix()
    num_sites = expected.shape[0]

    def worker(thread_index, results):
        calculator = tskit.LdCalculator(ts)
        # The row is stored in results first and then filled in place.
        r2_row = np.zeros(num_sites)
        results[thread_index] = r2_row
        for other in range(num_sites):
            r2_row[other] = calculator.get_r2(thread_index, other)

    results = run_threads(worker, num_sites)
    for site in range(num_sites):
        self.assertTrue(np.allclose(results[site], expected[site]))
def main():
    """
    Command-line entry point: load a tree sequence, sample 10 individuals
    from one randomly chosen population, overlay mutations, and print the
    pairwise distance and r2 value for every pair of sites below 1e4.

    ``--output`` and ``--island`` are parsed but are not used anywhere in
    this function.
    """
    ## Define command line args
    parser = argparse.ArgumentParser(description="")
    parser.add_argument(
        "--trees",
        required=True,
        dest="trees",
        type=str,
        help="Coalescent trees for the simulation")
    parser.add_argument(
        "--output",
        required=True,
        dest="output",
        type=str,
        help="What name do you want to give to the output file? [DON'T GIVE FILE EXTENSION, THE SCRIPT MAKES TWO OUTPUT FILES")
    parser.add_argument(
        "--island",
        dest="island",
        action="store_true",
        help="Use this flag if the data comes from the Island model")
    args = parser.parse_args()

    # Window edges; only the (disabled) diversity plots ever used them.
    wins = list(range(0, 1000000 + 3000, 1000))

    ts = pyslim.load(args.trees)
    mut_rate = 1e-7  # HARD CODED, SO CHANGE IF NECESSARY

    # Group the individuals alive at time 0 by population, ignoring
    # population 999.
    pops = {}
    for individual_id in ts.individuals_alive_at(0):
        population = ts.individual(individual_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(individual_id)

    # Draw one population at random, then 10 of its individuals, and
    # collect the nodes of each sampled individual.
    sampled_indivs = []
    for population in np.random.choice(list(pops.keys()), 1, replace=False):
        members = np.array(pops[population])
        for individual_id in np.random.choice(members, 10, replace=False):
            sampled_indivs.extend(ts.individual(individual_id).nodes)
    r2_tree2 = ts.simplify(sampled_indivs)

    ## Sprinkle mutations onto the coalescent tree
    sprinkled = msprime.mutate(r2_tree2, rate=mut_rate, keep=True)

    positions = [
        variant.position
        for variant in sprinkled.variants()
        if variant.position < 1e4
    ]
    LD_interval = sprinkled.keep_intervals(np.array([[0, 20000]]))
    LD_mat = tskit.LdCalculator(LD_interval).r2_matrix()

    # Visit each unordered pair once (second index below the first).
    for i, position_i in enumerate(positions):
        for j in range(i):
            pw_distances_ij = abs(position_i - positions[j])
            if pw_distances_ij > 100000:
                continue
            print(pw_distances_ij, LD_mat[i, j])
def worker(thread_index, results):
    """
    Thread body: build a private LdCalculator for ``ts`` and fill one row
    of ``m`` values with get_r2(thread_index, j), stored in
    ``results[thread_index]``.
    """
    calculator = tskit.LdCalculator(ts)
    # The row is stored in results first and then filled in place.
    r2_row = np.zeros(m)
    results[thread_index] = r2_row
    for other in range(m):
        r2_row[other] = calculator.get_r2(thread_index, other)
def worker(thread_index, results):
    """
    Thread body: build a private LdCalculator for ``ts`` and store the
    array returned by get_r2_array(thread_index), converted with
    np.array, in ``results[thread_index]``.
    """
    calculator = tskit.LdCalculator(ts)
    r2_values = calculator.get_r2_array(thread_index)
    results[thread_index] = np.array(r2_values)
def getLD(tree_sequence, output_name):
    """
    Write within- and between-population LD summaries for ``tree_sequence``.

    Two files are produced, both with one ``distance,value`` pair per line:

    * ``"within_" + output_name``: for 10 individuals sampled from one
      random population, the r2 value (tskit.LdCalculator) of every pair
      of sites in each LD interval whose allele frequencies are both at
      least 0.1 and that are at most 10000 apart.
    * ``"between_" + output_name``: for 2 individuals sampled from each of
      20 random populations, the squared Pearson correlation of the
      per-population allele frequencies of every ordered pair of distinct
      sites whose mean frequency is at least 0.1.

    :param tree_sequence: tree sequence providing ``individuals_alive_at``,
        ``individual`` and ``simplify``.
    :param output_name: suffix of the two output file names.
    :return: None.
    """
    # Group the individuals alive at time 0 by population, ignoring
    # population 999.
    pops = {}
    for individual_id in tree_sequence.individuals_alive_at(0):
        population = tree_sequence.individual(individual_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(individual_id)

    # Within-population sample: 10 individuals from 1 random population.
    sampled_indivs_1 = []
    for i in np.random.choice(list(pops.keys()), 1, replace=False):
        for j in np.random.choice(np.array(pops[i]), 10, replace=False):
            sampled_indivs_1.extend(tree_sequence.individual(j).nodes)

    # Between-population sample: diploids_per_pop individuals from each of
    # 20 random populations.
    diploids_per_pop = 2
    sampled_indivs_2 = []
    for i in np.random.choice(list(pops.keys()), 20, replace=False):
        for j in np.random.choice(
                np.array(pops[i]), diploids_per_pop, replace=False):
            sampled_indivs_2.extend(tree_sequence.individual(j).nodes)

    ## Sprinkle mutations onto the simplified trees
    r2_tree_1 = tree_sequence.simplify(sampled_indivs_1)
    mut_tree_1 = msprime.mutate(r2_tree_1, 1e-7)
    r2_tree_2 = tree_sequence.simplify(sampled_indivs_2)
    mut_tree_2 = msprime.mutate(r2_tree_2, 1e-7)

    # Only this interval is analysed. An earlier list of five intervals
    # between 9000000 and 9810000 was overwritten before use and has been
    # dropped.
    LD_intervals = [[7260000, 7270000 - 1]]

    # "with" guarantees the file is closed even if a step below raises.
    with open("within_" + output_name, "w") as LD_output_within:
        for interval in LD_intervals:
            positions = [
                variant.position
                for variant in mut_tree_1.variants()
                if interval[0] <= variant.position <= interval[1]
            ]
            LD_interval = mut_tree_1.keep_intervals(np.array([interval]))
            LD_mat = tskit.LdCalculator(LD_interval).r2_matrix()

            freq_dict = {}
            for variant in LD_interval.variants():
                ## Calculating the genotype freqs this way assumes that the
                ## two alleles are coded as 1s and 0s. Consecutive pairs of
                ## genotypes are averaged as one diploid.
                allele_freq = ((variant.genotypes[::2] +
                                variant.genotypes[1::2]) / 2).mean()
                freq_dict[variant.position] = allele_freq
            print(len(freq_dict.keys()))
            print(len(positions))

            # Visit each unordered pair once (second index below the first).
            for i, position_i in enumerate(positions):
                for j in range(i):
                    position_j = positions[j]
                    if freq_dict[position_i] < 0.1 or freq_dict[position_j] < 0.1:
                        continue
                    pw_distances_ij = abs(position_i - position_j)
                    if pw_distances_ij > 10000:
                        continue
                    LD_output_within.write(",".join(
                        [str(int(pw_distances_ij)),
                         str(LD_mat[i, j])]) + "\n")

    print("Now getting 'LD' between pops")

    with open("between_" + output_name, "w") as LD_output_between:
        for interval in LD_intervals:
            LD_interval = mut_tree_2.keep_intervals(np.array([interval]))
            freq_dict = {}
            for variant in LD_interval.variants():
                ## Same 1/0 allele coding assumption as above.
                genotypes_as_freqs = (variant.genotypes[::2] +
                                      variant.genotypes[1::2]) / 2
                # NOTE(review): chunking by diploids_per_pop presumably
                # relies on samples being ordered population by population;
                # confirm against the simplify() call above.
                freqs_per_pop = [
                    chunk.mean()
                    for chunk in chunks(genotypes_as_freqs,
                                        int(diploids_per_pop))
                ]
                if sum(freqs_per_pop) / len(freqs_per_pop) < 0.1:
                    continue
                freq_dict[variant.position] = freqs_per_pop

            # Ordered pairs: every pair of distinct sites is written twice,
            # once in each order.
            for pair in itertools.product(freq_dict.keys(), repeat=2):
                pw_distances_ij = abs(pair[0] - pair[1])
                if pw_distances_ij == 0:
                    continue
                correlation = scipy.stats.pearsonr(
                    freq_dict[pair[0]], freq_dict[pair[1]])[0]
                LD_output_between.write(",".join(
                    [str(int(pw_distances_ij)),
                     str(correlation**2)]) + "\n")
Ne=Ne, recombination_map=recombination_map, model="dtwf", mutation_rate=mutation_rate) first_chrom2 = 'not assigned' n1 = 0 n2 = 0 for variant in tree_sequence.variants(): if variant.position > 1e8: first_chrom2 = variant.index n2 += 1 if variant.position < (1e8 - 1): n1 += 1 r2 = tskit.LdCalculator(tree_sequence).r2_matrix() allele_frequency_filter = Waples2011.get_allele_filter( tree_sequence, 0.2, S) chrom2_filter = np.logical_and(allele_frequency_filter, np.append(np.zeros(n1), np.ones(n2))) chrom1_filter = np.logical_and(allele_frequency_filter, np.append(np.ones(n1), np.zeros(n2))) filtered_matrix = r2[chrom1_filter][:, chrom2_filter] obs_r2_drift = np.mean(Waples2011.get_r2_drift(filtered_matrix, S)) obs_r2_uncorrected = np.mean(filtered_matrix) N_prediction = 1 / (3 * obs_r2_drift) r2_corrected_list.append(obs_r2_drift) r2_observed_list.append(obs_r2_uncorrected)
def get_LD_matrix(tree_sequence):
    """Return the r2 matrix computed by tskit.LdCalculator for ``tree_sequence``."""
    ld_calculator = tskit.LdCalculator(tree_sequence)
    return ld_calculator.r2_matrix()
def myLDcalc(tree_sequence, ):
    """
    Compute two r2 values by hand from the genotypes of sites 0 and 1.

    Haplotype frequencies are derived from the two genotype arrays, their
    sum is printed as a sanity check, and the pair
    ``(r_unphased ** 2, r2_phased)`` is returned.
    """
    # Pick up the genotypes of the first two sites and stop iterating.
    for variant in tree_sequence.variants():
        site_id = variant.site.id
        if site_id == 0:
            gen1 = variant.genotypes
        elif site_id == 1:
            gen2 = variant.genotypes
            break

    n = len(gen1)
    not_gen1 = 1 - gen1
    not_gen2 = 1 - gen2

    # Marginal allele frequencies.
    pA = sum(gen1) / n
    pB = sum(gen2) / n
    # Frequencies of the four two-site haplotypes.
    pAB = sum(np.logical_and(gen1, gen2)) / n
    pab = sum(np.logical_and(not_gen1, not_gen2)) / n
    pAb = sum(np.logical_and(gen1, not_gen2)) / n
    paB = sum(np.logical_and(not_gen1, gen2)) / n
    print(sum([pAB, pAb, paB, pab]))

    variance_product = pA * (1 - pA) * pB * (1 - pB)
    r_unphased = (pAB - pA * pB) / np.sqrt(variance_product)
    r2_phased = (pAB * pab - pAb * paB)**2 / variance_product
    return r_unphased**2, r2_phased


# List every site, then compare the library r2 values with the manual ones.
for site_variant in tree_sequence.variants():
    site = site_variant.site
    print(site.id, site.position, site_variant.alleles, sep='\t')

print(count_mutations(tree_sequence))
print(tskit.LdCalculator(tree_sequence).r2_matrix())
print(tskit.LdCalculator(tree_sequence).r2(0, 1))
print(myLDcalc(tree_sequence))
def get_LD_estimate(Ne, S, n_subpops, m, rate=False, n_loci=100):
    """
    Simulate a two-segment genome and return ``(Ne_est, rate)``.

    Trees are simulated under the "dtwf" model with samples taken from
    the first population only, mutations are then overlaid with
    msprime.mutate, and the mean r2 between frequency-filtered sites on
    the two sides of position 1e8 is turned into an estimate
    ``1 / (3 * r2_drift)``.

    If ``rate`` is falsy, a mutation rate is derived from the simulated
    trees as ``n_loci`` divided by the summed (tree length * total branch
    length); the rate actually used is returned alongside the estimate.
    """
    # set up population parameters
    ## migration matrix
    migration = get_migration_matrix(m, n_subpops)

    ## sample: S samples from the first population, none from the others
    population_configurations = [msprime.PopulationConfiguration(sample_size=S)]
    population_configurations += [
        msprime.PopulationConfiguration(sample_size=0)
        for _ in range(n_subpops - 1)
    ]

    ## mutation and recombination : adjust based on Ne to get same number of mutations!
    mutation_rate = 0 * 5e-9 / 40
    recom_rate = 1e-8
    # Two segments of 1e8, separated by a stretch with rate 0.5.
    positions = [0, 1e8 - 1, 1e8, 2e8 - 1]
    rates = [recom_rate, 0.5, recom_rate, 0]
    recombination_map = msprime.RecombinationMap(
        positions=positions, rates=rates, num_loci=int(positions[-1]))

    tree_sequence = msprime.simulate(
        Ne=Ne,
        model="dtwf",
        population_configurations=population_configurations,
        migration_matrix=migration,
        recombination_map=recombination_map,
        mutation_rate=mutation_rate)

    if not rate:
        print("Calculating rate...")
        # Accumulated with an explicit loop, term by term.
        L = 0
        for tree in tree_sequence.trees():
            L += tree.get_length() * tree.get_total_branch_length()
        rate = n_loci / L

    tree_sequence = msprime.mutate(
        tree_sequence,
        rate=rate,
        random_seed=None,
        model=None,
        keep=False,
        start_time=None,
        end_time=None)

    # Count the variants on each side of the high-recombination stretch.
    first_chrom2 = 'not assigned'
    n1 = 0
    n2 = 0
    for variant in tree_sequence.variants():
        if variant.position > 1e8:
            first_chrom2 = variant.index
            n2 += 1
        if variant.position < (1e8 - 1):
            n1 += 1

    r2 = tskit.LdCalculator(tree_sequence).r2_matrix()

    # Mask selecting only pairs (first-segment site, second-segment site).
    total = n1 + n2
    fil = np.zeros((total, total))
    fil[:n1, n1:] = 1
    # Drop the row and column of every variant whose summed genotype
    # fraction is below 0.05.
    for index, v in enumerate(tree_sequence.variants()):
        if sum(v.genotypes / len(v.genotypes)) < 0.05:
            fil[:, index] = 0
            fil[index, :] = 0
    fil = fil.astype(int)

    r2 = np.where(fil, r2, np.zeros_like(r2))
    r2_mean = np.sum(r2) / np.sum(fil)
    # Subtract the term that depends only on the sample size S.
    r2_drift = r2_mean - (1 / (1 * S) / (1 - 1 / (1 * S)))
    Ne_est = 1 / (3 * r2_drift)
    return Ne_est, rate
def getLD(tree_sequence, output_name, selection=False):
    """
    Write pairwise r2 values for a set of 10 kb intervals to ``output_name``.

    Ten individuals are sampled from one randomly chosen population, the
    tree sequence is simplified to their nodes, mutations are overlaid
    with msprime.mutate, and for each interval every pair of sites at most
    10000 apart is written as ``interval_index,distance,r2``.

    :param tree_sequence: tree sequence providing ``variants``,
        ``individuals_alive_at``, ``individual`` and ``simplify``.
    :param output_name: path of the output file.
    :param selection: when true, intervals are derived from the positions
        of variants in ``tree_sequence`` whose genotype sum is at least
        4000; otherwise a fixed list of five intervals is used.
    :return: None.
    """
    if selection:
        # A rough way of removing low frequency alleles: keep only the
        # variants whose genotype sum reaches 4000.
        # NOTE(review): round() snaps to the NEAREST multiple of 10000, so
        # the interval [s, s + 9999] may not contain the variant itself;
        # confirm that flooring is not what was intended.
        sel_pos_raw = [
            round(v.position / 10000) * 10000
            for v in tree_sequence.variants()
            if v.genotypes.sum() >= 4000
        ]
        LD_intervals = [[s, s + 9999] for s in list(set(sel_pos_raw))]
    else:
        ## these genes always evolve neutrally
        LD_intervals = [[9000000, 9010000 - 1], [9200000, 9210000 - 1],
                        [9400000, 9410000 - 1], [9600000, 9610000 - 1],
                        [9800000, 9810000 - 1]]
    print(LD_intervals)

    # Group the individuals alive at time 0 by population, ignoring
    # population 999.
    pops = {}
    for individual_id in tree_sequence.individuals_alive_at(0):
        population = tree_sequence.individual(individual_id).population
        if population == 999:
            continue
        pops.setdefault(population, []).append(individual_id)

    # Sample 10 individuals from 1 random population.
    sampled_indivs = []
    for i in np.random.choice(list(pops.keys()), 1, replace=False):
        for j in np.random.choice(np.array(pops[i]), 10, replace=False):
            sampled_indivs.extend(tree_sequence.individual(j).nodes)

    ## Sprinkle mutations onto the simplified tree
    r2_tree2 = tree_sequence.simplify(sampled_indivs)
    mut_tree = msprime.mutate(r2_tree2, 1e-7)

    # "with" guarantees the file is closed even if a step below raises.
    with open(output_name, "w") as LD_output:
        for inter, interval in enumerate(LD_intervals):
            positions = [
                variant.position
                for variant in mut_tree.variants()
                if interval[0] <= variant.position <= interval[1]
            ]
            LD_interval = mut_tree.keep_intervals(np.array([interval]))
            LD_mat = tskit.LdCalculator(LD_interval).r2_matrix()

            # Visit each unordered pair once (second index below the first).
            for i, position_i in enumerate(positions):
                for j in range(i):
                    pw_distances_ij = abs(position_i - positions[j])
                    if pw_distances_ij > 10000:
                        continue
                    LD_output.write(",".join(
                        [str(inter),
                         str(pw_distances_ij),
                         str(LD_mat[i, j])]) + "\n")