def test_rogers_huff_r(self):
    """Exercise ``allel.rogers_huff_r`` on small hand-checked genotype matrices."""

    # Two identical rows: a single coefficient equal to 1.
    result = allel.rogers_huff_r([[0, 1, 2], [0, 1, 2]])
    eq(1., result)

    # Mirrored rows: a single coefficient equal to -1.
    result = allel.rogers_huff_r([[0, 1, 2], [2, 1, 0]])
    eq(-1., result)

    # Rows with no variation at all: the coefficient is NaN.
    result = allel.rogers_huff_r([[0, 0, 0], [1, 1, 1]])
    assert np.isnan(result)

    # Unrelated rows: coefficient of zero.
    result = allel.rogers_huff_r([[0, 1, 0, 1], [0, 1, 1, 0]])
    eq(0, result)

    # Rows containing -1 (presumably a missing call) still give r == 1
    # from the remaining columns.
    for gn in (
        [[0, 1, 2, -1], [0, 1, 2, 2]],
        [[0, 1, 2, 2], [0, 1, 2, -1]],
        [[0, 1, 2], [0, 1, -1]],
    ):
        result = allel.rogers_huff_r(gn)
        eq(1., result)

    # Three rows: one coefficient per pair, returned as a flat array.
    result = allel.rogers_huff_r([[0, 2], [2, 0], [0, 1]])
    assert_array_close([-1, 1, -1], result)

    # Four rows (one containing -1): six pairwise coefficients.
    result = allel.rogers_huff_r(
        [[0, 2, 0], [0, 2, 0], [2, 0, 2], [0, 2, -1]])
    assert_array_close([1, -1, 1, -1, 1, -1], result)
def binned_ld(genotypes, positions, window_size, num_bins=20):
    """Average pairwise r^2 in logarithmically spaced distance bins.

    The positions are cut into consecutive windows of ``window_size``
    starting at ``positions[0]``. Within each window the Rogers-Huff r is
    computed for every pair of variants, squared, and filed into a bin
    chosen from the log of the distance between the two variants. NaN
    values are skipped.

    Returns
    -------
    (sizes, binned_r_squared)
        ``sizes`` holds one value ``base ** i`` per bin and
        ``binned_r_squared`` the mean r^2 of each bin (``np.mean`` of an
        empty list for bins that received no pair).
    """
    exponent_start = 8
    # base ** (exponent_start + num_bins) == window_size, so distances
    # inside one window never index past the last bin.
    base = np.exp(np.log(window_size) / (exponent_start + num_bins))
    log_base = np.log(base)
    r2_by_bin = {b: [] for b in range(num_bins)}

    for lower in range(positions[0], positions[-1], window_size):
        upper = lower + window_size
        in_window = np.logical_and(positions >= lower, positions < upper)
        pos_w = positions[in_window]
        gn_w = genotypes[in_window]
        n_in_window = len(pos_w)
        if n_in_window == 0:
            continue

        r2_matrix = squareform(allel.rogers_huff_r(gn_w) ** 2)
        for a in range(n_in_window):
            for b in range(a + 1, n_in_window):
                value = r2_matrix[a, b]
                if np.isnan(value):
                    continue
                dist = np.abs(pos_w[b] - pos_w[a])
                # Distances below base ** exponent_start all land in bin 0.
                raw_bin = np.floor(np.log(dist) / log_base - exponent_start)
                r2_by_bin[int(max(raw_bin, 0))].append(value)

    first = exponent_start + 1
    sizes = [base ** e for e in range(first, first + num_bins)]
    binned_r_squared = [np.mean(r2_by_bin[b]) for b in range(num_bins)]
    return sizes, binned_r_squared
def calcAndAppendStatValDiplo(alleleCounts, snpLocs, statName, subWinStart, subWinEnd, statVals,
                              instanceIndex, subWinIndex, genosInSubWin, unmasked):
    """Compute one diploid summary statistic for a sub-window and append it.

    ``statName`` selects the statistic. The value is appended to
    ``statVals[<name>][instanceIndex]``; some branches also append
    companion statistics (nDiplos -> diplo_H1/H12/H2/H1, diplo_ZnS ->
    diplo_Omega, distVar -> distSkew/distKurt). The last branch appends
    nothing: it only checks that a companion statistic has already been
    filled for this sub-window and exits the program otherwise.
    Nothing is returned; ``statVals`` is mutated in place.
    """
    nAltCounts = genosInSubWin.to_n_alt()
    window = {"start": subWinStart, "stop": subWinEnd}

    if statName == "tajD":
        value = allel.stats.diversity.tajima_d(
            alleleCounts, pos=snpLocs, **window)
        statVals[statName][instanceIndex].append(value)

    elif statName == "pi":
        value = allel.stats.diversity.sequence_diversity(
            snpLocs, alleleCounts, is_accessible=unmasked, **window)
        statVals[statName][instanceIndex].append(value)

    elif statName == "thetaW":
        value = allel.stats.diversity.watterson_theta(
            snpLocs, alleleCounts, is_accessible=unmasked, **window)
        statVals[statName][instanceIndex].append(value)

    elif statName == "thetaH":
        value = thetah(
            snpLocs, alleleCounts, is_accessible=unmasked, **window)
        statVals[statName][instanceIndex].append(value)

    elif statName == "fayWuH":
        # Needs thetaH and pi to have been appended for this sub-window already.
        thetaHVal = statVals["thetaH"][instanceIndex][subWinIndex]
        piVal = statVals["pi"][instanceIndex][subWinIndex]
        statVals[statName][instanceIndex].append(thetaHVal - piVal)

    elif statName == "HapCount":
        # NOTE(review): `hapsInSubWin` is neither a parameter nor a local of
        # this function; unless it exists at module level this branch raises
        # NameError. Confirm what was intended.
        statVals[statName][instanceIndex].append(len(hapsInSubWin.distinct()))

    elif statName == "nDiplos":
        freqSpec = shicstats.getHaplotypeFreqSpec(nAltCounts)
        # The entry at index <number of columns> is reported as nDiplos;
        # the final entry is dropped before the H statistics are computed.
        statVals["nDiplos"][instanceIndex].append(freqSpec[nAltCounts.shape[1]])
        freqSpec = freqSpec[:-1]
        dh1 = garudH1(freqSpec)
        dh2 = garudH2(freqSpec)
        dh12 = garudH12(freqSpec)
        if "diplo_H1" in statVals:
            statVals["diplo_H1"][instanceIndex].append(dh1)
        if "diplo_H12" in statVals:
            statVals["diplo_H12"][instanceIndex].append(dh12)
        if "diplo_H2/H1" in statVals:
            statVals["diplo_H2/H1"][instanceIndex].append(dh2 / dh1)

    elif statName == "diplo_ZnS":
        if nAltCounts.shape[0] == 1:
            # A single SNP has no pairs: both LD statistics are set to zero.
            statVals["diplo_ZnS"][instanceIndex].append(0.0)
            statVals["diplo_Omega"][instanceIndex].append(0.0)
        else:
            rValues = allel.rogers_huff_r(nAltCounts, fill=0.0)
            # NOTE(review): this averages r itself, not r ** 2; only the
            # omega input below is squared. Confirm this is intended.
            statVals["diplo_ZnS"][instanceIndex].append(np.nanmean(rValues))
            rSquaredMatrix = squareform(rValues ** 2)
            omegaVal = shicstats.omega(rSquaredMatrix)[0]
            statVals["diplo_Omega"][instanceIndex].append(omegaVal)

    elif statName == "distVar":
        nAccessible = unmasked[subWinStart - 1:subWinEnd].count(True)
        dists = shicstats.pairwiseDiffsDiplo(nAltCounts) / float(nAccessible)
        statVals["distVar"][instanceIndex].append(np.var(dists, ddof=1))
        statVals["distSkew"][instanceIndex].append(scipy.stats.skew(dists))
        statVals["distKurt"][instanceIndex].append(scipy.stats.kurtosis(dists))

    elif statName in ["diplo_H12", "diplo_H123", "diplo_H2/H1", "distVar",
                      "distSkew", "distKurt", "diplo_Omega"]:
        # These are filled as a side effect of another branch; by now there
        # must be exactly one value per sub-window processed so far.
        if len(statVals[statName][instanceIndex]) != subWinIndex + 1:
            print(statName, instanceIndex, subWinIndex+1)
            print(statVals["diplo_H1"][instanceIndex],
                  statVals["diplo_H12"][instanceIndex])
            sys.exit()
def linkage_disequilibrium(ts, span=40000, bins=20, min_obs_per_bin=8, max_sequence_length=1e6):
    """
    R^2 as a function of site-separation distance, for `bins` bins up to a
    site-separation distance of `span` bp.

    Parameters
    ----------
    ts :
        Tree sequence; only ``sequence_length``, ``keep_intervals``,
        ``sites``, ``num_sites`` and ``genotype_matrix`` are used.
    span :
        Largest site separation (exclusive) that is binned.
    bins :
        Number of equal-width distance bins covering ``[0, span)``.
    min_obs_per_bin :
        Minimum number of sites needed to compute anything, and minimum
        number of non-NaN pairs a bin needs before its mean is reported.
    max_sequence_length :
        Longer tree sequences are truncated to ``[0, max_sequence_length)``.

    Returns
    -------
    dict
        One entry per bin, keyed by a LaTeX label for the bin's distance
        range in kb, holding the mean R^2 of the bin or NaN.
    """
    if ts.sequence_length > max_sequence_length:
        ts = ts.keep_intervals([(0, max_sequence_length)], record_provenance=False)
    position = [site.position for site in ts.sites()]
    num_sites = len(position)
    assert num_sites == int(ts.num_sites)
    nans = np.full(bins, np.nan)

    if num_sites >= min_obs_per_bin:
        gts = np.expand_dims(ts.genotype_matrix(), axis=-1)
        gn = allel.GenotypeArray(gts, dtype="i1").to_n_alt()
        # rogers_huff_r returns a bare scalar for exactly two sites;
        # atleast_1d keeps len() and indexing valid in that case.
        ld = np.atleast_1d(allel.rogers_huff_r(gn) ** 2)
        assert len(ld) == num_sites * (num_sites - 1) // 2

        # Bin the pairwise site R^2 in `ld` by site separation distance.
        r2 = np.zeros(bins)
        n = np.zeros(bins)
        for j in range(num_sites):
            # Offset of pair (j, j + 1) in the condensed pair vector. It is
            # recomputed for each row because the early `break` below skips
            # pairs; a running counter would then read the R^2 of the
            # wrong pair for every later row.
            row_start = j * num_sites - j * (j + 1) // 2
            for k in range(j + 1, num_sites):
                distance = position[k] - position[j]
                if distance >= span:
                    # Relies on sites being in increasing position order:
                    # every later k is at least as far away.
                    break
                value = ld[row_start + (k - j - 1)]
                if not np.isnan(value):
                    index = int(distance * bins / span)
                    r2[index] += value
                    n[index] += 1
        # Divide `r2` by `n`, but return NaN where n has insufficient observations.
        r2 = np.divide(r2, n, out=nans, where=n >= min_obs_per_bin)
    else:
        # Too few segregating sites to do anything meaningful.
        # LD plots may be blank.
        r2 = nans

    # Raw f-strings yield the same keys as before without relying on
    # invalid escape sequences (\D, \i, \,).
    return {
        rf"$\Delta$bp$\in[{span*k/bins/1000:.0f}\,$k$,"
        rf"{span*(k+1)/bins/1000:.0f}\,$k$)$": r2[k]
        for k in range(bins)
    }
def ld(synthetic_population_code, synthetic_genotypes, reference_genotypes, synthetic_positions, reference_positions, reference_samples, classification_map, window_size=2e5):
    """Plot binned and pairwise LD of synthetic vs. reference genotypes.

    Two figures are written under ``FIGURES_DIR``:
    ``<code>.binned_ld.png`` (mean r^2 per distance bin for both data sets)
    and ``<code>.pairwise_ld.png`` (pairwise r^2 inside one randomly placed
    window, restricted to positions present in both data sets).
    The global NumPy random state is reseeded with ``SEED``.
    """
    window_size = int(window_size)
    code = synthetic_population_code

    # Keep only the reference samples labelled with the synthetic population.
    reference_population_labels = np.array(
        [classification_map.loc[sample]['population'] for sample in reference_samples])
    in_population = reference_population_labels == code
    original_reference_genotypes = reference_genotypes[:, in_population]

    # Work on copies, converted to alt-allele counts, with fixed sites removed.
    synthetic_n_alt = allel.GenotypeArray(np.copy(synthetic_genotypes)).to_n_alt()
    synthetic_genotypes, synthetic_positions = remove_fixed_sites(
        synthetic_n_alt, np.copy(synthetic_positions))
    reference_n_alt = allel.GenotypeArray(np.copy(original_reference_genotypes)).to_n_alt()
    reference_genotypes, reference_positions = remove_fixed_sites(
        reference_n_alt, np.copy(reference_positions))

    # plot binned ld
    plt.title('Binned Linkage Disequilibrium')
    curves = (
        (synthetic_genotypes, synthetic_positions, f'Synthetic {code}'),
        (reference_genotypes, reference_positions, f'{code}'),
    )
    for curve_gn, curve_pos, curve_label in curves:
        sizes, binned_r_squared = binned_ld(curve_gn, curve_pos, window_size)
        plt.plot(sizes, binned_r_squared, label=curve_label)
    plt.xlabel('Distance (bp)')
    plt.ylabel('LD (r squared)')
    plt.xscale('log')
    plt.legend()
    plt.savefig(os.path.join(FIGURES_DIR, f'{code}.binned_ld.png'))
    plt.close(plt.gcf())

    # plot pairwise ld
    np.random.seed(SEED)
    window_start = np.random.randint(synthetic_positions[0], synthetic_positions[-1] - window_size)
    window_stop = window_start + window_size

    def shared_in_window(own_positions, other_positions):
        # Positions inside the window that also occur in the other data set.
        inside = np.logical_and(own_positions >= window_start, own_positions < window_stop)
        return np.logical_and(inside, np.isin(own_positions, other_positions))

    synthetic_window_indices = shared_in_window(synthetic_positions, reference_positions)
    reference_window_indices = shared_in_window(reference_positions, synthetic_positions)
    synthetic_window_gn = synthetic_genotypes[synthetic_window_indices]
    reference_window_gn = reference_genotypes[reference_window_indices]

    synthetic_r = allel.rogers_huff_r(synthetic_window_gn)
    reference_r = allel.rogers_huff_r(reference_window_gn)
    synthetic_r_squared_matrix = squareform(synthetic_r ** 2)
    reference_r_squared_matrix = squareform(reference_r ** 2)

    ax = plot_pairwise_ld(
        synthetic_r_squared_matrix,
        reference_r_squared_matrix,
        colorbar=True,
        imshow_kwargs={'cmap': 'cividis'},
    )
    plt.title(f'SNP Correlation in {window_size // 1000}kb Window')
    plt.savefig(os.path.join(FIGURES_DIR, f'{code}.pairwise_ld.png'))
    plt.close(plt.gcf())
def linkage_disequilibrium(ts, span=2 * 10**5, bins=50, min_obs_per_bin=8):
    """
    Average R^2 in `bins` bins over the first `span` bases of ts.

    The tree sequence is truncated to ``[0, span)`` (``span`` is first
    clamped to ``ts.sequence_length``). All site pairs are then binned by
    separation distance, and a bin's mean R^2 is reported only if it holds
    at least ``min_obs_per_bin`` non-NaN pairs.

    Returns
    -------
    dict
        Four entries: the mean R^2 (or NaN) of bin 0 and of the bins at
        ``bins // 8``, ``bins // 4`` and ``bins // 2``, keyed by a label
        giving the corresponding distance in kb.
    """
    span = min(ts.sequence_length, span)
    ts = ts.keep_intervals([(0, span)], record_provenance=False)
    position = [site.position for site in ts.sites()]
    num_sites = len(position)
    assert num_sites == int(ts.num_sites)
    nans = np.full(bins, np.nan)

    if num_sites >= min_obs_per_bin:
        gts = np.expand_dims(ts.genotype_matrix(), axis=-1)
        gn = allel.GenotypeArray(gts, dtype='i1').to_n_alt()
        # rogers_huff_r returns a bare scalar for exactly two sites;
        # atleast_1d keeps len() and indexing valid in that case.
        ld = np.atleast_1d(allel.rogers_huff_r(gn) ** 2)
        assert len(ld) == num_sites * (num_sites - 1) // 2

        # Bin the pairwise site R^2 in `ld` by site separation distance.
        r2 = np.zeros(bins)
        n = np.zeros(bins)
        i = 0
        for j in range(num_sites):
            for k in range(j + 1, num_sites):
                distance = position[k] - position[j]
                index = int(distance * bins / span)
                if not np.isnan(ld[i]):
                    r2[index] += ld[i]
                    n[index] += 1
                i += 1
        # divide `r2` by `n`, but return NaN where n has insufficient observations.
        r2 = np.divide(r2, n, out=nans, where=n >= min_obs_per_bin)
    else:
        # Too few segregating sites to do anything meaningful.
        # LD plots may be blank.
        r2 = nans

    # `span` may have become a float through the clamp above, which would
    # turn the labels into e.g. "4.0k" instead of "4k". Format from an
    # integer so the keys are the same for every input.
    span_bp = int(span)
    a = f"{span_bp//bins//1000}k"  # width of one bin, in kb
    b = f"{span_bp//8//1000}k"
    c = f"{span_bp//4//1000}k"
    d = f"{span_bp//2//1000}k"
    return {
        f"$R^2$[<{a}]": r2[0],
        f"$R^2$[{b}]": r2[bins // 8],
        f"$R^2$[{c}]": r2[bins // 4],
        f"$R^2$[{d}]": r2[bins // 2]
    }
def plot_ld(gn, title):
    """Draw the pairwise LD (squared Rogers-Huff r) of ``gn`` under ``title``."""
    r_squared = allel.rogers_huff_r(gn) ** 2
    allel.plot_pairwise_ld(r_squared).set_title(title)
def ld_prune(gn, variants, cadd, thold):
    """
    Method used inside goPDX class (filteringSNPS)

    input: subset of the gn, variants and cadd associated to this subset
    output: subset of the input subset without high correlated snps and
    cadd above 1

    Variants are visited in order. For each variant not yet handled, the
    unhandled variants whose r^2 with it exceeds ``thold`` form a group
    (NaN r^2 counts as 1). When the group has more than one such partner,
    only the member with the highest cadd is kept, and only if that cadd
    is above 1; the whole group is then marked as handled. Otherwise the
    variant itself is kept.

    The three returned objects are all in increasing original row order,
    so row ``i`` of ``gn`` corresponds to ``variants[i]`` and ``cadd[i]``.
    """
    import allel
    # https://en.wikipedia.org/wiki/Linkage_disequilibrium

    # Estimate the linkage disequilibrium parameter r for each pair of variants
    r = allel.rogers_huff_r(gn)
    correlations = squareform(r ** 2)
    del r
    # Undefined correlations are treated as perfect linkage.
    correlations[np.isnan(correlations)] = 1

    n_variants = len(variants)
    all_indices = np.arange(n_variants)
    # Boolean bookkeeping replaces the former `done` list, which cost a
    # linear scan and a set rebuild on every iteration.
    done = np.zeros(n_variants, dtype=bool)
    keep = []

    for v_ in range(n_variants):
        if done[v_]:
            continue

        # Highly correlated variants that have not been handled yet.
        linked = np.greater(correlations[:, v_], thold) & ~done

        # NOTE(review): the diagonal of `correlations` is 0, so `v_` itself
        # is never counted here; `> 1` therefore requires at least two
        # partners and a lone correlated pair is kept whole. Confirm this
        # is intended.
        if linked.sum() > 1:
            v_ind = np.append(all_indices[linked], v_)
            v_cadd = np.append(cadd[linked], cadd[v_])
            # keeping only the snp with highest cadd
            # if all less than 1, keep none
            if v_cadd.max() > 1:
                keep.append(int(v_ind[np.argmax(v_cadd)]))
            done[v_ind] = True
        else:
            keep.append(v_)
            done[v_] = True

    # `keep` is filled in discovery order, which is not necessarily
    # increasing. compress() below always yields increasing row order, so
    # sort before indexing to keep gn, variants and cadd aligned.
    keep = sorted(keep)

    # Filtering final results on the subset to output
    # ADD FUNCTION TO KEEP KNOWN ELEMENTS HERE
    loc_unlinked = np.zeros(n_variants, dtype=bool)
    loc_unlinked[keep] = True

    gn = gn.compress(loc_unlinked, axis=0)
    variants = variants[keep]
    cadd = cadd[keep]

    return gn, variants, cadd
def plot_ld(gn, title, filename):
    """Plot pairwise LD (squared Rogers-Huff r) of ``gn`` and save it.

    The figure gets ``title`` and is written to ``filename`` inside the
    ``pcafP`` directory with a tight bounding box.
    """
    r_squared = al.rogers_huff_r(gn) ** 2
    axes = al.plot_pairwise_ld(r_squared)
    axes.set_title(title)
    target = os.path.join(pcafP, filename)
    axes.figure.savefig(target, bbox_inches='tight')
def LD(haplotype,
       pos_vec,
       size_chr,
       circular=True,
       distance_bins=None,
       gaps_type="short",
       min_SNP_pairs=300):
    """
    Compute LD for a subset of SNPs drawn with different gap sizes in between them.
    Gap sizes follow power 2 distribution.
    The LD is then computed and averaged over different bin (distance_bins) sizes.

    Parameters
    ----------
    haplotype : numpy 2D array or allel.haplotype
        SNP matrix where in the first dimension are the SNP (rows) and
        in the second dimension (columns) are the samples.
    pos_vec : 1D array
        array of absolute positions in [0, size_chr].
    size_chr : int
        Size of the chromosome.
    circular : bool
        Whether to consider the chromosome circular or not.
        If circular, the maximum distance between 2 SNPs is thus half the chromosome.
    distance_bins : int or list
        LD will be averaged by bins of distances
        e.g. if distance_bins = [0, 100, 1000, 10000], LD will be averaged for the groups [0,100[, [100, 1000[, and [1000, 10000[
        If distance_bins is an int, it defines the number of bins of distances for which to compute the LD
            The bins are created in a logspace
        If distance_bins is a list, they will be used instead
    gaps_type: str
        Pairs of SNP considered are separated by a given number (gap) of columns. Not all pairs are considered.
        By default (`short`), gaps are power of 2 up to the closest power of 2 of the number of SNP.
        Meaning that most of the comparisons will be done on close SNPs (short distance).
        If one wants to sample more at large distance (to test for circularity for instance), use `long` instead of `short`
        Using `long` will add gaps like: n_SNP - gaps. It will take more time to run.
    min_SNP_pairs: int
        Minimum number of pairs of SNP to consider for a given gap size.
        If the gap size is big enough such that there is less than `min_SNP_pairs` possible pairs,
        then all pairs are considered.

    Returns
    -------
    DataFrame
        Table with the distance bins (`dist_group`) as index and, for each
        bin, the columns `mean_dist` (mean SNP distance), `mean_r2` (mean
        r^2), `Count` (number of r^2 values) and `sem_r2` (standard error
        of the mean of r^2). Rows containing NaN are dropped before
        aggregating.
    """
    if distance_bins is None or isinstance(distance_bins, int):
        if isinstance(distance_bins, int):
            n_bins = distance_bins - 3
        else:
            n_bins = 17
        # On a circular chromosome two SNPs are at most half a chromosome apart.
        max_dist = size_chr // 2 if circular else size_chr
        distance_bins = np.logspace(2, np.log10(max_dist), n_bins)
        distance_bins = np.insert(
            distance_bins, 0, [0, 25, 50, 75])  # add bins at short distances

    n_SNP, n_samples = haplotype.shape

    # gaps are distance between SNPs in term of position in the snp matrix (not in bp)
    gaps_interval = (2**np.arange(0, np.log2(n_SNP), 1)).astype(int)  # log2 scales of intervals
    if gaps_type.lower() == "long":
        gaps_interval = np.unique(
            np.concatenate([
                gaps_interval,
                np.array(
                    list(n_SNP // 2 - gaps_interval[:len(gaps_interval) // 2])[::-1]).astype(int),
                np.array(list(n_SNP - gaps_interval)[::-1])
            ])).astype(int)
    else:
        if gaps_type.lower() != "short":
            # The value is passed as a logging argument; previously a stray
            # "f" inside the literal made the message print "f{gaps_type}".
            logging.warning(
                "gaps should be either `short` or `long`. Using short instead of %s",
                gaps_type)

    selected_snps = []
    for gi, gap in enumerate(gaps_interval):

        if circular:
            max_value = n_SNP
        else:
            max_value = n_SNP - gap

        if max_value < min_SNP_pairs:  # min_SNP_pairs : min number of SNP pairs to consider.
            # if not many possible pairs possible, just take them all directly,
            # instead of reaching that number after many more random trials
            snps = np.arange(0, n_SNP, gap)
            snp_pairs = np.unique([((snps[i] + i) % n_SNP, (snps[i + 1] + i) % n_SNP)
                                   for i in range(len(snps) - 1)], axis=0)
            snp_pairs = np.concatenate([(snp_pairs + i) % n_SNP
                                        for i in range(max_value)], axis=0)
        else:
            if not circular:
                snps = np.arange(0, n_SNP, gap) + np.random.randint(
                    0, (n_SNP - 1) % gap + 1
                )  # adding a random start (+1, bc 2nd bound in randint is exclusive)
                # non overlapping contiguous pairs
                # snps=[ 196, 1220, 2244] becomes
                # snp_pairs=[(196, 1220), (1221, 2245)]
                snp_pairs = np.unique([((snps[i] + i) % n_SNP, (snps[i + 1] + i) % n_SNP)
                                       for i in range(len(snps) - 1)], axis=0)

                # remove pairs that are over the edges
                snp_pairs = snp_pairs[snp_pairs[:, 0] < snp_pairs[:, 1]]
            else:
                snps = np.arange(0, n_SNP, gap) + np.random.randint(
                    0, (n_SNP - 1))  # adding a random start
                # non overlapping contiguous pairs
                # snps=[ 196, 1220, 2244] becomes
                # snp_pairs=[(196, 1220), (1221, 2245)]
                snp_pairs = np.unique([((snps[i] + i) % n_SNP, (snps[i + 1] + i) % n_SNP)
                                       for i in range(len(snps) - 1)], axis=0)

            last_pair = snp_pairs[-1]

            # If we don't have enough pairs (typically when gap is large),
            # keep adding randomly rotated copies of the last pair.
            while len(snp_pairs) < min(min_SNP_pairs, max_value):
                shift = np.random.randint(1, n_SNP) % n_SNP
                new_pair = (last_pair + shift) % n_SNP
                snp_pairs = np.unique(np.concatenate(
                    [snp_pairs, new_pair.reshape(1, 2)]), axis=0)
                last_pair = new_pair

                if not circular:
                    # drop pairs that wrapped around the end of the matrix
                    snp_pairs = snp_pairs[snp_pairs[:, 0] < snp_pairs[:, 1]]

        selected_snps.append(snp_pairs)

    ld = pd.DataFrame()
    for i, snps_pos in enumerate(selected_snps):

        if circular:
            pos_i = pos_vec[snps_pos]
            # NOTE(review): `% size_chr / 2` parses as `(x % size_chr) / 2`,
            # i.e. the shortest distance is halved. The original remark was
            # "%size_chr/2 because max distance btw 2 SNP is size_chr/2";
            # confirm which was meant.
            min_dist = np.array([
                min(np.diff(pi) % size_chr, np.diff(pi[::-1]) % size_chr)
                for pi in pos_i
            ]) % size_chr / 2
            sd = pd.DataFrame(min_dist, columns=["snp_dist"])
        else:
            sd = pd.DataFrame((np.diff(pos_vec[snps_pos])), columns=["snp_dist"])

        sd["dist_group"] = pd.cut(sd.snp_dist, bins=distance_bins)
        # Each element of haplotype[snps_pos] holds the two rows of one pair.
        sr = [allel.rogers_huff_r(snps)**2 for snps in haplotype[snps_pos]]
        sd["r2"] = sr
        sd["gap_id"] = i
        ld = pd.concat([ld, sd])

    ld2 = ld.dropna().groupby("dist_group").agg(mean_dist=('snp_dist', 'mean'),
                                                mean_r2=('r2', 'mean'),
                                                Count=('r2', 'count'),
                                                sem_r2=('r2', 'sem'))
    return ld2
# Heatmap of the per-population frequencies of the reported SNPs, saved as PDF.
fig = plt.figure(figsize=(2,16))
with PdfPages("%s/%s_%s.allele_fq.pdf" % (outdir,outcode,l_nom)) as pdf:
    # plot
    ax=sns.heatmap(fq_minor[is_report],vmin=0,vmax=0.5,cmap=sns.light_palette("darkslategray",n_colors=31),
                   yticklabels=oc_snpname_seg[is_report],linewidths=0.8,linecolor="white",annot=True)
    ax.set_title("ALT fq per pop %s" % l_nom)
    pdf.savefig(fig,bbox_inches='tight')

# Linkage disequilibrium
# linkage disequilibrium Rogers and Huff
print("LD Rogers & Huff...")
ld_rhr = allel.rogers_huff_r(oc_haploty_seg.compress(is_report).to_n_alt(fill=-1))
# Expand the pairwise values to a square matrix and blank the diagonal.
ld_rhr = squareform(ld_rhr)
np.fill_diagonal(ld_rhr,np.nan)

# plot
with PdfPages("%s/%s_%s.allele_ld_rhr.pdf" % (outdir,outcode,l_nom)) as pdf:
    fig = plt.figure(figsize=(16,14))
    ax=sns.heatmap(ld_rhr,vmin=-1,vmax=1,cmap=sns.diverging_palette(20,255,s=99,sep=15,l=45,n=31),
                   xticklabels=oc_snpname_seg[is_report],yticklabels=oc_snpname_seg[is_report],linewidths=0.2,linecolor="white",annot=True)
    ax.set_title("Rogers & Huff $r$ %s" % l_nom)
    pdf.savefig(fig,bbox_inches='tight')

# print table
ld_rhr_df = pd.DataFrame(ld_rhr)
ld_rhr_df.columns = oc_snpname_seg[is_report]
# Row labels live in `.index`; assigning to `.rows` only created an
# unrelated attribute and left the rows numbered 0..n-1.
ld_rhr_df.index = oc_snpname_seg[is_report]
################### LD decay ####################
plt.hist(pos, bins=100)[2]

# Genomic interval used for the LD comparison.
maskstart = 4.4e7
maskstop = 4.45e7

# get LD and pairwise distance for the sites inside the interval
np.random.seed(12345)
mask = np.logical_and(pos > maskstart, pos < maskstop)
pos2 = pos[mask]

# Keep the columns inside the interval, then the rows labelled "YRI".
dc2 = dc[:, mask]
dc2 = dc2[pred['pop'] == "YRI", :]
bingen2 = bingen[:, mask]
bingen2 = bingen2[pred['pop'] == 'YRI', :]

# calculate pairwise LD matrices (rogers_huff_r wants one row per site,
# hence the transpose), expanded to square form
LDr = spatial.distance.squareform(allel.rogers_huff_r(dc2.T))
LDg = spatial.distance.squareform(allel.rogers_huff_r(bingen2.T))

# get bp distances, in the same row-by-row order as the flattened matrices
dists = [x - y for x in pos2 for y in pos2]

# Flatten each matrix row by row and square it to obtain r^2.
LDr2 = np.array(np.concatenate(LDr), dtype="float64") ** 2
LDg2 = np.array(np.concatenate(LDg), dtype="float64") ** 2

#simulation LD
def calculate_r_2( genotypes ):
    """Return the square matrix of pairwise Rogers-Huff r^2 for ``genotypes``.

    A progress message with the number of SNPs is emitted through ``cerr``
    before the computation.
    """
    n_snps = len(genotypes)
    cerr(f'[I - calculating Rogers-Huff r^2 for {n_snps} SNPs ]')

    r_squared = allel.rogers_huff_r(genotypes) ** 2
    return scipy.spatial.distance.squareform(r_squared)