def pca_fx(gnu, meta, nchr, pop2color, pcAll, population, bykary=False):
    """Run a Patterson-scaled PCA on ``gnu`` and plot the results.

    Parameters
    ----------
    gnu :
        Genotype object transformed with ``.to_n_alt()``. ``gnu.shape[0]``
        is reported as the variant count in the plot title.
    meta :
        Sample metadata exposing a ``Population`` column and, when
        ``bykary`` is true, a ``ChromForm`` column.
    nchr :
        Chromosome label, used only in the plot title.
    pop2color : dict
        Mapping from group label to colour, forwarded to ``fig_pca``.
        NOTE: when ``bykary`` is true this dict is modified in place
        (entries for 'Kiribina' and 'Folonzo' are added/overwritten).
    pcAll :
        If truthy, plot the component pairs (0,1), (2,3), ..., (8,9);
        otherwise plot only components 0 and 1.
    population :
        The string ``'All'`` to use every sample, or a list-like of
        population names passed to ``meta.Population.isin``.
    bykary : bool
        If true, label samples by ``meta.ChromForm`` instead of population.

    Returns
    -------
    tuple
        ``(coords, model)`` exactly as returned by ``allel.pca``.
    """
    # PCA
    coords, model = allel.pca(gnu, n_components=10, scaler='patterson')
    # Alternative kept from the original author:
    # coords, model = allel.randomized_pca(gnu, n_components=10,
    #                                      scaler='patterson')
    title = "PCA Chr:{}, var:{}".format(nchr, gnu.shape[0])

    # Compare by value: `is 'All'` only worked by accident of string
    # interning. The isinstance guard keeps array/list arguments from being
    # compared element-wise.
    if isinstance(population, str) and population == 'All':
        samples = meta.Population.values
    else:
        samples = meta.Population[meta.Population.isin(population)].values

    if bykary:
        s = meta.Population[meta.Population.isin(samples)].index.tolist()
        samples = meta.ChromForm[s].values
        pop2color['Kiribina'] = '#FF0000'
        pop2color['Folonzo'] = '#008000'

    if pcAll:
        # Ten components were requested above, i.e. five consecutive pairs.
        for i in range(0, 9, 2):
            fig_pca(coords, model, title, samples, pop2color, i, i + 1)
    else:
        fig_pca(coords, model, title, samples, pop2color, 0, 1)
    hist_var(model, title)
    return (coords, model)
def run_pca(inversion, vtbl, genotypes, variance_threshold=0.15, min_count=3,
            whole_inversion=True, buffer=0, samples_bool=None,
            inversionDict=inversionDict):
    """Filter genotypes to the sites of one inversion and run a PCA on them.

    The site filter is built by ``construct_filter_expression`` and
    evaluated against ``vtbl`` with ``vtbl.eval``; the resulting boolean is
    handed, together with ``samples_bool``, ``min_count`` and
    ``variance_threshold``, to ``filter_and_convert_genotypes``. The first
    element of that helper's two-item result is passed to ``allel.pca`` with
    default settings.

    Note that the ``inversionDict`` default is bound to the module-level
    ``inversionDict`` at definition time.

    Returns
    -------
    tuple
        ``(coords, model)`` from ``allel.pca``.
    """
    site_expression = construct_filter_expression(
        inversion,
        inversionDict,
        whole_inversion=whole_inversion,
        buffer=buffer,
    )
    site_mask = vtbl.eval(site_expression)

    n_alt, _discarded = filter_and_convert_genotypes(
        genotypes,
        sites_boolean=site_mask,
        samples_boolean=samples_bool,
        min_count=min_count,
        variance_threshold=variance_threshold,
    )

    pca_coords, pca_model = allel.pca(n_alt)
    return pca_coords, pca_model
def pca(geno, chrom, ploidy, dataset, populations, samples, pop_colours,
        prune=True, scaler=None):
    """Run a 10-component PCA for one chromosome/dataset and plot it.

    When ``ploidy`` is greater than 1 the genotypes are first converted with
    ``geno.to_n_alt()``. LD pruning (``ld_prune`` with size=500, step=200,
    threshold=0.2) is applied only when ``prune`` is exactly ``True``; other
    truthy values do not trigger it.

    The figure is produced by ``fig_pca``, which receives the title and an
    output path prefix built from ``chrom`` and ``dataset``. Nothing is
    returned.
    """
    # The alt-allele conversion is needed on both the pruned and unpruned path.
    if ploidy > 1:
        geno = geno.to_n_alt()
    if prune is True:
        geno = ld_prune(geno, size=500, step=200, threshold=0.2)

    coords, model = allel.pca(geno, n_components=10, scaler=scaler)

    plot_title = f"PCA {chrom} {dataset}"
    out_prefix = f"results/variantAnalysis/pca/PCA-{chrom}-{dataset}"
    fig_pca(coords, model, plot_title, out_prefix, samples, pop_colours,
            sample_population=populations)
def runPCA(genotypes, **kwargs):
    """Compare focal samples' PCA distance from the centroid to a reference group.

    A two-component ``allel.pca`` is run on ``genotypes``. For every sample
    the Euclidean distance between its (PC1, PC2) coordinates and the mean
    coordinate of all samples is computed.

    Required keyword arguments
    --------------------------
    pop_1 :
        Row indexer (as accepted by ``DataFrame.loc``) for the reference
        group; the mean of its distances is the denominator.
    pop_2 :
        Row indexer for the focal sample(s); their distances are the
        numerator.

    Returns
    -------
    The focal distance(s) divided by the reference group's mean distance.
    A ``KeyError`` is raised if either keyword argument is missing.
    """
    coords = allel.pca(genotypes, n_components=2)[0]
    centre = np.mean(coords, axis=0)
    frame = pd.DataFrame(coords, columns=['x', 'y'])

    def _distance_to_centre(row):
        return distance.euclidean([row['x'], row['y']], centre)

    frame['dst'] = frame.apply(_distance_to_centre, axis=1)

    reference_idx = kwargs['pop_1']
    focal_idx = kwargs['pop_2']

    reference_mean = np.mean(frame.loc[reference_idx, 'dst'])
    focal_distances = frame.loc[focal_idx, 'dst']
    return focal_distances / reference_mean
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """Main entry point for the PCA visualisation.

    Genotypes come from ``prepData`` (the second item it returns is not
    used here) and sample metadata from ``fp.retrieveMetaData``. A
    10-component Patterson-scaled PCA is plotted with ``fig_pca``, with
    samples labelled by the metadata column named ``column``. The current
    matplotlib figure is then written to
    ``<directory>graphics/<outfn>_pca.jpg``. ``directory`` is concatenated
    as-is, so it is expected to end with a path separator.

    Nothing is returned.
    """
    gn, _ = prepData(directory, outfn, newVCF, samples, bs)

    # Sample metadata supplies the per-sample labels for the plot.
    metadata = fp.retrieveMetaData(samples, directory, outfn)

    coords, model = allel.pca(gn, n_components=10, scaler='patterson')
    fig_pca(coords, model, 'Conventional PCA.',
            sample_population=metadata[column])

    figure_path = directory + "graphics/" + outfn + "_pca.jpg"
    plt.savefig(figure_path)
def sim_load_h5_to_PCA(h5_path):
    """Load genotypes from an HDF5 file, filter and LD-prune them, fit a PCA.

    Steps (following http://alimanfoo.github.io/2015/09/28/fast-pca.html):

    1. read ``calldata/GT`` from the file at ``h5_path``;
    2. keep only biallelic variants whose reference and alternate allele
       counts are both greater than 1 (drops singletons and multiallelic
       SNPs, which are not informative for PCA);
    3. convert to a 2-D matrix of alternate-allele counts per call;
    4. LD-prune (size=500, step=200, threshold=0.1, 3 iterations);
    5. fit a 2-component Patterson-scaled PCA.

    The coordinates are also written as comma-separated text to
    ``data_s//tgp_pca2.txt`` (relative to the working directory).

    Parameters
    ----------
    h5_path :
        Path of the HDF5 call set.

    Returns
    -------
    The PCA coordinates returned by ``allel.pca``.
    """

    def ld_prune(gn, size, step, threshold=.1, n_iter=1):
        # Each SNP is a feature and SNPs tend to be correlated, so
        # repeatedly drop variants flagged as linked. This takes a while.
        for i in range(n_iter):
            loc_unlinked = allel.locate_unlinked(gn, size=size, step=step,
                                                 threshold=threshold)
            n = np.count_nonzero(loc_unlinked)
            n_remove = gn.shape[0] - n
            print('iteration', i + 1, 'retaining', n, 'removing', n_remove,
                  'variants')
            gn = gn.compress(loc_unlinked, axis=0)
        return gn

    k = 2
    # Keep the file open for the whole genotype pipeline and close it
    # afterwards, including on error (the handle used to be leaked).
    with h5py.File(h5_path, mode='r') as callset:
        g = allel.GenotypeChunkedArray(callset['calldata/GT'])
        ac = g.count_alleles()[:]

        # Biallelic, and neither allele is a singleton.
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        gf = g.compress(flt, axis=0)

        # Number of non-reference alleles per call.
        gn = gf.to_n_alt()

        # More than 3 iterations removes almost nothing further.
        gnu = ld_prune(gn, size=500, step=200, threshold=.1, n_iter=3)

        coords1, model1 = allel.pca(gnu, n_components=k, scaler='patterson')

    np.savetxt('data_s//tgp_pca' + str(k) + '.txt', coords1, delimiter=',')
    return coords1
def pca(genotypes_012, subpops):
    """Carry out LD pruning and a Patterson PCA of the genotypes.

    The input is first passed through ``sim.utils.monomorphic_012_filter``
    and then ``sim.utils.ld_prune`` before a two-component PCA is fitted.

    :param genotypes_012: genotype matrix in 012 (scikit-allel alt_n) format
    :param subpops: dictionary with the keys "domestic", "wild" and
        "captive", each holding the row indexes of that subpopulation
    :returns: pd.DataFrame with columns ``pc1``, ``pc2`` and ``population``;
        rows not covered by any subpopulation keep an empty-string label
    """
    filtered = sim.utils.monomorphic_012_filter(genotypes_012)
    pruned = sim.utils.ld_prune(filtered)
    coords, _ = allel.pca(pruned, n_components=2, scaler='patterson')

    first_pc = coords[:, 0]
    second_pc = coords[:, 1]
    pca_data = pd.DataFrame({
        "pc1": first_pc,
        "pc2": second_pc,
        "population": "",
    })

    for label in ("domestic", "wild", "captive"):
        pca_data.loc[subpops[label], "population"] = label
    return pca_data
def pca(directory, outfn, column, newVCF=False, samples=None, bs=20000):
    """Main entry point for the PCA visualisation, reading straight from VCF.

    Reads ``<directory><outfn>.vcf`` with ``allel.read_vcf``, converts the
    genotype calls with ``transform``, runs a 10-component Patterson-scaled
    PCA and hands the result to ``fig_pca``.

    Parameters
    ----------
    directory : str
        Directory prefix; concatenated as-is, so it is expected to end with
        a path separator.
    outfn : str
        Base name of the VCF file (without the ``.vcf`` suffix).
    column :
        Metadata column used to label samples in the plot.
    newVCF, bs :
        Accepted for signature compatibility; not used by this
        implementation.
    samples :
        Forwarded to ``fp.retrieveMetaData``.

    Raises
    ------
    ValueError
        If ``allel.read_vcf`` returns no data for the VCF file.
    """
    vcf_path = directory + outfn + ".vcf"
    callset = allel.read_vcf(vcf_path)
    if callset is None:
        # read_vcf yields None when nothing could be read; fail with a clear
        # message instead of a TypeError on the subscript below.
        raise ValueError("no variants could be read from " + vcf_path)

    g = allel.GenotypeChunkedArray(callset['calldata/GT'])
    gn = transform(g)

    # Sample metadata supplies the per-sample labels for the plot.
    df = fp.retrieveMetaData(samples, directory, outfn)

    coords1, model1 = allel.pca(gn, n_components=10, scaler='patterson')
    fig_pca(directory, outfn, coords1, model1, 'Conventional PCA.',
            sample_population=df[column])
def _benchmark_pca(self, gt):
    """Run the PCA workflow on ``gt`` and time each stage.

    Every stage is bracketed by ``self.benchmark_profiler.start_benchmark``
    and ``end_benchmark``. The stages are: count alleles, count multiallelic
    SNPs, count biallelic singletons, remove singletons/multiallelic SNPs,
    convert to alt-allele counts, optional random subsetting, optional LD
    pruning, conventional PCA and randomized PCA. Behaviour is driven by
    ``self.bench_conf`` (array type, subset size, LD-pruning settings,
    number of components, scaler).

    Intermediate results are deleted as soon as they are no longer needed
    and nothing is returned.

    ``gt`` is presumably a scikit-allel genotype array (normal, chunked or
    Dask-backed, matching ``bench_conf.genotype_array_type``) -- TODO
    confirm against the caller. For the Dask case ``gt.values.chunksize`` is
    read.

    NOTE: an unrecognised array type in the subsetting step prints an error
    and terminates the process via ``exit(1)``.
    """
    # Count alleles at each variant
    self.benchmark_profiler.start_benchmark('PCA: Count alleles')
    ac = gt.count_alleles()
    self.benchmark_profiler.end_benchmark()

    # Count number of multiallelic SNPs
    self.benchmark_profiler.start_benchmark('PCA: Count multiallelic SNPs')
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        num_multiallelic_snps = da.count_nonzero(
            ac.max_allele() > 1).compute()
    else:
        num_multiallelic_snps = np.count_nonzero(ac.max_allele() > 1)
    self.benchmark_profiler.end_benchmark()
    # The count itself is not used; only the timing matters.
    del num_multiallelic_snps

    # Count number of biallelic singletons
    self.benchmark_profiler.start_benchmark(
        'PCA: Count biallelic singletons')
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        num_biallelic_singletons = da.count_nonzero(
            (ac.max_allele() == 1) & ac.is_singleton(1)).compute()
    else:
        num_biallelic_singletons = np.count_nonzero((ac.max_allele() == 1)
                                                    & ac.is_singleton(1))
    self.benchmark_profiler.end_benchmark()
    # As above, the value is discarded.
    del num_biallelic_singletons

    # Apply filtering to remove singletons and multiallelic SNPs.
    # The mask is built before the timer starts, so only the removal itself
    # is inside the timed region.
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    flt_count = np.count_nonzero(flt)
    self.benchmark_profiler.start_benchmark(
        'PCA: Remove singletons and multiallelic SNPs')
    if flt_count > 0:
        if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            gf = gt.take(np.flatnonzero(flt), axis=0)
        else:
            gf = gt.compress(condition=flt, axis=0)
    else:
        # Don't apply filtering
        print(
            '[Exec][PCA] Cannot remove singletons and multiallelic SNPs as no data would remain. Skipping...'
        )
        gf = gt
    self.benchmark_profiler.end_benchmark()
    del ac, flt, flt_count

    # Transform genotype data into 2-dim matrix
    self.benchmark_profiler.start_benchmark(
        'PCA: Transform genotype data for PCA')
    gn = gf.to_n_alt()
    self.benchmark_profiler.end_benchmark()
    del gf

    # Randomly choose subset of SNPs (a subset size of -1 means "use all").
    # This step is not timed.
    if self.bench_conf.pca_subset_size == -1:
        print('[Exec][PCA] Including all ({}) variants for PCA.'.format(
            gn.shape[0]))
        gnr = gn
    else:
        # Never request more variants than are available.
        n = min(gn.shape[0], self.bench_conf.pca_subset_size)
        print(
            '[Exec][PCA] Including {} random variants for PCA.'.format(n))
        vidx = np.random.choice(gn.shape[0], n, replace=False)
        # Sort so that the selected variants keep their original order.
        vidx.sort()
        if self.bench_conf.genotype_array_type in [
                config.GENOTYPE_ARRAY_NORMAL, config.GENOTYPE_ARRAY_CHUNKED
        ]:
            gnr = gn.take(vidx, axis=0)
        elif self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
            gnr = gn[
                vidx]  # Use indexing workaround since Dask Array's take() method is not working properly
        else:
            print(
                '[Exec][PCA] Error: Unspecified genotype array type specified.'
            )
            exit(1)
        del vidx

    if self.bench_conf.pca_ld_enabled:
        if self.bench_conf.genotype_array_type != config.GENOTYPE_ARRAY_DASK:
            # Apply LD pruning to subset of SNPs
            size = self.bench_conf.pca_ld_pruning_size
            step = self.bench_conf.pca_ld_pruning_step
            threshold = self.bench_conf.pca_ld_pruning_threshold
            n_iter = self.bench_conf.pca_ld_pruning_number_iterations
            self.benchmark_profiler.start_benchmark(
                'PCA: Apply LD pruning')
            gnu = self._pca_ld_prune(gnr,
                                     size=size,
                                     step=step,
                                     threshold=threshold,
                                     n_iter=n_iter)
            self.benchmark_profiler.end_benchmark()
        else:
            print(
                '[Exec][PCA] Cannot apply LD pruning because Dask genotype arrays do not support this operation.'
            )
            gnu = gnr
    else:
        print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
        gnu = gnr

    # Run PCA analysis
    pca_num_components = self.bench_conf.pca_number_components
    scaler = self.bench_conf.pca_data_scaler

    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Rechunk Dask array to work with Dask's svd function (single chunk for transposed column)
        gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
    else:
        gnu_pca_conv = gnu

    # Run conventional PCA analysis
    self.benchmark_profiler.start_benchmark(
        'PCA: Run conventional PCA analysis (scaler: {})'.format(
            scaler if scaler is not None else 'none'))
    coords, model = allel.pca(gnu_pca_conv,
                              n_components=pca_num_components,
                              scaler=scaler)
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Computed explicitly inside the timed region; presumably the Dask
        # result is lazy otherwise -- TODO confirm.
        coords.compute()
    self.benchmark_profiler.end_benchmark()
    del gnu_pca_conv, coords, model

    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Rechunk Dask array to match original genotype chunk size
        gnu_pca_rand = gnu.rechunk(
            (gt.values.chunksize[0], gt.values.chunksize[1]))
    else:
        gnu_pca_rand = gnu

    # Run randomized PCA analysis
    self.benchmark_profiler.start_benchmark(
        'PCA: Run randomized PCA analysis (scaler: {})'.format(
            scaler if scaler is not None else 'none'))
    coords, model = allel.randomized_pca(gnu_pca_rand,
                                         n_components=pca_num_components,
                                         scaler=scaler)
    if self.bench_conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK:
        # Same explicit computation as for the conventional PCA above.
        coords.compute()
    self.benchmark_profiler.end_benchmark()
    del gnu_pca_rand, coords, model
engine = stdpopsim.get_engine('msprime') sim = engine.simulate(model, new_contig, simsamples, seed=12345) sim_gen = allel.HaplotypeArray(sim.genotype_matrix()).to_genotypes(ploidy=2) sim_pos = np.array([s.position for s in sim.sites()], dtype="int32") m2 = np.isin(sim_pos, keep) sim_gen = sim_gen[m2, :, :] sim_pos = sim_pos[m2] # sim_gen=sim_gen[sim_pos<3.8e7,:,:] # sim_pos=sim_pos[sim_pos<3.8e7] sim_dc_all, sim_dc, sim_ac_all, sim_ac, sim_pos = filter_genotypes( sim_gen, sim_pos) ##################### comparing PCA of real vs generated genotypes ####################### realpca = allel.pca( np.transpose(dc) * 2, scaler=None, n_components=2 ) #*2 here to rescale real genotypes back to 0/1/2 to match binomial(2,...) used to bin genotypes. genpca = allel.pca(np.transpose(bingen), scaler=None, n_components=2) simpca = allel.pca(sim_dc, scaler=None, n_components=6) sampledata = pd.read_csv("data/1kg/sample_metadata.txt", sep="\t") df = pd.DataFrame(np.hstack((realpca[0], genpca[0]))) df.columns = ['realPC1', 'realPC2', 'genPC1', 'genPC2'] df['sampleID'] = samples df = df.merge(sampledata, on='sampleID') df.to_csv('out/1kg/1kg_decoder_PCA.csv', sep=",", index=False) simdf = pd.DataFrame(simpca[0]) simdf.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6'] simdf['pop'] = np.concatenate( [np.repeat("YRI", 50), np.repeat("CEU", 50),
def plot(self, pcs=[1, 2], ax=None, cmap=None, cdict=None, legend=True, title=None, outfile=None):
    """
    Do the PCA and plot it.

    Parameters
    ---------
    pcs: list of ints
        The two principal components to plot, 1-indexed (default PC1 vs
        PC2). Both must lie between 1 and the number of components. This
        default list is never mutated.
    ax: matplotlib axis
        Axis to draw on. If omitted, a new 6x5 inch figure with a single
        axis is created.
    cmap: matplotlib colormap
        A colormap, or the name of one. Population colors are sampled
        evenly from it. Takes precedence over `cdict`. If neither `cmap`
        nor `cdict` is given, 'Spectral' is used.
    cdict: dictionary mapping pop names to colors
        Used only when no `cmap` is given.
    legend: boolean, whether or not to show the legend
    title: str
        Optional axis title.
    outfile: str
        Optional path; if given, the current figure is saved there as PNG.
        A failure to save is reported on stdout, not raised.

    Returns
    -------
    The matplotlib axis that was drawn on.

    Raises
    ------
    IPyradError
        If either requested PC is out of range, or `cmap` is a name that
        cannot be resolved.

    Side effects: `self.ncomponents` is reduced to the number of samples
    if it exceeds it, and `self.pcs` is set to a DataFrame of the PCA
    coordinates indexed by sample.
    """
    ## The number of components cannot exceed the number of samples. Clamp
    ## first so that the range check below uses the final value.
    n_samples = len(self.samples_vcforder)
    if self.ncomponents > n_samples:
        self.ncomponents = n_samples
        print(" INFO: # PCs < # samples. Forcing # PCs = {}".format(self.ncomponents))

    ## Specify which 2 pcs to plot, default is pc1 and pc2
    pc1 = pcs[0] - 1
    pc2 = pcs[1] - 1
    ## Check BOTH indices against BOTH bounds: a negative index would
    ## silently wrap around and a too-large one would fail much later.
    max_idx = self.ncomponents - 1
    if not (0 <= pc1 <= max_idx and 0 <= pc2 <= max_idx):
        raise IPyradError("PCs are 1-indexed. 1 is min & {} is max".format(self.ncomponents))

    ## Convert genotype data to allele count data
    ## We do this here because we might want to try different ways
    ## of accounting for missing data and "alt" allele counts treat
    ## missing data as "ref"
    allele_counts = self.genotypes.to_n_alt()

    ## Actually do the pca
    coords, model = allel.pca(allele_counts, n_components=self.ncomponents, scaler='patterson')

    self.pcs = pd.DataFrame(coords,
                            index=self.samples_vcforder,
                            columns=["PC{}".format(x) for x in range(1, self.ncomponents + 1)])

    ## Just allow folks to pass in the name of the cmap they want to use
    if isinstance(cmap, str):
        try:
            cmap = cm.get_cmap(cmap)
        except Exception:
            ## Not a bare except: Ctrl-C and SystemExit must still propagate.
            raise IPyradError(" Bad cmap value: {}".format(cmap))

    if not cmap and not cdict:
        if not self.quiet:
            print(" Using default cmap: Spectral")
        cmap = cm.get_cmap('Spectral')

    if cmap:
        if cdict:
            print(" Passing in both cmap and cdict defaults to using the cmap value.")
        popcolors = cmap(np.arange(len(self.pops)) / len(self.pops))
        cdict = {i: j for i, j in zip(self.pops.keys(), popcolors)}

    ## Only a figure created here gets tight_layout applied below.
    fig = None
    if not ax:
        fig = plt.figure(figsize=(6, 5))
        ax = fig.add_subplot(1, 1, 1)

    x = coords[:, pc1]
    y = coords[:, pc2]
    for pop in self.pops:
        ## Don't include pops with no samples, it makes the legend look stupid
        ## TODO: This doesn't prevent empty pops from showing up in the
        ## legend for some reason.
        if len(self.pops[pop]) > 0:
            mask = np.isin(self.samples_vcforder, self.pops[pop])
            ax.plot(x[mask], y[mask], marker='o', linestyle=' ', color=cdict[pop], label=pop, markersize=6, mec='k', mew=.5)

    ax.set_xlabel('PC%s (%.1f%%)' % (pc1 + 1, model.explained_variance_ratio_[pc1] * 100))
    ax.set_ylabel('PC%s (%.1f%%)' % (pc2 + 1, model.explained_variance_ratio_[pc2] * 100))

    if legend:
        ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

    if fig is not None:
        fig.tight_layout()

    if title:
        ax.set_title(title)

    if outfile:
        ## Saving is best-effort: report the failure but still return the axis.
        try:
            plt.savefig(outfile, format="png", bbox_inches="tight")
        except Exception:
            print(" Saving pca.plot() failed to save figure to {}".format(outfile))

    return ax
'A6': sns.color_palette()[2], 'N1': sns.color_palette()[3], 'N4': sns.color_palette()[4], 'N6': sns.color_palette()[5], 'S1': sns.color_palette()[6], 'S2': sns.color_palette()[7], 'S5': sns.color_palette()[8] } ############ # nests # PCA using SVD - LD-pruned data (59544 loci) coords1var, model1var = al.pca(gnuVars, n_components=10, scaler='patterson') fig_pca(coords1var, model1var, 'LD-pruned PCA', pops = ids['nest'], pcols= nest_cols, filename= 'pca_LDprune.png') # which one is the outlier in the LD-pruned PCA? ##np.where(coords1var[:,0] > 200) ##ids.iloc[59] # 101a_S1 ###### # pca without LD pruning (random subset of 100000 loci) coords2var, model2var = al.pca(gnrVars, n_components=10, scaler='patterson') # pops #fig_pca(coords2var, model2var, 'Conventional PCA', pops = ids['pops'], pcols= pop_cols) # nests
subsample_nodes = [a for b in subsample_nodes for a in b] #flatten the list subsample_nodes = np.sort(np.array(subsample_nodes)) ts = ts.simplify(subsample_nodes) ts = msp.mutate(ts, args.mu) #get haplotypes and locations haps = ts.genotype_matrix() sample_inds = np.unique([ts.node(j).individual for j in ts.samples()]) locs = [[ts.individual(x).location[0], ts.individual(x).location[1]] for x in sample_inds] #run a PCA genotype_counts = allel.HaplotypeArray(haps).to_genotypes( ploidy=2).to_allele_counts() pca = allel.pca(genotype_counts[:, :, 0]) pcfile = open(os.path.join(args.outdir, simname) + ".pca", "w") for i in range(args.nSamples): pcfile.write("msp_" + str(i) + " " + "msp_" + str(i) + " ") for j in range(10): pcfile.write(str(pca[0][i][j]) + " ") pcfile.write("\n") pcfile.close() #write to VCF with open(os.path.join(args.outdir, simname) + ".vcf", "w") as vcf_file: ts.write_vcf(vcf_file, 2) #convert vcf to .ped (throwing error for opening temp files when run from command line on mac... switch to manual ped file creation?) sp.check_output([ args.vcftools_path, "--vcf",
) #encoder.predict() returns [mean,sd,sample] for normal distributions describing sample locations in latent space, so [0] is fixed but [2] is stochastic given a set of weights. #binning with binomial draws def binomialBinGenotypes(pgen): out = np.copy(pgen) for i in range(out.shape[0]): out[i, :] = np.random.binomial(2, out[i, :]) return out bingen = binomialBinGenotypes(pgen) #comparing PCA of real vs generated genotypes realpca = allel.pca( np.transpose(dc) * 2, scaler="Patterson", n_components=2 ) #*2 here to rescale genotypes to 0/1/2 to match binomial(2,...) used to bin genotypes. #genpca=allel.pca(np.transpose(bingen),scaler=None,n_components=2)[0] #run a separate PCA genpca = realpca[1].transform(np.transpose( bingen)) #project generated coordinates into the "real" PC space sampledata = pd.read_csv("data/hgdp/hgdp_sample_data.txt", sep="\t") df = pd.DataFrame(np.hstack((realpca[0], genpca))) df.columns = ['realPC1', 'realPC2', 'genPC1', 'genPC2'] df['sampleID'] = samples df = df.merge(sampledata, on='sampleID') df.to_csv('pca_decoder_test.csv', sep=",", index=False) fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2) fig.set_figwidth(6) fig.set_figheight(2.75) ax1.scatter(df['realPC1'], df['realPC2'], c=pd.factorize(df['region'])[0])
n = np.count_nonzero(loc_unlinked) n_remove = gn.shape[0] - n print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants') gn = gn.compress(loc_unlinked, axis=0) return gn gnu = ld_prune(nAltSub, size=200, step=50, threshold=.1, n_iter=5) plot_ld(gnu[:1000], 'Figure 3. Pairwise LD after LD pruning.') ############### ## PCA using Singular Value Decomposition (SVD) and Patterson's scaling on LD-pruned data (see gnu.shape for dimensions) coords1, model1 = al.pca(gnu, n_components=10, scaler='patterson') populations = ids['pops'].unique() pop_colours = { 'A': sns.color_palette()[0], 'N': sns.color_palette()[1], 'S': sns.color_palette()[2], } def plot_pca_coords(coords, model, pc1, pc2, ax, pops, pcols): sns.despine(ax=ax, offset=5) x = coords[:, pc1] y = coords[:, pc2] for pop in pops.unique():
p['sd2'] = pred[1][:, 1] pred = p else: pred = pd.DataFrame(pred[0]) pred.columns = ['LD' + str(x + 1) for x in range(len(pred.columns))] pred['sampleID'] = samples pred.to_csv(out + '_latent_coords.txt', sep='\t', index=False) if not save_weights: subprocess.check_output(['rm', out + "_weights.hdf5"]) if PCA: pcdata = np.transpose(dc) t1 = time.time() print("running PCA") pca = allel.pca(pcdata, scaler=PCA_scaler, n_components=n_pc_axes) pca = pd.DataFrame(pca[0]) colnames = ['PC' + str(x + 1) for x in range(n_pc_axes)] pca.columns = colnames pca['sampleID'] = samples pca.to_csv(out + "_pca.txt", index=False, sep="\t") t2 = time.time() pcatime = t2 - t1 print("PCA run time: " + str(pcatime) + " seconds") ######### plots ######### #training history #plt.switch_backend('agg') fig = plt.figure(figsize=(3, 1.5), dpi=200) plt.rcParams.update({'font.size': 7}) ax1 = fig.add_axes([0, 0, 1, 1])
sorted(muta.keys()) # %% muta['calldata/GT'].shape[0] # %% gt = muta['calldata/GT'] gt = allel.GenotypeArray(gt) len(gt) # %% gn = gt.to_n_alt() gn # %% coords1, model1 = allel.pca(gn, n_components=10, scaler=None) # %% df_samples = pandas.read_csv('LL_pop.txt', delimiter='\t', header=None) df_samples.head() # %% populations = df_samples.iloc[:, 1].unique() len(populations) # %% pop_colours = { 'French.alps': '#FF0000', 'E.Greenland': '#008000', 'Iceland': '#00FFFF', 'W.Greenland': '#90EE90',
# Write the sample locations next to the other outputs of this simulation.
np.savetxt(os.path.join(args.outdir, simname) + "_locs.txt", locs)

# Run a PCA on alternate-allele counts derived from the haplotypes.
genotype_counts = allel.HaplotypeArray(haps).to_genotypes(ploidy=2).to_allele_counts()

# Number of principal components to compute and write out.
# TODO: add a command-line argument for this; default is 10.
n_pcs = 10


# LD pruning function
def ld_prune(gn, size, step, threshold=.1, n_iter=1):
    """Repeatedly drop variants flagged as linked by ``allel.locate_unlinked``.

    Via http://alimanfoo.github.io/2015/09/28/fast-pca.html. Prints how many
    variants are retained and removed on each iteration and returns the
    pruned array.
    """
    for i in range(n_iter):
        loc_unlinked = allel.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
    return gn


# Index 1 on the last axis selects the count of allele 1 for each call.
genotype_counts_pruned = ld_prune(genotype_counts[:, :, 1], 200, 100, .1, 1)
pca = allel.pca(genotype_counts_pruned, n_components=n_pcs)

# Write out the proportion of variance explained by the PCs.
varexp = pca[1].explained_variance_ratio_
np.savetxt(os.path.join(args.outdir, simname) + ".pca_var_explained", varexp)

# One line per sample: the sample id twice, then the PC coordinates, each
# field followed by a single space. The context manager guarantees the file
# is closed even if a write fails.
with open(os.path.join(args.outdir, simname) + ".pca", "w") as pcfile:
    for i in range(args.nSamples):
        sample_id = "msp_" + str(i)
        pc_fields = "".join(str(pca[0][i][j]) + " " for j in range(n_pcs))
        pcfile.write(sample_id + " " + sample_id + " " + pc_fields + "\n")

# Write to VCF
with open(os.path.join(args.outdir, simname) + ".vcf", "w") as vcf_file:
    ts.write_vcf(vcf_file, 2)