def generate_pca_from_snps(snps, sample_population=None, title="", pop_colors=None, plot_pca3=False):
    """Run a randomized PCA on ``snps`` and scatter-plot the leading components.

    Parameters
    ----------
    snps : array-like
        Matrix handed unchanged to ``allel.randomized_pca`` (no scaler).
        Its second axis (``snps.shape[1]``) is compared against
        ``sample_population``, i.e. one column per sample.
    sample_population : array-like, optional
        One entry per sample, forwarded to ``plot_pca_coords`` so the dots
        can be coloured. ``None`` skips the length check.
    title : str
        Figure super-title.
    pop_colors : optional
        Forwarded as-is to ``plot_pca_coords``.
    plot_pca3 : bool
        When true, draw two side-by-side panels (components 0 vs 1 and
        2 vs 3); otherwise a single panel with components 0 vs 1.

    Raises
    ------
    ValueError
        If ``sample_population`` is given and its length differs from the
        number of columns in ``snps``.
    """
    # Validate before the (optional-dependency) import and the PCA itself so
    # bad input fails fast. An explicit raise is used instead of ``assert``
    # because asserts are removed when Python runs with -O.
    if sample_population is not None:
        n_labels = sample_population.shape[0]
        n_samples = snps.shape[1]
        if n_labels != n_samples:
            raise ValueError(
                f"sample_population has {n_labels} entries but snps has "
                f"{n_samples} samples (columns)"
            )

    import allel

    coords, model = allel.randomized_pca(snps, scaler=None)

    # One (x, y) component-index pair per panel.
    if plot_pca3:
        panels = [(0, 1), (2, 3)]
        figsize = (10, 5)
    else:
        panels = [(0, 1)]
        figsize = (8, 5)

    fig = plt.figure(figsize=figsize)
    for position, (pc_x, pc_y) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, len(panels), position)
        plot_pca_coords(coords, model, pc_x, pc_y, ax, sample_population, pop_colors)

    # The legend is attached once, to the last (right-most) panel only.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title, y=1.05)
    fig.tight_layout()
def apply_pca(g, outfile, seed, n, s):
    """Project ``g`` with randomized PCA and write the coordinates to disk.

    Args:
        g: data matrix passed straight to ``allel.randomized_pca``.
        outfile: destination path for the coordinate table (``np.savetxt``).
        seed: value used as ``random_state`` of the randomized PCA.
        n: number of principal components to compute.
        s: scaler option forwarded to ``allel.randomized_pca``.
    """
    # The fitted model (second element of the result) is not needed here.
    projected, _model = allel.randomized_pca(
        g,
        n_components=n,
        scaler=s,
        random_state=seed,
    )
    logging.info(f"Applied PCA with {n} components with scaler {s} and seed {seed}")

    np.savetxt(outfile, projected)
    logging.info(f"Saved coordinates to {outfile}")
def _benchmark_pca(self, gt):
    """Benchmark the individual stages of a PCA workflow on ``gt``.

    Each timed stage is bracketed by ``self.benchmark_profiler.start_benchmark``
    / ``end_benchmark``. Timed stages: allele counting, counting multiallelic
    SNPs, counting biallelic singletons, filtering, conversion with
    ``to_n_alt()``, optional LD pruning, conventional PCA and randomized PCA.
    The random variant sub-sampling in between is not timed.

    Args:
        gt: genotype array providing ``count_alleles()``, ``take()`` /
            ``compress()`` and ``to_n_alt()``. For the Dask array type
            ``gt.values.chunksize`` is read to choose chunk sizes.

    Returns:
        None. Results are discarded; only the profiler records and the
        printed progress messages remain.

    Raises:
        SystemExit: with status 1 if a variant subset is requested and the
            configured genotype array type is none of the known kinds.
    """
    profiler = self.benchmark_profiler
    conf = self.bench_conf
    is_dask = conf.genotype_array_type == config.GENOTYPE_ARRAY_DASK

    # Count alleles at each variant
    profiler.start_benchmark('PCA: Count alleles')
    ac = gt.count_alleles()
    profiler.end_benchmark()

    # Count number of multiallelic SNPs (value is discarded, only timed)
    profiler.start_benchmark('PCA: Count multiallelic SNPs')
    if is_dask:
        da.count_nonzero(ac.max_allele() > 1).compute()
    else:
        np.count_nonzero(ac.max_allele() > 1)
    profiler.end_benchmark()

    # Count number of biallelic singletons (value is discarded, only timed)
    profiler.start_benchmark('PCA: Count biallelic singletons')
    if is_dask:
        da.count_nonzero((ac.max_allele() == 1) & ac.is_singleton(1)).compute()
    else:
        np.count_nonzero((ac.max_allele() == 1) & ac.is_singleton(1))
    profiler.end_benchmark()

    # Apply filtering to remove singletons and multiallelic SNPs
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    flt_count = np.count_nonzero(flt)
    profiler.start_benchmark('PCA: Remove singletons and multiallelic SNPs')
    if flt_count > 0:
        if is_dask:
            gf = gt.take(np.flatnonzero(flt), axis=0)
        else:
            gf = gt.compress(condition=flt, axis=0)
    else:
        # Filtering would drop every variant, so keep the data unfiltered
        print(
            '[Exec][PCA] Cannot remove singletons and multiallelic SNPs as no data would remain. Skipping...'
        )
        gf = gt
    profiler.end_benchmark()
    del ac, flt, flt_count

    # Transform genotype data into 2-dim matrix
    profiler.start_benchmark('PCA: Transform genotype data for PCA')
    gn = gf.to_n_alt()
    profiler.end_benchmark()
    del gf

    # Randomly choose subset of SNPs (a subset size of -1 means "use all")
    if conf.pca_subset_size == -1:
        print('[Exec][PCA] Including all ({}) variants for PCA.'.format(
            gn.shape[0]))
        gnr = gn
    else:
        n = min(gn.shape[0], conf.pca_subset_size)
        print('[Exec][PCA] Including {} random variants for PCA.'.format(n))
        vidx = np.random.choice(gn.shape[0], n, replace=False)
        vidx.sort()
        if conf.genotype_array_type in [
                config.GENOTYPE_ARRAY_NORMAL, config.GENOTYPE_ARRAY_CHUNKED
        ]:
            gnr = gn.take(vidx, axis=0)
        elif is_dask:
            # Use indexing workaround since Dask Array's take() method is
            # not working properly
            gnr = gn[vidx]
        else:
            print(
                '[Exec][PCA] Error: Unspecified genotype array type specified.'
            )
            # Raise SystemExit directly: the bare ``exit`` helper is injected
            # by the ``site`` module for interactive use and is not
            # guaranteed to exist in every runtime.
            raise SystemExit(1)
        del vidx

    if conf.pca_ld_enabled:
        if not is_dask:
            # Apply LD pruning to subset of SNPs
            profiler.start_benchmark('PCA: Apply LD pruning')
            gnu = self._pca_ld_prune(
                gnr,
                size=conf.pca_ld_pruning_size,
                step=conf.pca_ld_pruning_step,
                threshold=conf.pca_ld_pruning_threshold,
                n_iter=conf.pca_ld_pruning_number_iterations)
            profiler.end_benchmark()
        else:
            print(
                '[Exec][PCA] Cannot apply LD pruning because Dask genotype arrays do not support this operation.'
            )
            gnu = gnr
    else:
        print('[Exec][PCA] LD pruning disabled. Skipping this operation.')
        gnu = gnr

    # Run PCA analysis
    pca_num_components = conf.pca_number_components
    scaler = conf.pca_data_scaler
    scaler_label = scaler if scaler is not None else 'none'

    if is_dask:
        # Rechunk Dask array to work with Dask's svd function (single chunk
        # for transposed column)
        gnu_pca_conv = gnu.rechunk({0: -1, 1: gt.values.chunksize[1]})
    else:
        gnu_pca_conv = gnu

    # Run conventional PCA analysis
    profiler.start_benchmark(
        'PCA: Run conventional PCA analysis (scaler: {})'.format(scaler_label))
    coords, model = allel.pca(gnu_pca_conv,
                              n_components=pca_num_components,
                              scaler=scaler)
    if is_dask:
        # Force evaluation inside the timed section
        coords.compute()
    profiler.end_benchmark()
    del gnu_pca_conv, coords, model

    if is_dask:
        # Rechunk Dask array to match original genotype chunk size
        gnu_pca_rand = gnu.rechunk(
            (gt.values.chunksize[0], gt.values.chunksize[1]))
    else:
        gnu_pca_rand = gnu

    # Run randomized PCA analysis
    profiler.start_benchmark(
        'PCA: Run randomized PCA analysis (scaler: {})'.format(scaler_label))
    coords, model = allel.randomized_pca(gnu_pca_rand,
                                         n_components=pca_num_components,
                                         scaler=scaler)
    if is_dask:
        # Force evaluation inside the timed section
        coords.compute()
    profiler.end_benchmark()
    del gnu_pca_rand, coords, model
fig_pca(
    coords2allVars,
    model2allVars,
    'Conventional PCA without LD pruning',
    pops=ids['nest'],
    pcols=nest_cols,
    filename='pca_all.png',
)

# --- Conventional PCA with LD pruning, without Patterson's scaling ---
coords3vars, model3vars = al.pca(gnuVars, n_components=10, scaler=None)
# Variant coloured by population (disabled):
# fig_pca(coords3vars, model3vars, 'Conventional PCA LD-pruned variants without variance scaling', pops=ids['pops'], pcols=pop_cols)
# Variant coloured by nest:
fig_pca(
    coords3vars,
    model3vars,
    'Conventional PCA LD-pruned variants without variance scaling.',
    pops=ids['nest'],
    pcols=nest_cols,
    filename='pca_LDprune_noPatterson.png',
)

# --- Randomized PCA with LD pruning (Patterson's scaling) ---
coords5vars, model5vars = al.randomized_pca(
    gnuVars,
    n_components=10,
    scaler='patterson',
)
# Variant coloured by population (disabled):
# fig_pca(coords5vars, model5vars, 'Randomized PCA', pops=ids['pops'], pcols=pop_cols)
# Variant coloured by nest:
fig_pca(
    coords5vars,
    model5vars,
    'Randomized PCA LD-pruned variants',
    pops=ids['nest'],
    pcols=nest_cols,
    filename='pca_LDprune_rand.png',
)

# Heatmaps of the first five PCs, one output file per coordinate set.
for heat_coords, heat_file in (
    (coords1var, 'pca_LDprune_Heat.png'),
    (coords2allVars, 'pca_all_Heat.png'),
):
    plotHeatPCs(heat_coords, ids['nest'], PCs=5, filename=heat_file)

## get the Eigen values for PCAs
# for all (segreg.) vars
## pca without LD pruning for the random subset of 100k loci with Patterson's scaling coords2, model2 = al.pca(gnr, n_components=10, scaler='patterson') fig_pca(coords2, model2, 'Figure 5. Conventional PCA without LD pruning.', pops = ids['pops'], pcols= pop_colours) ## now for the full set (gtseg) with Patterson's scaling # NOTE: probably do not run this on your laptop coords2all, model2all = al.pca(nAltSub, n_components=10, scaler='patterson') fig_pca(coords2all, model2all, 'Conventional PCA without LD pruning.', pops = ids['pops'], pcols= pop_colours) ## pca + LD-pruning, without Patterson's scaling coords3, model3 = al.pca(gnu, n_components=10, scaler=None) fig_pca(coords3, model3, 'Figure 6. Conventional PCA without variance scaling.', pops = ids['pops'], pcols= pop_colours) ## randomized PCA with LD-pruning and Patterson's scaling coords5, model5 = al.randomized_pca(gnu, n_components=10, scaler='patterson') fig_pca(coords5, model5, 'Figure 8. Randomized PCA.', pops = ids['pops'], pcols= pop_colours) ## pca with even sample sizes NOTE: not really needed here, see alimanfoo's Fast PCA site # (https://alimanfoo.github.io/2015/09/28/fast-pca.html) # also: see alimanfoo's Fast-PCA post on an evaluation of the lower PCs in randomized PCA ## plot a heatmap where the color bar basically represents the correlation between the individuals and the respective principal component. # NOTE: Comparisons across PCs are somewhat meaningless here, as the color only shows strength of correlation within a PC, not across PCs. def plotHeatPCs(coords, ids, PCs=4): df = pd.DataFrame(coords[:,0:PCs].T, columns=ids, index= range(1,PCs+1)) plt.subplots(figsize= (20,5))