def subcluster_3_cluster_QC_DE(adata, res, tit):
    # 1) clustering
    leiden_tit = "leiden_res_" + str(res)
    pg.leiden(adata, rep="pca_harmony", resolution=res, class_label=leiden_tit)

    # 2) QC plots
    sc.set_figure_params(figsize=(10, 10), fontsize=30)
    sc.pl.umap(adata, color=[leiden_tit, "method"], size=35, show=False)
    plt.suptitle(tit, fontsize=50)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
    sc.set_figure_params(figsize=(16, 12), fontsize=30)
    sc.pl.umap(adata, color=["patient", "sample"], size=95, show=False)
    plt.suptitle(tit, fontsize=50)
    plt.show()
    sc.pl.umap(adata, size=95, color=['n_genes', 'n_counts', 'percent_mito'], show=False)
    plt.suptitle(tit, fontsize=50)
    plt.show()
    composition_barplot(adata, leiden_tit, "sample", tit + " - Sample count per cluster")
    composition_barplot(adata, leiden_tit, "patient", tit + " - Patient count per cluster")

    # 3) DE genes
    pg.de_analysis(adata, leiden_tit)
    markers = pg.markers(adata)
    pg.write_results_to_excel(markers, "DE_genes_" + tit + ".xlsx")
    filt_markers = filter_markers(markers, 0.75, False)
    filehandler = open("FiltMarkers" + tit + ".pckl", "wb")
    pickle.dump(filt_markers, filehandler)
    filehandler.close()
    return filt_markers
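# The function above relies on two project-specific helpers, `composition_barplot`
# and `filter_markers`, that are not shown here. A minimal sketch of what they
# might look like (assumptions, not the original implementations; the marker
# table layout and column names depend on the pegasus version used):
import matplotlib.pyplot as plt


def composition_barplot(adata, cluster_key, group_key, title):
    # Stacked bar plot of cell counts per cluster, split by a grouping column.
    counts = (
        adata.obs.groupby([cluster_key, group_key]).size().unstack(fill_value=0)
    )
    ax = counts.plot.bar(stacked=True, figsize=(10, 6))
    ax.set_title(title)
    ax.set_xlabel(cluster_key)
    ax.set_ylabel("number of cells")
    plt.tight_layout()
    plt.show()


def filter_markers(markers, auc_cutoff=0.75, verbose=False):
    # Keep up-regulated genes whose AUROC exceeds the cutoff, per cluster.
    # Assumes a pegasus-style marker dict {cluster: {"up": DataFrame, ...}}
    # with an "auroc" column; adjust to the actual column names if they differ.
    filtered = {}
    for clust, tables in markers.items():
        up = tables["up"]
        kept = up[up["auroc"] > auc_cutoff]
        if verbose:
            print(clust, kept.shape[0], "genes kept")
        filtered[clust] = kept
    return filtered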
def viral_enrichment_umap(adata, enrichment_score, save_str="", title="", all_or_subclustering="all"):
    # setting colors
    adata = adata.copy()
    data = adata.obs[enrichment_score]
    num_levels = 20
    vmin, midpoint, vmax = data.min(), 0, data.max()
    levels = np.linspace(vmin, vmax, num_levels)
    midp = np.mean(np.c_[levels[:-1], levels[1:]], axis=1)
    vals = np.interp(midp, [vmin, midpoint, vmax], [0, 0.5, 1])
    colors = plt.cm.coolwarm(vals)
    cmap, norm = matplotlib.colors.from_levels_and_colors(levels, colors)

    adata.obs["Viral+"] = pd.Categorical(adata.obs["Viral+"])

    plt.clf()
    sc.set_figure_params(figsize=(5, 5))
    fig, ax = plt.subplots(1, 1)
    sc.pl.umap(adata, color=enrichment_score, color_map=cmap,
               title=title + " " + enrichment_score, show=False, ax=ax)
    size = 23
    if all_or_subclustering != "all":
        size = 50
    sc.pl.umap(adata[adata.obs["Viral+"] == True, :], color="Viral+", palette=["black"],
               size=size, show=False, ax=ax, legend_loc=None, title=title)
    plt.tight_layout()
    if save_str == "":
        plt.show()
    else:
        plt.savefig("figures/" + save_str + "_viral_umap.pdf")
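# Hypothetical call of the function above, assuming adata.obs already carries a
# per-cell enrichment score (called "viral_score" here purely for illustration)
# and the boolean "Viral+" column the function expects:
viral_enrichment_umap(
    adata,
    enrichment_score="viral_score",
    save_str="myeloid",
    title="Myeloid cells",
    all_or_subclustering="subclustering",
)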
def filter_abundant_barcodes(adata, filter_cells=False, threshold=1000, library_id='', save_path='./figures/'):
    '''
    Plots a weighted histogram of transcripts per cell barcode for guiding the
    placement of a filtering threshold. Returns a filtered version of adata.
    '''
    # if necessary, create the output directory
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    # use adata.uns['library_id'] if it exists
    if not library_id:
        if 'library_id' in adata.uns:
            library_id = adata.uns['library_id']

    # Sum total UMI counts and genes for each cell-barcode, save to obs
    counts = np.array(adata.X.sum(1))
    genes = np.array(adata.X.astype(bool).sum(axis=1))
    adata.obs['total_counts'] = counts
    adata.obs['n_genes_by_counts'] = genes
    ix = counts >= threshold

    # Plot and format a weighted cell-barcode counts histogram
    sc.set_figure_params(dpi=100, figsize=[4, 4], fontsize=12)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(counts, bins=np.logspace(0, 6, 100), weights=counts / sum(counts))
    ax.set_xscale('log')
    ax.set_xlabel('Transcripts per cell barcode')
    ax.set_ylabel('Fraction of total transcripts')
    ax.set_title(library_id)
    ax.text(0.99, 0.95, str(np.sum(ix)) + '/' + str(counts.shape[0]) + ' cells retained',
            ha='right', va='center', transform=ax.transAxes)

    # Overlay the counts threshold as a vertical line
    ax.plot([threshold, threshold], ax.get_ylim())

    # Save figure to file
    fig.tight_layout()
    plt.savefig(save_path + 'barcode_hist_' + library_id + '.png')
    plt.show()
    plt.close()

    # Print the number of cell barcodes that will be retained
    print('Barcode Filtering ' + library_id + ' (' + str(np.sum(ix)) + '/'
          + str(counts.shape[0]) + ' cells retained)')
    print()

    # If requested, return a filtered version of adata
    if filter_cells:
        sc.pp.filter_cells(adata, min_counts=threshold, inplace=True)
    return adata
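# Possible usage of the helper above, assuming a raw (unfiltered) 10x matrix on
# disk; the path and library name are placeholders:
adata = sc.read_10x_mtx('raw_feature_bc_matrix/')
adata.uns['library_id'] = 'sample_01'
adata = filter_abundant_barcodes(adata, filter_cells=True, threshold=1000,
                                 save_path='./figures/')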
def marker_analysis(adata,variables=['leiden','region'],markerpath='https://docs.google.com/spreadsheets/d/e/2PACX-1vTz5a6QncpOOO-f3FHW2Edomn7YM5mOJu4z_y07OE3Q4TzcRr14iZuVyXWHv8rQuejzhhPlEBBH1y0V/pub?gid=1154528422&single=true&output=tsv'): sc.set_figure_params(color_map="Purples") import random markerpath=os.path.expanduser(markerpath) markers=pd.read_csv(markerpath,sep="\t") print(markers) markers[markers.keys()[0]]=[str(x) for x in markers[markers.keys()[0]]] markers[markers.keys()[2]]=[str(x).split(',') for x in markers[markers.keys()[2]]] markers[markers.keys()[3]]=[str(x).split(';') for x in markers[markers.keys()[3]]] markers[markers.keys()[3]]=[[str(x).split(',') for x in y] for y in markers[markers.keys()[3]]] uniqueClasses=set([y for x in markers[markers.keys()[2]] for y in x if y!='nan']) uniqueSubClasses=set([z for x in markers[markers.keys()[3]] for y in x for z in y if z!='nan']) comboClasses=[] print(markers) for i in range(markers.shape[0]): rowlist=[] for j in range(len(markers[markers.keys()[2]][i])): for k in markers[markers.keys()[3]][i][j]: rowlist.append(' '.join(filter(lambda x: x != 'nan',[k,markers[markers.keys()[2]][i][j]]))) comboClasses.append(rowlist) markers['fullclass']=comboClasses markers.set_index(markers.keys()[0],inplace=True,drop=False) markers=markers.loc[ [x for x in markers[markers.keys()[0]] if x in adata.var_names],:] uniqueFullClasses=set([y for x in markers['fullclass'] for y in x if y!='nan']) from collections import defaultdict markerDict = defaultdict(list) for x in uniqueFullClasses: for y in markers[markers.keys()[0]]: if x in markers.loc[y,'fullclass']: markerDict[x].append(y) markerDictClass = defaultdict(list) for x in uniqueClasses: for y in markers[markers.keys()[0]]: if x in markers.loc[y,'fullclass']: markerDictClass[x].append(y) markerPlotGroups=[] for k in markerDict.keys(): if len(markerDict[k])>1: print(k) print(len(markerDict[k])) sc.tl.score_genes(adata,gene_list=markerDict[k],score_name=k,gene_pool= markerDict[k]+random.sample(adata.var.index.tolist(),min(4000,adata.var.index.shape[0]))) markerPlotGroups.append(k) adata.uns['marker_groups']=list(markerDict.keys()) for tag in variables: pd.DataFrame(adata.obs.groupby(tag).describe()).to_csv(os.path.join(sc.settings.figdir, tag+"MarkerSumStats.csv")) if 'X_tsne' in adata.obsm.keys(): sc.pl.tsne(adata, color=markerPlotGroups,save="_Marker_Group") sc.pl.umap(adata, color=markerPlotGroups,save="_Marker_Group") print(markerDict) #sc.pl.violin(adata, markerPlotGroups, groupby='leiden',save="_Marker_Group_violins") for i in markerDictClass: print(i) if 'X_tsne' in adata.obsm.keys(): sc.pl.tsne(adata, color=sorted(markerDictClass[i]),save="_"+str(i)+"_Marker") sc.pl.umap(adata, color=sorted(markerDictClass[i]),save="_"+str(i)+"_Marker") return(adata)
def test_violin():
    sc.pl.set_rcParams_defaults()
    sc.set_figure_params(dpi=50, color_map='viridis')
    pbmc = sc.datasets.pbmc68k_reduced()
    sc.pl.violin(pbmc, ['n_genes', 'percent_mito', 'n_counts'],
                 stripplot=True, multi_panel=True, jitter=True, show=False)
    save_and_compare_images('master_violin_multi_panel', tolerance=40)
def plot_zscore_signature(markers, data, pdf, pop_name):
    sc.set_figure_params(figsize=(10, 8))
    gene_set = [gene for gene in markers if gene in data.var_names]
    data.obs["zscore_" + pop_name] = signature_score_per_cell(data, gene_set)
    gene_list = "".join([x + "\n" for x in gene_set])
    fig = sc.pl.umap(data, color="zscore_" + pop_name, cmap="Purples", size=40,
                     show=False, return_fig=True)
    plt.suptitle(pop_name + " Z-score")
    plt.annotate(gene_list, xy=(1.2, 0), xycoords=('axes fraction', 'axes fraction'))
    pdf.savefig(fig, bbox_inches="tight")
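# `signature_score_per_cell` is not defined in this snippet. A minimal sketch of
# a per-cell z-score signature (an assumption about what the helper does, not
# the original implementation):
import numpy as np


def signature_score_per_cell(data, gene_set):
    expr = data[:, gene_set].X
    if not isinstance(expr, np.ndarray):
        expr = expr.toarray()
    # z-score each gene across cells, then average over the signature genes
    z = (expr - expr.mean(axis=0)) / (expr.std(axis=0) + 1e-12)
    return z.mean(axis=1)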
def set_figure_params( self, episcanpy: bool = True, dpi: int = 80, dpi_save: int = 150, frameon: bool = True, vector_friendly: bool = True, fontsize: int = 14, color_map: Optional[str] = None, format: Union[str, Iterable[str]] = "pdf", transparent: bool = False, ipython_format: str = "png2x", ): """\ Set resolution/size, styling and format of figures. Parameters ---------- episcanpy Init default values for :obj:`matplotlib.rcParams` suited for `scanpy` or `epiScanpy`. dpi Resolution of rendered figures - this influences the size of figures in notebooks. dpi_save Resolution of saved figures. This should typically be higher to achieve publication quality. frameon Add frames and axes labels to scatter plots. vector_friendly Plot scatter plots using `png` backend even when exporting as `pdf` or `svg`. fontsize Set the fontsize for several `rcParams` entries. Ignored if `scanpy=False`. color_map Convenience method for setting the default color map. Ignored if `scanpy=False`. format: {`'png'`, `'pdf'`, `'svg'`, etc.}, optional (default: `'pdf'`) This sets the default format for saving figures: `file_format_figs`. transparent Save figures with transparent back ground. Sets `rcParams['savefig.transparent']`. ipython_format Only concerns the notebook/IPython environment; see :func:`~IPython.display.set_matplotlib_formats` for details. """ sc.set_figure_params(self, scanpy=episcanpy, dpi=dpi, dpi_save=dpi_save, frameon=frameon, vector_friendly=vector_friendly, fontsize=fontsize, color_map=color_map, format=format, transparent=transparent, ipython_format=ipython_format)
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
        Read AnnData object and markers from .csv file. Plot expression \
        of markers in each cluster.
        """)
    parser.add_argument('-h5', '--h5_anndata', action='store', dest='h5',
                        required=True, help='H5 AnnData file.')
    parser.add_argument(
        '--markers_csv', action='store', dest='markers_csv', default='none',
        help='Markers to plot. Must have the following column: hgnc_symbol.')
    options = parser.parse_args()

    # Scanpy settings
    sc.settings.figdir = os.getcwd()  # figure output directory to match base.
    # sc.settings.n_jobs = options.ncpu  # number CPUs
    # sc.settings.max_memory = 500  # in Gb
    sc.set_figure_params(dpi_save=300)

    # Get the out file base.
    out_file_base = os.path.basename(options.markers_csv).rstrip('csv').rstrip('.')

    # Read in the data
    adata = sc.read_h5ad(filename=options.h5)

    # Read in the marker database file
    df = pd.read_table(options.markers_csv)
    df = df['hgnc_symbol']
    marker_genes_found = adata.var['gene_symbols'][adata.var['gene_symbols'].isin(df)]

    # Dotplots
    _ = sc.pl.dotplot(
        adata=adata,
        var_names=marker_genes_found,
        groupby='cluster',
        gene_symbols='gene_symbols',
        dendrogram=True,
        show=False,
        standard_scale='var',  # Scale color between 0 and 1
        use_raw=False,
        color_map='Blues',
        save='{}_dotplot.png'.format(out_file_base))
def test_violin(image_comparer):
    save_and_compare_images = image_comparer(ROOT, FIGS, tol=40)

    with plt.rc_context():
        sc.pl.set_rcParams_defaults()
        sc.set_figure_params(dpi=50, color_map='viridis')
        pbmc = sc.datasets.pbmc68k_reduced()
        sc.pl.violin(
            pbmc,
            ['n_genes', 'percent_mito', 'n_counts'],
            stripplot=True,
            multi_panel=True,
            jitter=True,
            show=False,
        )
        save_and_compare_images('master_violin_multi_panel')

        sc.pl.violin(
            pbmc,
            ['n_genes', 'percent_mito', 'n_counts'],
            ylabel=["foo", "bar", "baz"],
            groupby='bulk_labels',
            stripplot=True,
            multi_panel=True,
            jitter=True,
            show=False,
            rotation=90,
        )
        save_and_compare_images('master_violin_multi_panel_with_groupby')

        # test use of layer
        pbmc.layers['negative'] = pbmc.X * -1
        sc.pl.violin(
            pbmc,
            'CST3',
            groupby='bulk_labels',
            stripplot=True,
            multi_panel=True,
            jitter=True,
            show=False,
            layer='negative',
            use_raw=False,
            rotation=90,
        )
        save_and_compare_images('master_violin_multi_panel_with_layer')
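# The `image_comparer` fixture used above comes from scanpy's test suite and is
# not shown here. A rough sketch of the idea (an assumption about its behaviour,
# not the real fixture): it returns a function that saves the current figure and
# compares it against a stored reference image.
from matplotlib.testing.compare import compare_images
import matplotlib.pyplot as plt


def make_image_comparer(root, figs, tol):
    def save_and_compare_images(basename):
        expected = str(root / f'{basename}.png')
        actual = str(figs / f'{basename}.png')
        plt.savefig(actual, dpi=40)
        plt.close()
        # compare_images returns None when the images match within `tol`
        result = compare_images(expected, actual, tol)
        assert result is None, result
    return save_and_compare_images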
def visualAnalyzeData(canvas, fig, analysisMethod, color, paths):
    fig.clear()
    axes = fig.add_subplot(111)

    # sc setting
    sc.settings.verbosity = 3
    sc.set_figure_params(dpi=100, color_map='viridis_r')
    sc.logging.print_header()

    # load data, store in AnnData
    adata = sc.AnnData(X=np.loadtxt(paths['seq_expr']).T)
    adata.var_names = np.loadtxt(paths['seq_gnames'], dtype=str).tolist()
    adata.obs_names = np.loadtxt(paths['seq_cnames'], dtype=str).tolist()
    adata.obs['cell_name'] = np.loadtxt(paths['seq_cnames'], dtype=str).tolist()
    # print(adata)
    # print(adata.to_df())
    # print(adata.obs.columns)

    if color == 'cell label':
        color = 'cell_name'

    if analysisMethod == "clustering":  # based on leiden
        sc.pp.neighbors(adata, knn=True)  # two params
        sc.tl.leiden(adata, key_added='clustering')  # by default, using Leiden graph clustering
        sc.tl.umap(adata)
        sc.pl.umap(adata, color=color, ax=axes, show=False)

    if analysisMethod == 'trajectory inference':
        sc.pp.neighbors(adata, knn=True)  # two params
        sc.tl.louvain(adata, resolution=1.0, key_added='clustering')
        sc.tl.paga(adata, groups='clustering')
        sc.pl.paga(adata, color=color, ax=axes, show=False)

    if analysisMethod == 'dimensionality reduction (pca)':
        sc.tl.pca(adata, svd_solver='arpack')
        if color != 'clustering':
            sc.pl.pca(adata, color=color, ax=axes, show=False)
        else:
            tk.messagebox.showwarning(
                title='Warning',
                message='dimensionality reduction visualization not support clustering!'
            )

    canvas.draw()
def test_violin(image_comparer):
    save_and_compare_images = image_comparer(ROOT, FIGS, tol=40)

    sc.pl.set_rcParams_defaults()
    sc.set_figure_params(dpi=50, color_map='viridis')
    pbmc = sc.datasets.pbmc68k_reduced()
    sc.pl.violin(pbmc, ['n_genes', 'percent_mito', 'n_counts'],
                 stripplot=True, multi_panel=True, jitter=True, show=False)
    save_and_compare_images('master_violin_multi_panel')

    sc.pl.violin(pbmc, ['n_genes', 'percent_mito', 'n_counts'],
                 groupby='bulk_labels', stripplot=True, multi_panel=True,
                 jitter=True, show=False)
    save_and_compare_images('master_violin_multi_panel_with_groupby')
import logging, matplotlib, os, sys
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import pandas as pd
from glbase3 import genelist

plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=200)
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.size'] = 10
sc.settings.figdir = 'diffexp'

adata = sc.read('./learned.h5ad')

sc.tl.rank_genes_groups(adata, 'leiden_r0.5', method='wilcoxon', n_genes=3000)
adata.write('./de.h5ad')

adata = sc.read('./de.h5ad')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=True, show=False, save='genes-top25.pdf')
sc.pl.rank_genes_groups(adata, key='rank_genes_groups', show=False, save='genes.pdf')
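# Optional follow-up (an assumption, not part of the original script): the ranked
# genes computed above can be exported to a table, e.g. for loading into glbase3.
# `sc.get.rank_genes_groups_df` is available in scanpy >= 1.5; the cluster label
# '0' is assumed to exist in 'leiden_r0.5'.
de_df = sc.get.rank_genes_groups_df(adata, group='0')
de_df.to_csv('de_cluster0_wilcoxon.tsv', sep='\t', index=False)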
setup()

from matplotlib.testing.compare import compare_images
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from anndata import AnnData

import scanpy as sc

HERE: Path = Path(__file__).parent
ROOT = HERE / '_images'
FIGS = HERE / 'figures'

sc.pl.set_rcParams_defaults()
sc.set_figure_params(dpi=40, color_map='viridis')

#####
# Test images are saved under the folder ./figures
# if test images need to be updated, simply copy them from
# the ./figures folder to ./_images/


def test_heatmap(image_comparer):
    save_and_compare_images = image_comparer(ROOT, FIGS, tol=15)

    adata = sc.datasets.krumsiek11()
    sc.pl.heatmap(adata, adata.var_names, 'cell_type', use_raw=False,
def main(): """Run CLI.""" parser = argparse.ArgumentParser( description=""" Read AnnData object and PCs file. Generates UMAP. """ ) parser.add_argument( '-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__) ) parser.add_argument( '-h5', '--h5_anndata', action='store', dest='h5', required=True, help='H5 AnnData file.' ) parser.add_argument( '-pc', '--tsv_pcs', action='store', dest='pc', default='', help='Tab-delimited file of PCs for each cell. First column is\ cell_barcode. Subsequent columns are PCs. If "", uses pca\ slot in AnnData.\ (default: "")' ) parser.add_argument( '-cq', '--colors_quantitative', action='store', dest='cq', default='', help='Comma seperated list of quantitative variable names for colors.\ (default: "")' ) parser.add_argument( '-cc', '--colors_categorical', action='store', dest='cc', default='', help='Comma seperated list of categorical variable names for colors.\ (default: "")' ) parser.add_argument( '-npc', '--number_pcs', action='store', dest='npc', default=0, type=int, help='Number of PCs to use.\ (default: maximum number in tsv_pcs file)' ) parser.add_argument( '-nn', '--n_neighbors', action='store', dest='n_neighbors', default='15', type=str, help='Number of neighbors for sc.pp.neighbors call\ (default: %(default)s)' ) parser.add_argument( '-uinit', '--umap_init', action='store', dest='umap_init', default='X_pca', help='How to initialize the low dimensional embedding.\ Valid options: any key for adata.obsm,\ ’paga’: positions from paga(),\ ’spectral’: use a spectral embedding of the graph,\ ’random’: assign initial embedding positions at random.\ (default: X_pca, the slot where tsv_pcs is stored if provided)' ) parser.add_argument( '-umd', '--umap_min_dist', action='store', dest='umap_min_dist', default='0.5', type=str, help='The effective minimum distance between embedded points. Smaller\ values will result in a more clustered/clumped embedding where\ nearby points on the manifold are drawn closer together, while\ larger values will result on a more even dispersal of points.\ The value should be set relative to the spread value, which\ determines the scale at which embedded points will be spread out.\ (default: %(default)s)' ) parser.add_argument( '-us', '--umap_spread', action='store', dest='umap_spread', default='1.0', type=str, help='The minimum distance apart that points are allowed to be in the\ low dimensional representation (effective scale of embedded points\ ). In combination with min_dist this determines how\ clustered/clumped the embedded points are.\ (default: %(default)s)' ) parser.add_argument( '-dln', '--drop_legend_n', action='store', dest='drop_legend', default=-1, type=int, help='Drop the legend for categorical colors with >= drop_legend_n\ categories. If drop_legend_n < 0, then no legend drops are\ performed.\ (default: %(default)s)' ) parser.add_argument( '--force_recalculate_neighbors', action='store_true', dest='calculate_neighbors', default=False, help='Calculate neighbor graph even if it already exists in the\ AnnData (which it my do so if you already ran BBKNN).\ (default: %(default)s)' ) parser.add_argument( '-ncpu', '--number_cpu', action='store', dest='ncpu', default=4, type=int, help='Number of CPUs to use.\ (default: %(default)s)' ) parser.add_argument( '-of', '--output_file', action='store', dest='of', default='', help='Basename of output files, assuming output in current working \ directory.\ (default: <h5_anndata>-<tsv_pcs>-umap)' ) options = parser.parse_args() # Fixed settings. 
verbose = True # Scanpy settings sc.settings.figdir = os.getcwd() # figure output directory to match base. sc.settings.n_jobs = options.ncpu # number CPUs # sc.settings.max_memory = 500 # in Gb sc.set_figure_params(dpi_save=300) # Load the AnnData file. adata = sc.read_h5ad(filename=options.h5) # Load the PCs. if options.pc == '': df_pca = pd.DataFrame( data=adata.obsm['X_pca'], index=adata.obs.index, columns=[ 'PC{}'.format(x) for x in range(1, adata.obsm['X_pca'].shape[1]+1) ] ) else: df_pca = pd.read_csv(options.pc, sep='\t', index_col='cell_barcode') # df_pca = pd.read_csv( # 'adata-pcs-harmony.tsv.gz', # sep='\t', # index_col='cell_barcode' # ) # Check that nPCs is valid. n_pcs = options.npc if 'neighbors' in adata.uns and not options.calculate_neighbors: # If we are using the pre-calculated neighbors use the PCs from that n_pcs = adata.uns['neighbors']['params']['n_pcs'] if n_pcs == 0: n_pcs = len(df_pca.columns) elif n_pcs > len(df_pca.columns): raise Exception( '--number_pcs ({}) is > than n_pcs in --tsv_pcs ({}).'.format( n_pcs, len(df_pca.columns) ) ) if verbose: print('Using {} PCs.'.format(n_pcs)) # Subset number of PCs to be exactly nPCs - here we assume PCs are ordered. print('Subetting PCs - we assume they are ordered by column index.') df_pca = df_pca.iloc[:, range(0, n_pcs)] print('PC columns:\t{}'.format(np.array_str(df_pca.columns))) # Add the reduced dimensions to the AnnData object. # NOTE: We need to do this for BBKNN in the case were we init with X_pca adata.obsm['X_pca__umap'] = df_pca.loc[adata.obs.index, :].values.copy() # Get the init position for UMAP umap_init = options.umap_init if umap_init == 'X_pca': umap_init = 'X_pca__umap' # Get the out file base. out_file_base = options.of if out_file_base == '': out_file_base = '{}-{}-umap'.format( os.path.basename(options.h5.rstrip('h5ad').rstrip('.')), os.path.basename(options.pc.rstrip('tsv.gz').rstrip('.')) ) # Append the parameters to the output file. out_file_base = '{},number_pcs={}'.format( out_file_base, n_pcs ) out_file_base = '{},umap_init={}'.format( out_file_base, options.umap_init ) # Parse the color variables. colors_quantitative = [] if options.cq != '': colors_quantitative = options.cq.split(',') colors_categorical = [] if options.cc != '': colors_categorical = options.cc.split(',') if len(colors_quantitative) == 0 and len(colors_categorical) == 0: raise Exception('Specify a color value.') # Add colors_large_palette to adata.uns. # adata.uns["annotation_colors"] = COLORS_LARGE_PALLETE # Parse the neighbors iterations. list__n_neighbors = [] if options.n_neighbors != '': list__n_neighbors = list(map(int, options.n_neighbors.split(','))) # Parse the min_dist iterations. list__min_dist = [] if options.umap_min_dist != '': list__min_dist = list(map(float, options.umap_min_dist.split(','))) # Parse the neighbors iterations. list__spread = [] if options.umap_spread != '': list__spread = list(map(float, options.umap_spread.split(','))) # Update the out base if only one of any iteration. if len(list__n_neighbors) == 1: out_file_base = '{},n_neighbors={}'.format( out_file_base, list__n_neighbors[0] ) if len(list__min_dist) == 1: out_file_base = '{},umap_min_dist={}'.format( out_file_base, list__min_dist[0] ) if len(list__spread) == 1: out_file_base = '{},umap_spread={}'.format( out_file_base, list__spread[0] ) # Loop over all combinations of the different paramters we want to analyse. 
list__umap_keys = {} for i__n_neighbors, i__min_dist, i__spread in itertools.product( list__n_neighbors, list__min_dist, list__spread ): # Check input parameters if not (2 <= i__n_neighbors <= 100): # Recommended in parameter documentation: # https://umap-learn.readthedocs.io/en/latest/api.html warnings.warn( 'WARNING: it is suggested to set n_neighbors to a {}'.format( 'value between 2-100.' ) ) if not (0.0 <= i__min_dist <= 1.0): # Recommended here: https://github.com/lmcinnes/umap/issues/249 warnings.warn( 'WARNING: it is suggested to set umap_min_dist to a {}'.format( 'value between 0-1.' ) ) if not (0.0 <= i__spread <= 3.0): # Recommendation based on single cell experience. warnings.warn( 'WARNING: it is suggested to set umap_spread to a {}'.format( 'value between 0-3.' ) ) # Set the plot label. plt__label = 'n_neighbors={}'.format(i__n_neighbors) plt__label = '{},umap_min_dist={}'.format( plt__label, str(i__min_dist).replace('.', 'pt') ) plt__label = '{},umap_spread={}'.format( plt__label, str(i__spread).replace('.', 'pt') ) # Calculate neighbors for on the specified PCs. # By default saved to adata.uns['neighbors'] # # First, however, check to see if adata.uns['neighbors'] already exists # ...and unless the user tells us not to, use that slot, not calculate # neighbors. This default behaviour is to accommodate the instance when # bbknn has been run on the data. if 'neighbors' not in adata.uns or options.calculate_neighbors: sc.pp.neighbors( adata, use_rep='X_pca__umap', n_pcs=n_pcs, n_neighbors=i__n_neighbors, # Scanpy default = 15 copy=False, random_state=0 ) else: warnings.warn( 'WARNING: found neighbors slot in adata.uns. {}'.format( 'Not calculating neighbors (ignoring n_neighbors).' ) ) # If we are using the pre-calculated neighbors drop npcs note. # if 'n_pcs' in adata.uns['neighbors']['params']: # n_pcs = adata.uns['neighbors']['params']['n_pcs'] i__n_neighbors = adata.uns['neighbors']['params']['n_neighbors'] # Save the parameters to a dict list__umap_keys['X_umap__{}'.format(plt__label)] = { 'n_neighbors': i__n_neighbors, 'umap_min_dist': i__min_dist, 'umap_spread': i__spread } adata.uns['neighbors__{}'.format(plt__label)] = adata.uns['neighbors'] # TODO: add paga # # If init with paga, plot paga first - NOTE we can only do this if # if options.umap_init == 'paga' and 'paga' not in adata.uns: # print( # 'Trying to call sc.tl.paga.', # 'NOTE: requires one to have clustered the data.' # ) # sc.tl.paga( # adata, # use_rna_velocity=False, # copy=False # ) # UMAP # Saved to adata.uns['umap'] and adata.obsm['X_umap'] # NOTE: If umap_init == X_pca, then X_umap will have an equal number # of n_components to X_pca (n_components is overridden). sc.tl.umap( adata, n_components=2, min_dist=i__min_dist, # Scanpy default = 0.05 spread=i__spread, # Scanpy default = 1.0 init_pos=umap_init, # Scanpy default = spectral # For some reason cannot access neighbors key slot, thus we # must keep uns['neighbors'] until we have run this. 
# neighbors_key='neighbors__{}'.format(plt__label), copy=False, random_state=0 ) if 'embedding_density' in colors_quantitative: sc.tl.embedding_density( adata, basis='umap' ) # Rename density estimates adata.obs[ 'umap__{}__density'.format(plt__label) ] = adata.obs.pop('umap_density') adata.uns[ 'umap__{}__density_params'.format(plt__label) ] = adata.uns.pop('umap_density_params') # Rename UMAP adata.uns[ 'umap__{}__params'.format(plt__label) ] = adata.uns.pop('umap') adata.obsm[ 'X_umap__{}'.format(plt__label) ] = adata.obsm.pop('X_umap') # Delete key that we no longer need since already copied and we have # run umap. del adata.uns['neighbors'] # NOTE: If the color var is a gene, you should color by ln(CPM+1). # By default these sc.pl.umap uses the .raw attribute of AnnData # if present which is assumed to be ln(CPM+1). # For each color to plot, loop over the different iterations. for color_var in colors_quantitative: save_plot( adata=adata, list__umap_keys=list__umap_keys, out_file_base=out_file_base, color_var=color_var, colors_quantitative=True, drop_legend=options.drop_legend ) for color_var in colors_categorical: save_plot( adata=adata, list__umap_keys=list__umap_keys, out_file_base=out_file_base, color_var=color_var, colors_quantitative=False, drop_legend=options.drop_legend ) adata.write( '{}.h5ad'.format('test'), compression='gzip' )
# %% [markdown]
# container rpy_v3.1

# %%
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import tables as tb
import scipy as scipy
import dotscore

# Specifying random seed
import random

sc.set_figure_params(color_map='viridis', dpi_save=350)
sc.settings.verbosity = 3

# %%
# %load_ext autoreload
# %autoreload 2

# %%
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (4, 4)
mpl.rcParams['pdf.fonttype'] = 42  # Ensures readable fonts in illustrator
mpl.rcParams['ps.fonttype'] = 42

# %%
# Setting up target directories
sample1 = 'pbmc5k'

dirPre = '../preprocess_scanpy/write_' + sample1 + '/'
dirLeiden = '../leiden_scanpy/write_' + sample1 + '/'
dirOut1 = './write_' + sample1 + '/'
dirOut = './write_' + sample1 + '/CRclusters9/'
dirFig = './figures_' + sample1 + '/CRclusters9/'

os.system('mkdir -p ' + dirOut)
os.system('mkdir -p ' + dirFig)

#%%
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
#scanpy.set_figure_params(scanpy=True, dpi=80, dpi_save=150, frameon=True, vector_friendly=True, fontsize=14, figsize=None, color_map=None, format='pdf', facecolor=None, transparent=False, ipython_format='png2x')
sc.set_figure_params(dpi=150, dpi_save=300, fontsize=20, figsize=[8, 8])

#%%
mtxPrefix = '/scratch/dobin/STAR/STARsoloPreprint/maia1/count/'

toolsIndex = ['CR', 'Sfu', 'Ssp', 'kb', 'Ask', 'Adf']  # , 'Ase', 'Apa', 'Afu']

tools = pd.DataFrame(index=toolsIndex, columns=['name', 'mtxDir'])

tools.name['CR'] = 'CellRanger'
tools.mtxDir[
    'CR'] = mtxPrefix + 'CellRanger_5.0.1/human_CR_3.0.0/standard/default/10X/3/pbmc_5k/20/b02/Run1/outs/filtered_feature_bc_matrix'

tools.name['Sfu'] = 'STAR fullSA'
tools.mtxDir[
    'Sfu'] = mtxPrefix + 'STAR_2.7.8a/human_CR_3.0.0/fullSA/10X_CR4_noSAM/10X/3/pbmc_5k/20/b02/Solo.out/Gene/raw/'

tools.name['Ssp'] = 'STAR sparseSA'
tools.mtxDir[
from typing import Mapping

import pytest
from anndata import AnnData

import scanpy as sc
import numpy as np
import pandas as pd

from squidpy import gr, pl
from tests.conftest import DPI, PlotTester, PlotTesterMeta

C_KEY = "leiden"

sc.pl.set_rcParams_defaults()
sc.set_figure_params(dpi=40, color_map="viridis")

# WARNING:
# 1. all classes must both subclass PlotTester and use metaclass=PlotTesterMeta
# 2. tests which produce a plot must be prefixed with `test_plot_`
# 3. if the tolerance needs to be change, don't prefix the function with `test_plot_`, but with something else
#    the comp. function can be accessed as `self.compare(<your_filename>, tolerance=<your_tolerance>)`
#    ".png" is appended to <your_filename>, no need to set it


class TestGraph(PlotTester, metaclass=PlotTesterMeta):
    def test_plot_interaction(self, adata: AnnData):
        gr.spatial_neighbors(adata)
        gr.interaction_matrix(adata, cluster_key=C_KEY)
        pl.interaction_matrix(adata, cluster_key=C_KEY)
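    # Example of the third rule in the WARNING block above (an illustrative
    # sketch, not from the original file): a test that needs a non-default
    # tolerance is *not* prefixed with `test_plot_` and calls `self.compare`
    # itself; the filename passed here is a placeholder.
    def test_interaction_custom_tolerance(self, adata: AnnData):
        gr.spatial_neighbors(adata)
        gr.interaction_matrix(adata, cluster_key=C_KEY)
        pl.interaction_matrix(adata, cluster_key=C_KEY)
        self.compare("Graph_interaction_custom", tolerance=70)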
import anndata as ad
import episcanpy.api as epi
import scanpy as sc
import numpy as np
import pandas as pd
import re
import pyranges as pr

sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250, frameon=True,
                     vector_friendly=True, color_map="YlGnBu", format='pdf',
                     transparent=False, ipython_format='png2x')

DATADIR = './EpiScanpy/'

HBCx22 = epi.read_text("Matrices/HBCx_22.tsv", delimiter="\t", first_column_names="regions")
Jurkat = epi.read_text("Matrices/Jurkat.tsv", delimiter="\t", first_column_names="regions")
Ramos = epi.read_text("Matrices/Ramos.tsv", delimiter="\t", first_column_names="regions")
MM468 = epi.read_text("Matrices/MM468.tsv", delimiter="\t", first_column_names="regions")

MM468.var_names = "MM468_" + MM468.var_names
HBCx22.var_names = "HBCx22_" + HBCx22.var_names
Jurkat.var_names = "Jurkat_" + Jurkat.var_names
Ramos.var_names = "Ramos_" + Ramos.var_names

Ramos = Ramos.T
HBCx22 = HBCx22.T
Jurkat = Jurkat.T
MM468 = MM468.T

adata = MM468.concatenate(HBCx22, Jurkat, Ramos, join="inner", index_unique=None)
                                          out='z')

# 2. cluster
sc.pp.neighbors(adata, n_neighbors=30, use_rep='latent')
if args.cluster_method == 'leiden':
    sc.tl.leiden(adata)
elif args.cluster_method == 'kmeans':
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
    adata.obs['kmeans'] = kmeans.fit_predict(adata.obsm['latent']).astype(str)

# if args.reference in adata.obs:
#     cluster_report(adata.obs[args.reference].cat.codes, adata.obs[args.cluster_method].astype(int))

sc.settings.figdir = outdir
sc.set_figure_params(dpi=80, figsize=(6, 6), fontsize=10)

if args.embed == 'UMAP':
    sc.tl.umap(adata, min_dist=0.1)
    color = [c for c in ['celltype', args.cluster_method] if c in adata.obs]
    sc.pl.umap(adata, color=color, save='.pdf', wspace=0.4, ncols=4)
elif args.embed == 'tSNE':
    sc.tl.tsne(adata, use_rep='latent')
    color = [c for c in ['celltype', args.cluster_method] if c in adata.obs]
    sc.pl.tsne(adata, color=color, save='.pdf', wspace=0.4, ncols=4)

if args.impute:
    adata.obsm['impute'] = model.encodeBatch(testloader,
### SET DIRECTORY TO READ/WRITE DATA.
# THIS SHOULD BE THE DIRECTORY CONTAINING THE .MTX DATA FILE AND .TSV BARCODES & FEATURE FILES:
BaseDirectory = '/d2/studies/scanPy/VM_LHb_Stress/Ctrl_Stress_MergedScanPy/'
sampleName = 'ACWS_VM_NDB_Stress'  # This is used for name result output files

os.chdir(BaseDirectory)
%logstart -o scanpy_log.txt

### SET SCANPY SETTINGS:
results_file = 'Results_File'
#results_file = os.path.join(BaseDirectory, sampleName + '_scanpy_results.h5ad')  # the file that will store the analysis results
results_file_partial = os.path.join(BaseDirectory, sampleName + '_scanpy_adata_preHVGselection.h5ad')

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.n_jobs = 8  # use parallel processing when possible
sc.logging.print_header()
sc.set_figure_params(fontsize=14, dpi=80, dpi_save=300, format='svg')
matplotlib.rcParams.update({'text.usetex': False, 'font.family': 'stixgeneral', 'mathtext.fontset': 'stix',})
color_map = 'inferno'  # see options for colormaps at https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html

### LOAD DATA
if not new:
    try:
        sc.read(results_file)
    except FileNotFoundError:
        print("No full results file found. Attemping to open partial results file.")
        sc.read(results_file_partial)
elif new:
    if dataType == '.h5':
        fileNames = glob.glob(os.path.join(BaseDirectory, '*filtered.h5'))
        if len(fileNames) > 1:
            raise NameError("Multiple files matched glob pattern, check files or use more specific pattern")
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata)
    return adata.obs["leiden"]


###############################################################################
# Then, we calculate feature clusters using different features and compare them to gene clusters:

adata.obs["features_summary_cluster"] = cluster_features(adata.obsm["features"], like="summary")
adata.obs["features_histogram_cluster"] = cluster_features(adata.obsm["features"], like="histogram")
adata.obs["features_texture_cluster"] = cluster_features(adata.obsm["features"], like="texture")

sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.pl.spatial(
    adata,
    color=[
        "features_summary_cluster",
        "features_histogram_cluster",
        "features_texture_cluster",
        "cluster",
    ],
    ncols=3,
)

###############################################################################
# Like the gene-space clusters (bottom middle), the feature space clusters are also spatially coherent.
#
# The feature clusters of the different feature extractors are quite diverse, but all of them reflect
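# The `cluster_features(...)` calls above use a helper whose definition is only
# partially visible at the top of this snippet (its last three lines). A plausible
# full version, assuming it Leiden-clusters a subset of the image features
# (a sketch under that assumption, not necessarily the original):
from anndata import AnnData
import pandas as pd


def cluster_features(features: pd.DataFrame, like=None) -> pd.Series:
    # optionally keep only feature columns whose name contains `like`
    if like is not None:
        features = features.filter(like=like)
    # wrap the feature matrix in a temporary AnnData and cluster it
    adata = AnnData(features)
    sc.pp.scale(adata)
    sc.pp.pca(adata, n_comps=min(10, features.shape[1] - 1))
    sc.pp.neighbors(adata)
    sc.tl.leiden(adata)
    return adata.obs["leiden"]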
import scanpy as sc
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import SpectralClustering, OPTICS, cluster_optics_dbscan, AgglomerativeClustering

# settings
plt.rc('font', size=8)
plt.rc('font', family='sans serif')
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
plt.rcParams['text.usetex'] = True
plt.rcParams['legend.frameon'] = False
plt.rcParams['axes.grid'] = False
plt.rcParams['legend.markerscale'] = 0.5
sc.set_figure_params(dpi=300, dpi_save=600, frameon=False, fontsize=8)
plt.rcParams['savefig.dpi'] = 600
sc.settings.verbosity = 2
sc._settings.ScanpyConfig.n_jobs = -1

# reproducibility
rs = np.random.seed(42)

# utils
def mwu(X, Y, gene_names, correction=None, debug=False):
    '''
    Benjamini-Hochberg correction implemented. Can change to Bonferonni
def plot_sca(self, adata, sca_params, figdir='./figures/'): ''' See the Scanpy visualization library for examples ''' print("Plotting") ## Create my custom palette for FeaturePlots and define a matlplotlib colormap object if self.umap_feature_color == 'blue_orange': feature_colors = [(35, 35, 142), (255, 127, 0)] my_feature_cmap = self.make_cmap(feature_colors, bit=True) elif self.umap_feature_color == 'yellow_blue': feature_colors = [(210, 210, 210), (210, 210, 210), (245, 245, 200), (100, 200, 225), (0, 45, 125)] position = [0, 0.019999, 0.02, 0.55, 1] my_feature_cmap = self.make_cmap(feature_colors, bit=True, position=position) else: feature_colors = [(210, 210, 210), (210, 210, 210), (245, 245, 200), (100, 200, 225), (0, 45, 125)] position = [0, 0.019999, 0.02, 0.55, 1] my_feature_cmap = self.make_cmap(feature_colors, bit=True, position=position) gray_cmap = self.make_cmap([(220, 220, 220), (220, 220, 220)], bit=True) ## Check to see if user specified a color palette for categorical umap plots, ie. leiden, obs_fields if self.umap_categorical_color == 'default': ## Custom color palette for cluster plots and observation plots colors = [(1, 0.5, 0), (0.5, 0.5, 0.85), (0, 1, 0), (1, 0, 0), (0, 0, 0.9), (0, 1, 1), (0.4, 0.4, 0.4), (0.5, 0.85, 0.5), (0.5, 0.15, 0.5), (0.15, 0.5, 0.5), (0.5, 0.5, 0.15), (0.9, 0.9, 0), (1, 0, 1), (0, 0.5, 1), (0.85, 0.5, 0.5), (0.5, 1, 0), (0.5, 0, 1), (1, 0, 0.5), (0, 0.9, 0.6), (0.3, 0.6, 0), (0, 0.3, 0.6), (0.6, 0.3, 0), (0.3, 0, 0.6), (0, 0.6, 0.3), (0.6, 0, 0.3)] else: colors = self.umap_categorical_color ## General figure parameters and settings sc.set_figure_params(dpi_save=300, dpi=300) #,vector_friendly=False) sc.settings.figdir = figdir sc.set_figure_params(fontsize=12) size = self.size # Check to see if user wants publication quality figures if self.final_quality: # rcParams['figure.figsize'] = 4, 4 rcParams['savefig.dpi'] = 1200 file_type = '.pdf' else: file_type = '.png' ## Violin plots for filtering parameters pre and post sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save='_postFiltered_plot.png', show=False) if sca_params.adata_preQC: sc.pl.violin(sca_params.adata_preQC, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save='_preFiltered_plot.png', show=False) ## Draw the PCA elbow plot to determine which PCs to use sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save='_elbowPlot.png', show=False) ## Ranks and displays most contributing genes for each principal component components = 4 loadings_components = range( sca_params.analysis_params.n_pcs - components, sca_params.analysis_params.n_pcs + components + 1) sc.pl.pca_loadings(adata, components=loadings_components, save='_rank_genes.png', show=False) ## Plot results of UMAP (and t-SNE) dimensional reduction and clustering for observation in self.umap_obs: legend = 'on data' if (observation == sca_params.analysis_params. 
clustering_choice) else 'right margin' sc.pl.umap(adata, color=observation, save=''.join(['_', observation, file_type]), show=False, legend_loc=legend, edges=False, size=size, palette=colors, alpha=0.75) if sca_params.analysis_params.do_tSNE: sc.pl.tsne(adata, color=observation, save=''.join(['_', observation, file_type]), show=False, legend_loc=legend, edges=False, size=size, palette=colors, alpha=0.75) # sc.external.pl.phate(adata,gene_symbols=['CAV1','LY6D','KRT4','TP63','CDH1'], use_raw=True, color_map=my_feature_cmap, # save='phate.png', size=size) ## Find marker genes via Wilxocon test based on cluster assignment # Create a simple plot to show the top 25 most significant markers for each cluster # Write most significant markers to a csv file for rank_grouping in self.rank_grouping: n_genes_rank = 5 for comparison in self.clusters2_compare: if comparison == 'all': comparison = None self.__rank_genes(adata, rank_grouping, figdir=figdir, clusters2_compare=comparison) if 'all' in self.clusters2_compare: sc.pl.rank_genes_groups_heatmap( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_heatmap_', rank_grouping, file_type]), cmap=my_feature_cmap) sc.pl.rank_genes_groups_dotplot( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_dotplot_', rank_grouping, file_type]), color_map=my_feature_cmap) sc.pl.rank_genes_groups_stacked_violin( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_violin_', rank_grouping, file_type])) ## Feature plots and dot plot analysis for each specified set of genes #sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markerPlots.png', show=False) if sca_params.gene_lists: missing_genes = [] for gene_list in sca_params.gene_lists: gene_obj = sca_params.gene_dict[gene_list] genes_to_plot = [] [genes_to_plot, missing_genes ] = self.__find_genes(adata, gene_obj.markers, missing_genes=missing_genes) ## Do FeaturePlots for select genes print('Plotting standard marker genes: ', genes_to_plot, '\n') sc.pl.umap(adata, color=genes_to_plot, save=''.join( ['_featureplots_', gene_list, file_type]), show=False, cmap=my_feature_cmap, size=size, use_raw=True, vmin=0) if sca_params.analysis_params.do_tSNE: sc.pl.tsne(adata, color=genes_to_plot, save=''.join( ['_featureplots_', gene_list, file_type]), show=False, cmap=my_feature_cmap, size=size, use_raw=True, vmin=0) feature_positions = gene_obj.feature_positions # Manually set and determined feature_groups = gene_obj.feature_groups groupby_positions = gene_obj.groupby_positions if len(gene_obj.markers) != 0: for grouping in self.dot_grouping: ## Dotplot analysis # Circle color corresponds to expression level, and circle size corresponds to percentage of cells expressing gene ## Reordering categories for dotplot or heatmap rows adata_plots = adata.copy() dendrogram = False if groupby_positions: dendrogram = False clustering_chosen = sca_params.analysis_params.clustering_choice adata_plots.obs[clustering_chosen] = adata.obs[ clustering_chosen].cat.reorder_categories( groupby_positions, inplace=False) sc.pl.dotplot( adata_plots, genes_to_plot, groupby=grouping, var_group_positions=feature_positions, var_group_labels=feature_groups, save=''.join([ '_markers_', gene_list, '_', grouping, file_type ]), show=False, color_map=my_feature_cmap, use_raw=True, dendrogram=dendrogram ) #, figsize=(4,6))#, dot_max=0.4)#, dendrogram=True) ## Heatmaps # Each horizontal line represents expression of one cell sc.pl.heatmap(adata_plots, genes_to_plot, 
groupby=grouping, var_group_positions=feature_positions, var_group_labels=feature_groups, save=''.join([ '_markers_', gene_list, '_', grouping, file_type ]), show=False, cmap=my_feature_cmap, use_raw=True) # Genes that are not expressed or are invariable are plotted using a grayscale sca_params.missing_genes = missing_genes print('Plotting empty genes: ', missing_genes, '\n') empty_genes = [ gene for gene in missing_genes if (gene in adata.raw.var_names) ] genes_noseq = [ gene for gene in missing_genes if (gene not in empty_genes) ] print('Zero genes: ', empty_genes, '\n') print('Gene not in dataset: ', genes_noseq, '\n') if empty_genes: sc.pl.umap(adata, color=empty_genes, save=''.join(['_featureplots_gray', file_type]), show=False, cmap=gray_cmap, size=size, use_raw=True) # tSNE Plots - should move to integrate in umap code if sca_params.analysis_params.do_tSNE: sc.pl.tsne(adata, color=missing_genes, save=''.join(['_featureplots_gray', file_type]), show=False, cmap=gray_cmap, size=size, use_raw=True) if sca_params.qc_params.doublet_detection: sc.pl.umap(adata, color='doublet_labels', save='doublet_test.png', show=False, edges=False, size=size) f = doubletdetection.plot.convergence( sca_params.doublet_clf, save=''.join([figdir, 'convergence_test.pdf']), show=False, p_thresh=1e-16, voter_thresh=0.5) f3 = doubletdetection.plot.threshold( sca_params.doublet_clf, save=''.join([figdir, 'threshold_test.pdf']), show=False, p_step=6) # Generate a umap feature plot based on cell scoring if sca_params.cell_score_lists: max_list_len = max([ len(self.vmax_list), len(self.vmin_list), len(sca_params.cell_score_lists) ]) if not self.vmax_list: vmax = [ adata.obs.loc[:, sca_params.cell_score_lists].values.max() ] * max_list_len else: vmax = self.vmax_list if not self.vmin_list: vmin = [ adata.obs.loc[:, sca_params.cell_score_lists].values.min() ] * max_list_len else: vmin = self.vmin_list for i, score_name in enumerate(sca_params.cell_score_lists): sc.pl.umap(adata, color=score_name, save=''.join( ['_', score_name, '_cellType_score.png']), show=False, edges=False, color_map=my_feature_cmap, size=size, vmin=vmin[i], vmax=vmax[i]) sc.pl.umap(adata, color=score_name, save=''.join( ['_', score_name, '_cellType_score_0min.png']), show=False, edges=False, color_map=my_feature_cmap, size=size, vmin=0, vmax=vmax[i]) sc.pl.violin(adata, sca_params.cell_score_lists, groupby='sampleName', jitter=0.4, save='_cell_scores.png', show=False, multi_panel=True, rotation=90) if sca_params.analysis_params.dpt: sc.pl.diffmap(adata, color=[ 'dpt_pseudotime', sca_params.analysis_params.clustering_choice ], size=self.size, show=False, save=''.join( [sca_params.analysis_params.dpt[0], '.png'])) sc.pl.umap(adata, color='dpt_pseudotime', size=self.size, show=False, save=''.join([ '_', 'dpt', '_', sca_params.analysis_params.dpt[0], '.png' ])) # sc.pl.dpt_groups_pseudotime(adata, color_map=my_feature_cmap, # save=''.join([sca_params.analysis_params.dpt[0],sca_params.analysis_params.dpt[1],'.png'])) # sc.pl.dpt_timeseries(adata, color_map=my_feature_cmap, show=False, # save=''.join([sca_params.analysis_params.dpt[0],sca_params.analysis_params.dpt[1],'.png'])) # ## Violin plot for comparing gene expression among different groups/clusters # # Create observation field labeled using binary information # # Will have option integrated in pipeline in the future # adata.obs['CDH5_exp'] = ['CDH5+' if (cell!=0) else 'CDH5-' for cell in adata.raw[:,'CDH5'].X] # # Built in scanpy module # sc.pl.violin(adata, genes_to_plot+['CDH5'], 
groupby='CDH5_exp', jitter=True, # save='_feature.png', show=False, scale='width',use_raw=True) #order = ['CDH5+','CDH5-'], # Custom violin plot module -- Not complete/in testing df = pd.DataFrame() # Add Gaussian y-jitter to better visualize zero expression in violin plots for gene in genes_to_plot: sigma = np.amax(adata.raw[:, gene].X) / 40 gene_df = [ cell if (cell != 0) else np.random.normal(loc=0, scale=sigma) for cell in adata.raw[:, gene].X ] df[gene] = gene_df # df['CDH5_exp']=adata.obs['CDH5_exp'].values # vplot, axes = plt.subplots(math.ceil(len(genes_to_plot)/4),4, figsize=(18,12)) # plt.rcParams.update({'font.size':12}) # plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9, wspace=0.4, hspace=0.4) # for i,gene in enumerate(genes_to_plot): # sns.violinplot(x='CDH5_exp', y=gene, data=df, inner=None, scale='width', ax=axes[math.floor(i/4),i%4]) # sns.stripplot(x='CDH5_exp', y=gene, data=df, jitter = True, color='black', size=0.4, ax=axes[math.floor(i/4),i%4]) # vplot.savefig(''.join([figdir,'/violin_feature_jitter.png'])) ## Scatter plots to identify clusters that are high in number of genes, UMI counts, and mito transcript fraction adata.obs['jitter'] = np.random.rand(len(adata.obs_names)) * 10 sc.pl.scatter(adata, x='jitter', y='n_genes', color=sca_params.analysis_params.clustering_choice, save='_n_genes.png', palette=colors, show=False) sc.pl.scatter(adata, x='jitter', y='n_counts', color=sca_params.analysis_params.clustering_choice, save='_n_counts.png', palette=colors, show=False) sc.pl.scatter(adata, x='jitter', y='percent_mito', color=sca_params.analysis_params.clustering_choice, save='_percent_mito.png', palette=colors, show=False) sc.pl.umap(adata, color=['n_genes', 'n_counts', 'percent_mito'], color_map=my_feature_cmap, save='_counts_check.png', show=False) # Set the thresholds and scaling factors for drawing the paga map/plot node_size_scale = 1.25 node_size_power = 0.9 edge_width_scale = 1 min_edge_width = 0.035 max_edge_width = 2 threshold = 0.08 sc.pl.paga(adata, layout='fr', threshold=threshold, node_size_scale=node_size_scale, node_size_power=node_size_power, edge_width_scale=edge_width_scale, min_edge_width=min_edge_width, max_edge_width=max_edge_width, show=False, save='_pagaPlot.png', title='PAGA: Fruchterman Reingold', frameon=False) return adata
def plot_sca(adata, sca_dict, adata_preFiltering=None, figdir='./figures', annotation_dict=None, summary_dict=None, adata_postPCA=None, final_quality=False): ''' See the Scanpy visualization library for examples ''' print("Plotting") ## Create my custom palette for FeaturePlots and define a matlplotlib colormap object feature_colors = [(35, 35, 142), (255, 127, 0)] blue_orange_cmap = make_cmap(feature_colors, bit=True) feature_colors = [(210, 210, 210), (210, 210, 210), (245, 245, 200), (100, 200, 225), (0, 45, 125)] position = [0, 0.019999, 0.02, 0.55, 1] my_feature_cmap = make_cmap(feature_colors, bit=True, position=position) gray_cmap = make_cmap([(220, 220, 220), (220, 220, 220)], bit=True) ## Custom color palette for cluster plots and observation plots colors = [(0.3, 0.3, 0.3), (1, 0, 0), (0, 1, 0), (0, 0, 0.9), (1, 0, 1), (0, 1, 1), (0.9, 0.9, 0), (0.85, 0.5, 0.5), (0.5, 0.85, 0.5), (0.5, 0.5, 0.85), (0.15, 0.5, 0.5), (0.5, 0.15, 0.5), (0.5, 0.5, 0.15), (1, 0.5, 0), (0, 0.5, 1), (0.5, 1, 0), (0.5, 0, 1), (1, 0, 0.5), (0, 1, 0.5)] ## General figure parameters and settings sc.set_figure_params(dpi_save=300, dpi=300) #,vector_friendly=False) sc.settings.figdir = figdir sc.set_figure_params(fontsize=12) size = sca_dict['plot_params']['size'] # Check to see if user wants publication quality figures if final_quality: rcParams['figure.figsize'] = 4, 4 rcParams['savefig.dpi'] = 1200 file_type = '.pdf' else: file_type = '.png' summary_dict.update(final_cell_count=len(adata.obs_names), final_gene_count=len(adata.var_names)) # ## Write a summary of the analysis to a text file including sample information and parameters write_summary(adata, sca_dict, annotation_dict, summary_dict) ## Violin plots for filtering parameters pre and post sc.pl.violin(adata_preFiltering, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save='_preFiltering_plot.png', show=False) sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save='_postFiltering_plot.png', show=False) ## Draw the PCA elbow plot to determine which PCs to use sc.pl.pca_variance_ratio(adata_postPCA, log=True, n_pcs=100, save='_elbowPlot.png', show=False) ## Ranks and displays most contributing genes for each principal component components = 4 loadings_components = range( sca_dict['analysis_params']['n_pcs'] - components, sca_dict['analysis_params']['n_pcs'] + components + 1) sc.pl.pca_loadings(adata_postPCA, components=loadings_components, save='_rank_genes.png', show=False) ## Plot results of UMAP dimensional reduction and clustering for observation in sca_dict['plot_params']['umap_obs']: legend = 'on data' if (observation == 'louvain') else 'right margin' sc.pl.umap(adata, color=observation, save=''.join(['_', observation, file_type]), show=False, legend_loc=legend, edges=False, size=size, palette=colors, alpha=0.75) ## Find marker genes via Wilxocon test based on Louvain cluster assignment # Create a simple plot to show the top 25 most significant markers for each cluster # Write most significant markers to a csv file # adata.obs['is_adult'] = ['Adult' if cell=='ND15989_Fresh_WT_Lung_Adult' else 'Fetal' for cell in adata.obs['sampleName']] # rank_grouping = 'age' # rank_genes(adata,rank_grouping,figdir=figdir)#,clusters2_compare=['1','4']) # sc.pl.rank_genes_groups_heatmap(adata, n_genes=100, use_raw=True, show=False, # save=''.join(['_rank_heatmap_',rank_grouping,file_type]), cmap=my_feature_cmap) # sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, use_raw=True, show=False, # 
save=''.join(['_rank_dotplot_',rank_grouping,file_type]), color_map=my_feature_cmap) # sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5, use_raw=True, # show=False, save=''.join(['_rank_violin_',rank_grouping,file_type])) rank_grouping = 'louvain' n_genes_rank = 25 rank_genes(adata, rank_grouping, figdir=figdir) #,clusters2_compare=['1','4']) sc.pl.rank_genes_groups_heatmap( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_heatmap_', rank_grouping, file_type]), cmap=my_feature_cmap) sc.pl.rank_genes_groups_dotplot( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_dotplot_', rank_grouping, file_type]), color_map=my_feature_cmap) sc.pl.rank_genes_groups_stacked_violin( adata, n_genes=n_genes_rank, use_raw=True, show=False, save=''.join(['_rank_violin_', rank_grouping, file_type])) ## Feature plots and dot plot analysis for each specified set of genes #sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markerPlots.png', show=False) if sca_dict['gene_lists']: missing_genes = [] for gene_list in sca_dict['gene_lists']: gene_dict = sca_dict[gene_list] genes_to_plot = [] [genes_to_plot, missing_genes] = self.__find_genes(adata, gene_dict['markers'], missing_genes=missing_genes) ## Do FeaturePlots for select genes print('Plotting standard marker genes: ', genes_to_plot, '\n') sc.pl.umap(adata, color=genes_to_plot, save=''.join(['_featureplots_', gene_list, file_type]), show=False, cmap=my_feature_cmap, size=size, use_raw=True) if gene_dict['positions'] and gene_dict['groups']: group_positions = gene_dict[ 'positions'] # Manually set and determined group_labels = gene_dict['groups'] else: group_positions = None group_labels = None if len(gene_dict['markers']) != 1: for grouping in sca_dict['plot_params']['exp_grouping']: ## Dotplot analysis # Circle color corresponds to expression level, and circle size corresponds to percentage of cells expressing gene ## Reordering categories for dotplot or heatmap rows #adata_temp = adata.copy() #adata_temp.obs['louvain'] = adata.obs['louvain'].cat.reorder_categories(['3','5','0','4','2','1'],inplace = False) sc.pl.dotplot(adata, genes_to_plot, groupby=grouping, var_group_positions=group_positions, var_group_labels=group_labels, save=''.join([ '_markers_', gene_list, '_', grouping, file_type ]), show=False, color_map=my_feature_cmap, use_raw=True, dot_max=0.4) ## Heatmaps # Each horizontal line represents expression of one cell sc.pl.heatmap(adata, genes_to_plot, groupby=grouping, var_group_positions=group_positions, var_group_labels=group_labels, save=''.join([ '_markers_', gene_list, '_', grouping, file_type ]), show=False, cmap=my_feature_cmap, use_raw=True) # Genes that are not expressed or are invariable are plotted using a grayscale print('Plotting empty genes: ', missing_genes, '\n') sc.pl.umap(adata, color=missing_genes, save=''.join(['_featureplots_gray', file_type]), show=False, cmap=gray_cmap, size=size, use_raw=True) ## tSNE Plots # sc.pl.tsne(adata, color='louvain', save = '_clusterIdentity.png', show = False, # legend_loc = 'right margin', edges = False, size = 20, # palette = colors, alpha = 0.75) # sc.pl.tsne(adata, color='sampleName', save = '_sample.png', show = False, # legend_loc = 'right margin', edges = False, size = 20, # palette = colors, alpha = 0.75) # sc.pl.tsne(adata, color=genes_to_plot, save = '_featureplots.png', show = False, cmap = my_feature_cmap, size = 25, use_raw = True) # sc.pl.tsne(adata, color=missing_genes, save='_featureplots_gray.png', show=False, 
cmap=gray_cmap, size=20, use_raw=True) # ## Violin plot for comparing gene expression among different groups/clusters # # Create observation field labeled using binary information # # Will have option integrated in pipeline in the future # adata.obs['CDH5_exp'] = ['CDH5+' if (cell!=0) else 'CDH5-' for cell in adata.raw[:,'CDH5'].X] # # Built in scanpy module # sc.pl.violin(adata, genes_to_plot+['CDH5'], groupby='CDH5_exp', jitter=True, # save='_feature.png', show=False, scale='width',use_raw=True) #order = ['CDH5+','CDH5-'], # Custom violin plot module -- Not complete/in testing df = pd.DataFrame() # Add Gaussian y-jitter to better visualize zero expression in violin plots for gene in genes_to_plot: sigma = np.amax(adata.raw[:, gene].X) / 40 gene_df = [ cell if (cell != 0) else np.random.normal(loc=0, scale=sigma) for cell in adata.raw[:, gene].X ] df[gene] = gene_df # df['CDH5_exp']=adata.obs['CDH5_exp'].values # vplot, axes = plt.subplots(math.ceil(len(genes_to_plot)/4),4, figsize=(18,12)) # plt.rcParams.update({'font.size':12}) # plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9, wspace=0.4, hspace=0.4) # for i,gene in enumerate(genes_to_plot): # sns.violinplot(x='CDH5_exp', y=gene, data=df, inner=None, scale='width', ax=axes[math.floor(i/4),i%4]) # sns.stripplot(x='CDH5_exp', y=gene, data=df, jitter = True, color='black', size=0.4, ax=axes[math.floor(i/4),i%4]) # vplot.savefig(''.join([figdir,'/violin_feature_jitter.png'])) ## Scatter plots to identify clusters that are high in adata.obs['jitter'] = np.random.rand(len(adata.obs_names)) * 10 sc.pl.scatter(adata, x='jitter', y='n_genes', color='louvain', save='_n_genes.png', palette=colors, show=False) sc.pl.scatter(adata, x='jitter', y='n_counts', color='louvain', save='_n_counts.png', palette=colors, show=False) sc.pl.scatter(adata, x='jitter', y='percent_mito', color='louvain', save='_percent_mito.png', palette=colors, show=False) # Set the thresholds and scaling factors for drawing the paga map/plot node_size_scale = 1.25 node_size_power = 0.9 edge_width_scale = 1 min_edge_width = 0.035 max_edge_width = 2 threshold = 0.08 # Draw the actual plot sc.pl.paga(adata, layout='fr', threshold=threshold, node_size_scale=node_size_scale, node_size_power=node_size_power, edge_width_scale=edge_width_scale, min_edge_width=min_edge_width, max_edge_width=max_edge_width, show=False, save='_pagaPlot.png', title='PAGA: Fruchterman Reingold', frameon=False) print("\nAll done!\n") return adata
Spyder Editor

This is a temporary script file.
"""
%reset

import numpy as np
import pandas as pd
import os
import scanpy as sc
import seaborn as sns
from plotnine import *

path = '/Users/kj22643/Documents/Documents/231_Classifier_Project/data'
#path = '/stor/scratch/Brock/231_10X_data/'
os.chdir(path)

sc.settings.figdir = 'EB_plots'
sc.set_figure_params(dpi_save=300)
sc.settings.verbosity = 3

#%%
# read in 10X data to create anndata object called "adata"
adata = sc.read_10x_mtx('agg_all/outs/filtered_feature_bc_matrix', cache=True).copy()

#%%
# add lineage and sample info for each cell
df = pd.read_csv('lineage_analysis/10x_231.lineage_assignment.clustered_lineage.singletons_only.tsv', sep='\t')
df = df.rename(columns={'clustered_lineage': 'lineage'})
df.set_index('proper_cell_name', inplace=True)
adata.obs = adata.obs.join(df)

dic = {'1': 'Doxneg', '2': 'Doxpos', '3': '107Aziz', '4': '113Aziz'}
adata.obs['sample'] = adata.obs.index.str[-1].map(dic)
adata.obs['sample'] = adata.obs['sample'].astype('category').cat.reorder_categories(['Doxneg', 'Doxpos', '107Aziz', '113Aziz'])
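#%%
# Quick sanity check on the joins above (a hypothetical follow-up, not in the
# original script): confirm the sample mapping and count how many cells carry
# a lineage assignment.
print(adata.obs['sample'].value_counts())
print(adata.obs['lineage'].notna().sum(), 'cells with a lineage assignment')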
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import silhouette_score
from peppy import Project
# from ngs_toolkit.analysis import Analysis
import scanpy as sc
# from dca.utils import plot_mean_dropout
from natsort import natsorted as sorted

# imports required by the statements below
import random
import string
import matplotlib
import matplotlib.pyplot as plt

sc.set_figure_params(format="svg", dpi_save=300, vector_friendly=True)
sns.set_style("white")
plt.rcParams['svg.fonttype'] = 'none'

# random seed
SEED = int("".join(LabelEncoder().fit(list(string.ascii_uppercase)).transform(
    list("BOCKLAB")).astype(str)))
random.seed(SEED)
np.random.seed(SEED)

# Set settings
pd.set_option("date_dayfirst", True)
sns.set(context="paper", style="white", palette="pastel", color_codes=True)
sns.set_palette(sns.color_palette("colorblind"))
matplotlib.rcParams["svg.fonttype"] = "none"
matplotlib.rc('text', usetex=False)
def setFigureOpt(opt):
    sc.set_figure_params(dpi_save=int(opt['dpi']),
                         fontsize=float(opt['fontsize']),
                         vector_friendly=(opt['vectorFriendly'] == 'Yes'),
                         transparent=(opt['transparent'] == 'Yes'),
                         color_map=opt['colorMap'])
    rcParams.update({'savefig.format': opt['img']})
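# Example options dictionary for the setter above (keys inferred from the
# function body; the values are placeholders):
opt = {
    'dpi': '300',
    'fontsize': '12',
    'vectorFriendly': 'Yes',
    'transparent': 'No',
    'colorMap': 'viridis',
    'img': 'pdf',
}
setFigureOpt(opt)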
    )
except (ValueError, IndexError):
    # This tends to fail with a batch key... if it does re-do it without a batch key.
    sc.pp.highly_variable_genes(
        adata_scran, flavor="cell_ranger", n_top_genes=5000, batch_key=None
    )

sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver="arpack")
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata)

markers = {
    "CD8A",
    "CD4",
    "FOXP3",
    "phase",
    "leiden",
    "sample",
    "origin",
    "tissue",
    "condition",
} & (set(adata.var_names) | set(adata.obs.columns))

sc.set_figure_params(figsize=(5, 5))
sc.pl.umap(adata, color=markers, ncols=3)

adata.write_h5ad(output_adata, compression="lzf")
import numpy as np
import scanpy as sc
import torch
import anndata
import matplotlib.pyplot as plt
from typing import Union

from scarches.dataset.trvae._utils import label_encoder
from scarches.metrics.metrics import entropy_batch_mixing, knn_purity, asw, nmi
from scarches.models import trVAE, TRVAE
from scarches.trainers import trVAETrainer

sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)
np.set_printoptions(precision=2, edgeitems=7)


class TRVAE_EVAL:
    def __init__(
            self,
            model: Union[trVAE, TRVAE],
            adata: anndata.AnnData,
            trainer: trVAETrainer = None,
            condition_key: str = None,
            cell_type_key: str = None
    ):
        if type(model) is TRVAE:
            trainer = model.trainer
            model = model.model
from python_scripts.spatial_correlation import helper_functions

# Plotting packages
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# System specific
import os

# Calculation packages
import scanpy as sc
import numpy as np

# Figure params
sc.set_figure_params(color_map='viridis')
fig_size, title_fontsize, axis_label_fontsize, legend_fontsize, fileformat, img_key, xy_ticks, text_fontsize = \
    helper_functions.figure_params()


def plot_standalone_colorbar(tissuecomb_colors, labels, save_folder):
    """Plot a standalone vertical and horizontal Colorbar

    Parameters
    ----------
    tissuecomb_colors : matplotlib.colors.ListedColormap
    labels : list of str
    save_folder : str

    Returns
    -------
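# `helper_functions.figure_params()` is project-specific and not shown here. Its
# return value is unpacked into eight plotting constants above; a stand-in with
# guessed, purely illustrative values could look like:
def figure_params():
    fig_size = (8, 8)
    title_fontsize = 18
    axis_label_fontsize = 14
    legend_fontsize = 12
    fileformat = '.pdf'
    img_key = 'hires'
    xy_ticks = 10
    text_fontsize = 12
    return (fig_size, title_fontsize, axis_label_fontsize, legend_fontsize,
            fileformat, img_key, xy_ticks, text_fontsize)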