def test_results_sparse():
    """Compare t-test and Wilcoxon rankings on sparse example data with the reference values."""
    seed(1234)

    adata = get_example_data(sparse=True)

    (
        true_names_t_test,
        true_names_wilcoxon,
        true_scores_t_test,
        true_scores_wilcoxon,
    ) = get_true_scores()
    group_names = true_scores_t_test.dtype.names

    # t-test: the complete score and name tables have to agree.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='t-test')
    result = adata.uns['rank_genes_groups']
    # Cast the stored names to the reference dtype so array_equal compares like with like.
    result['names'] = result['names'].astype(true_names_t_test.dtype)
    for name in group_names:
        assert np.allclose(true_scores_t_test[name], result['scores'][name])
    assert np.array_equal(true_names_t_test, result['names'])

    # Wilcoxon: only the first seven entries per group are compared.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='wilcoxon')
    result = adata.uns['rank_genes_groups']
    result['names'] = result['names'].astype(true_names_wilcoxon.dtype)
    for name in group_names:
        assert np.allclose(true_scores_wilcoxon[name][:7],
                           result['scores'][name][:7])
    assert np.array_equal(true_names_wilcoxon[:7], result['names'][:7])
def test_wilcoxon_symmetry():
    """Swapping group and reference must give the same absolute Wilcoxon statistics."""
    pbmc = pbmc68k_reduced()

    def rank_against(reference):
        # Same two-group comparison each time; only the reference changes.
        rank_genes_groups(
            pbmc,
            groupby="bulk_labels",
            groups=["CD14+ Monocyte", "Dendritic"],
            reference=reference,
            method='wilcoxon',
            rankby_abs=True,
        )

    def stats_of(group):
        # Numeric columns only: the gene names are dropped before conversion.
        table = rank_genes_groups_df(pbmc, group=group)
        return table.drop(columns="names").to_numpy()

    rank_against("Dendritic")
    assert pbmc.uns["rank_genes_groups"]["params"]["use_raw"] is True
    stats_mono = stats_of("CD14+ Monocyte")

    rank_against("CD14+ Monocyte")
    stats_dend = stats_of("Dendritic")

    assert np.allclose(np.abs(stats_mono), np.abs(stats_dend))
def test_emptycat():
    """A category without any cells must make rank_genes_groups raise ValueError."""
    empty_category = '11'

    pbmc = pbmc68k_reduced()
    louvain = pbmc.obs['louvain']
    # Register the category without assigning any observation to it.
    pbmc.obs['louvain'] = louvain.cat.add_categories([empty_category])

    expected_message = rf"Could not calculate statistics.*{empty_category}"
    with pytest.raises(ValueError, match=expected_message):
        rank_genes_groups(pbmc, groupby='louvain')
def get_de(adata,
           mygroup,
           demethod='wilcoxon',
           topnr=5000,
           logfc=1,
           padj=0.05):
    """ Get a table of significant DE genes at certain cutoffs

    Based on an AnnData object and an annotation category (e.g. louvain) runs
    scanpy's rank_genes_groups using a specified method with specified cutoffs
    (nr. genes, logfc, padj) and returns the filtered results per group.

    rank_genes_groups is run with use_raw=True on all genes of adata.raw, so
    adata.raw has to be set. The 'names', 'scores', 'logfoldchanges' and
    'pvals_adj' entries of adata.uns['rank_genes_groups'] are read afterwards.

    parameters
    ----------
    adata: `AnnData`
        AnnData object containing the data; adata.uns['rank_genes_groups'] is
        overwritten by this call
    mygroup: `str`
        group for performing de, needs be in adata.obs
    demethod: `str`
        one of 't-test', 'wilcoxon', 't-test_overestim_var', 'logreg'
    topnr: `int`
        the number of top genes in the DE analysis
    logfc: `float`
        log fold-change cutoff; genes with Log2FC >= logfc are kept
    padj: `float`
        adjusted p-value cutoff; genes with P.adj <= padj are kept

    returns
    -------
    delist
        a dict mapping every group label to a pandas DataFrame (columns
        'Name', 'Score', 'Log2FC', 'P.adj') of the genes passing both cutoffs.
        If mygroup is not a column of adata.obs, the available columns are
        printed and None is returned.
    """
    try:
        labels = adata.obs[mygroup]
    except KeyError:
        print(
            "Oops! The adata object does not have the specified column. Options are: "
        )
        print(list(adata.obs.columns))
        return None

    # Collect the group labels before ranking, as the original flow did.
    mygroups = sorted(set(labels))

    rank_genes_groups(adata,
                      groupby=mygroup,
                      use_raw=True,
                      n_genes=adata.raw.X.shape[1],
                      method=demethod)

    # Build each per-statistic table once; the loop below only selects columns.
    ranked = adata.uns['rank_genes_groups']
    fields = ['names', 'scores', 'logfoldchanges', 'pvals_adj']
    tables = [DataFrame(ranked[field]).head(topnr) for field in fields]

    delist = {}
    for group in mygroups:
        d = concat([table[group] for table in tables], axis=1)
        d.columns = ['Name', 'Score', 'Log2FC', 'P.adj']
        delist[group] = d[(d['Log2FC'] >= logfc) & (d['P.adj'] <= padj)]
    return delist
def test_results_layers():
    """Ranking on a layer must match the reference; ranking on the perturbed X must not."""
    seed(1234)

    adata = get_example_data(sparse=False)
    # Keep the untouched matrix as a layer, then randomly zero out entries of X.
    adata.layers["to_test"] = adata.X.copy()
    adata.X = adata.X * np.random.randint(0, 2, adata.shape, dtype=bool)

    _, _, true_scores_t_test, true_scores_wilcoxon = get_true_scores()
    group_names = true_scores_t_test.dtype.names

    def top_scores_match(expected):
        # One flag per group: do the first seven scores agree with the reference?
        computed = adata.uns['rank_genes_groups']['scores']
        return [
            np.allclose(expected[name][:7], computed[name][:7])
            for name in group_names
        ]

    # Wilcoxon
    rank_genes_groups(
        adata,
        'true_groups',
        method='wilcoxon',
        layer="to_test",
        n_genes=20,
    )
    assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False
    assert all(top_scores_match(true_scores_wilcoxon))

    rank_genes_groups(adata, 'true_groups', method='wilcoxon', n_genes=20)
    assert not any(top_scores_match(true_scores_wilcoxon))

    # t-test
    rank_genes_groups(
        adata,
        'true_groups',
        method='t-test',
        layer="to_test",
        use_raw=False,
        n_genes=20,
    )
    assert all(top_scores_match(true_scores_t_test))

    rank_genes_groups(adata, 'true_groups', method='t-test', n_genes=20)
    assert not any(top_scores_match(true_scores_t_test))
def test_results_dense():
    """Compare t-test and Wilcoxon rankings on dense example data with the reference values."""
    seed(1234)

    adata = get_example_data()
    assert adata.raw is None  # Assumption for later checks

    reference = get_true_scores()
    true_names_t_test, true_names_wilcoxon = reference[0], reference[1]
    true_scores_t_test, true_scores_wilcoxon = reference[2], reference[3]
    group_names = true_scores_t_test.dtype.names

    # t-test: the complete score and name tables have to agree.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='t-test')
    ranked = adata.uns['rank_genes_groups']
    ranked['names'] = ranked['names'].astype(true_names_t_test.dtype)
    for name in group_names:
        assert np.allclose(true_scores_t_test[name], ranked['scores'][name])
    assert np.array_equal(true_names_t_test, ranked['names'])
    assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False

    # Wilcoxon: only the first seven entries per group are compared.
    rank_genes_groups(adata, 'true_groups', n_genes=20, method='wilcoxon')
    ranked = adata.uns['rank_genes_groups']
    ranked['names'] = ranked['names'].astype(true_names_wilcoxon.dtype)
    for name in group_names:
        expected_head = true_scores_wilcoxon[name][:7]
        computed_head = ranked['scores'][name][:7]
        assert np.allclose(expected_head, computed_head)
    assert np.array_equal(true_names_wilcoxon[:7], ranked['names'][:7])
    assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False
def test_filter_rank_genes_groups():
    """Filtered gene names must equal the stored expectations for three ranking setups."""
    adata = pbmc68k_reduced()

    # fix filter defaults
    filter_kwargs = dict(
        adata=adata,
        key_added='rank_genes_groups_filtered',
        min_in_group_fraction=0.25,
        min_fold_change=1,
        max_out_group_fraction=0.5,
    )

    # (extra ranking keywords, expected filtered names), run in this order.
    cases = [
        (dict(reference='Dendritic'), names_reference),
        (dict(), names_no_reference),
        (dict(pts=True), names_no_reference),
    ]

    for extra_kwargs, expected_names in cases:
        rank_genes_groups(adata,
                          'bulk_labels',
                          method='wilcoxon',
                          n_genes=5,
                          **extra_kwargs)
        filter_rank_genes_groups(**filter_kwargs)

        filtered = adata.uns['rank_genes_groups_filtered']['names']
        assert np.array_equal(expected_names, np.array(filtered.tolist()))
def celltype_labeling(adata,
                      labeling_author,
                      results_folder,
                      labeling_to_use='celltype',
                      labeling_name='celltype',
                      labeling_description='manual celltype annotation',
                      cluster_method='louvain'):
    """ Export an additional labeling (besides the clustering) together with its marker genes.

    Runs rank_genes_groups (method 'wilcoxon', use_raw=True, all genes of
    adata.raw) grouped by the chosen labeling, then exports the labeling, a
    labeling info file and the rank file, logging the time spent on each step.

    parameters
    ----------
    adata: `AnnData`
        AnnData object from which the labeling is to be exported; adata.raw
        has to be set
    labeling_author: `str`
        author of the labeling, written to the labeling info file
    results_folder: `str`
        base path of the results folder; output goes to
        results_folder/labelings/<labeling_name>
    labeling_to_use: `str` | default = 'celltype'
        column in adata.obs holding the labeling (also used to rank the genes)
    labeling_name: `str` | default = 'celltype'
        name under which the labeling is exported
    labeling_description: `str` | default = 'manual celltype annotation'
        description written to the labeling info file
    cluster_method: `str` | default = 'louvain'
        passed to labeling_info as `annotated_version_of`

    returns
    -------
    None
        writes out several files to folder results_folder/labelings/<labeling_name>
    """
    # calculate marker genes for labeling
    marker_start = time()
    n_raw_genes = adata.raw.X.shape[1]
    rank_genes_groups(adata,
                      labeling_to_use,
                      method='wilcoxon',
                      use_raw=True,
                      n_genes=n_raw_genes)
    print('rank genes per label calculated using method wilcoxon.')
    logging.info(
        'Marker gene detection performed on a per-label basis using the method wilcoxon.'
    )
    marker_seconds = round(time() - marker_start, 3)
    logging.info('\tTime for marker gene detection: ' + str(marker_seconds) +
                 's')

    # export labeling
    outpath = os.path.join(results_folder, 'labelings', labeling_name)
    export_start = time()
    labeling(adata, column=labeling_to_use, outpath=outpath)

    # generate labelinfo.tsv file
    labeling_info(outpath=outpath,
                  description=labeling_description,
                  public=False,
                  default=False,
                  expert=True,
                  reference=False,
                  method=labeling_author,
                  annotated_version_of=cluster_method)

    export_rank(adata,
                basepath=results_folder,
                type='wilcox',
                labeling_name=labeling_name)

    logging.info('Label level analysis and marker genes exported to file.')
    export_seconds = round(time() - export_start, 3)
    logging.info('\tTime for export of cluster level analysis: ' +
                 str(export_seconds) + 's')

    return None
def clustering(adata, results_folder, myres=1, method='leiden'):
    """ Perform adata clustering and write the corresponding results

    Clusters with the chosen algorithm, saves a UMAP plot coloured by cluster,
    ranks marker genes per cluster (method 'wilcoxon', use_raw=True, all genes
    of adata.raw) and exports the clustering and the rank file.

    parameters
    ----------
    adata: `AnnData`
        AnnData object that is to be exported; adata.raw has to be set
    results_folder: `str`
        path to the results folder
    myres: `int`
        resolution for the algorithm
    method: `str`
        clustering algorithm. Implemented: louvain/leiden

    returns
    -------
    adata
        the AnnData object that was passed in; results are also written to file

    raises
    ------
    ValueError
        if method is neither 'leiden' nor 'louvain'
    """
    if method not in ('leiden', 'louvain'):
        raise ValueError("method argument should be leiden or louvain")

    random_state = 0
    start = time()
    if method == 'louvain':
        sc_louvain(adata, resolution=myres, random_state=random_state)
    else:
        sc_leiden(adata, resolution=myres, random_state=random_state)
    print(method + ' clustering performed with a resolution of ' + str(myres))

    # The cluster labels are read from the obs column named after the method.
    n_clusters = len(set(adata.obs[method]))

    sc_pl_umap(adata,
               color=[method],
               legend_loc='on data',
               save='.' + method + '.png')

    # The space before 'clustering' was missing ("leidenclustering done.").
    logging.info(method + ' clustering done. Found ' + str(n_clusters) +
                 ' clusters.')
    logging.info('\tTime for ' + method + ' clustering: ' +
                 str(round(time() - start, 3)) + 's')

    # detect marker genes
    start = time()
    rank_genes_groups(adata,
                      method,
                      method='wilcoxon',
                      use_raw=True,
                      n_genes=adata.raw.X.shape[1])
    print('rank genes per cluster calculated using method wilcoxon.')
    logging.info(
        'Marker gene detection performed on a per-cluster basis using the method wilcoxon.'
    )
    logging.info('\tTime for marker gene detection: ' +
                 str(round(time() - start, 3)) + 's')

    # export clustering to file
    start = time()
    export_clustering(adata, basepath=results_folder, method=method)
    export_rank(adata,
                basepath=results_folder,
                type='wilcox',
                labeling_name=method)
    logging.info('Cluster level analysis and marker genes exported to file.')
    logging.info('\tTime for export of cluster level analysis: ' +
                 str(round(time() - start, 3)) + 's')

    return adata
def perform_dge(adata,
                design_matrix,
                differentiating_criteria,
                constant_criteria,
                basepath,
                min_cells_per_group=30,
                method='wilcoxon'):
    """Perform differential gene expression between two conditions over many adata subsets.

    This function automatically generates top_tables and rank_files for a list of comparisons
    in a dataset. The comparison you wish to perform need to be identified in a so called design
    matrix (see below).

    This function is capable of handling comparisons where you wish to compare two conditions in
    a subset of the dataset, e.g. treatment vs control in the celltype CD4 T-cell. The conditions
    must be annotated in a column adata.obs, this represents the differentiating_criteria. This
    column may only have two different labels! The subsets in which this comparison should be made
    must be annotated in another column represented by 'constant_criteria'. This column may have
    as many labels as you wish.

    Design Matrix:
    the design matrix consists of a pandas.DataFrame with two columns. Each row represents one
    comparison that is to be made. The first column labeled 'Group1', contains a tuple identifying
    the first group for that comparison and the second column labeled 'Group2' contains a tuple
    identifying the second group for the comparison. The tuple has the form
    (differentiating_criteria, constant_criteria).

    >>> #example of a Design Matrix
    >>> celltypes = ['CD4 T-cell', 'CD8 T-cell', 'B-cell', 'myeloid cell']
    >>> design_matrix = pd.DataFrame({'Group1':[('PBMC', celltype) for celltype in celltypes], 'Group2':[('Skin', celltype) for celltype in celltypes]})

    For every comparison with enough cells two tab-separated files are written to basepath:
    "<prefix>Rank_<contrast>.rnk" and "<prefix>TopTable_<contrast>.txt", where <prefix> is
    'Wilx', 'Overest', 'tTest' or 'logreg' depending on method.

    parameters
    ----------
    adata: `AnnData`
        AnnData object containing the data; its raw data is fetched with get_raw
    design_matrix: `pandas.DataFrame`
        pandas.DataFrame containing all the comparisons that are to be made.
    differentiating_criteria: `str`
        column in adata.obs holding the two conditions that are compared
    constant_criteria: `str`
        column in adata.obs holding the labels used to subset the data
    basepath: `str`
        directory the output files are written to (created if it does not exist)
    min_cells_per_group: `int` | default = 30
        a comparison is only performed if both groups contain more than this many cells
    method: `str`
        one of 't-test', 'wilcoxon', 't-test_overestim_var', 'logreg'

    returns
    -------
    None
        writes the rank files and top tables to basepath; comparisons that were skipped
        because of too few cells are printed and displayed at the end
    """
    # File-name prefix used for each supported method.
    file_prefixes = {
        'wilcoxon': 'Wilx',
        't-test_overestim_var': 'Overest',
        't-test': 'tTest',
        'logreg': 'logreg',
    }
    # Tiny positive stand-in for exact zeros in the top table, so the log10 of
    # a zero p-value stays finite. Note it replaces every 0 in the table.
    smallest_num = 1e-308

    #get raw data
    adata_raw = get_raw(adata=adata)

    too_few_cells = []
    comparisons = design_matrix.values

    for i in range(design_matrix.shape[0]):

        #get group1 and group2
        group1 = comparisons[i, 0]
        group2 = comparisons[i, 1]

        contrast = str(group1[1] + '_' + group1[0] + '_vs_' + group2[1] +
                       '_' + group2[0])
        contrast = contrast.replace(' ', '_')
        contrast = contrast.replace('-', '_')
        contrast = contrast.replace('+', '')

        #perform sanitychecks (only warns; the comparison is still attempted)
        if group1[1] != group2[1]:
            print('please ensure design matrix is correct!')

        #get adata_subset
        adata_subset = subset_adata(
            adata_raw,
            filter_criteria=adata.obs.get(constant_criteria) == group1[1],
            raw=False)
        adata_subset.var = adata_raw.var

        #ensure that you have enough cells in each group
        counts = adata_subset.obs.get(differentiating_criteria).value_counts()
        n_group1 = counts.get(group1[0])
        n_group2 = counts.get(group2[0])
        enough_cells = (n_group1 is not None and n_group2 is not None
                        and n_group1 > min_cells_per_group
                        and n_group2 > min_cells_per_group)
        if not enough_cells:
            print('----------------------------------------')
            print('not enough cells for comparison', contrast,
                  ' in one of the groups')
            too_few_cells.append(i)
            continue

        #calculate rank_genes
        print('----------------------------------------')
        print('performing comparison', contrast)

        rank_genes_groups(adata=adata_subset,
                          groupby=differentiating_criteria,
                          reference=group2[0],
                          groups=[group1[0]],
                          method=method,
                          n_genes=adata_subset.n_vars)
        scores, pvalues, logFC, FDRs = extract_info_rank_genes_groups(
            adata_subset)

        #calculate mean expression of each gene in the data
        #(X.todense() is called, so X is expected to be a sparse matrix)
        group_labels = adata_subset.obs.get(differentiating_criteria)
        mean_all = adata_subset.X.todense().mean(axis=0).tolist()[0]
        mean_group1 = adata_subset[group_labels == group1[0]].X.todense(
        ).mean(axis=0).tolist()[0]
        mean_group2 = adata_subset[group_labels == group2[0]].X.todense(
        ).mean(axis=0).tolist()[0]

        #get mean expression
        log2cp10k = DataFrame(data={
            'log2cp10k': mean_all,
            'log2cp10k_' + group1[0]: mean_group1,
            'log2cp10k_' + group2[0]: mean_group2
        },
                              index=adata_subset.var.ENSEMBL)

        #generate top_table for group1
        top_table = DataFrame(data={
            'Contrast': contrast,
            'SYMBOL': scores.Description,
            'ENSEMBL': scores.index.tolist(),
            'Score': scores.get(group1[0])
        },
                              index=scores.index)

        #add p-value, logFC, FDRs, mean_expression
        for statistic, column in ((pvalues, 'P-value'), (logFC, 'logFC'),
                                  (FDRs, 'FDR')):
            renamed = statistic.get(group1[0]).to_frame().rename(
                columns={group1[0]: column})
            top_table = top_table.merge(renamed,
                                        how='left',
                                        right_index=True,
                                        left_index=True)
        top_table = top_table.merge(log2cp10k,
                                    how='left',
                                    right_index=True,
                                    left_index=True)

        #order according to score
        top_table.sort_values(ascending=False,
                              axis=0,
                              by='Score',
                              inplace=True)

        #replace all 0 values with a very small positive number
        top_table.replace(0, smallest_num, inplace=True)

        #generate rank files
        #explicit copy: the in-place edits below must not act on a view of top_table
        rank_file = top_table[['SYMBOL', 'P-value', 'logFC']].copy()
        rank_file['value'] = abs(log10(rank_file['P-value'])) * sign(
            rank_file['logFC'])
        rank_file.drop(columns=['P-value', 'logFC'], inplace=True)
        rank_file.sort_values(ascending=False,
                              axis=0,
                              by='value',
                              inplace=True)

        #replace all inf values with very small or very large values
        rank_file.replace(inf, 1e100, inplace=True)
        rank_file.replace(-inf, -1e100, inplace=True)

        outpath = os.path.join(basepath)
        os.makedirs(outpath, exist_ok=True)

        prefix = file_prefixes.get(method)
        if prefix is None:
            sys.exit(
                'need to specify type as one of \'wilcoxon\' or \'t-test_overestim_var\' or \'t-test\' or \'logreg\''
            )
        rank_File = os.path.join(outpath,
                                 prefix + "Rank_" + contrast + ".rnk")
        TopTable_File = os.path.join(
            outpath, prefix + "TopTable_" + contrast + ".txt")

        #write out rankfile
        rank_file.to_csv(rank_File,
                         sep='\t',
                         index=False,
                         header=False,
                         float_format='%.2f')
        print(rank_File, 'written out')

        #write out TopTable
        top_table.to_csv(TopTable_File,
                         sep='\t',
                         index=True,
                         header=True,
                         float_format='%1.3e')
        print(TopTable_File, 'written out')

    #report the comparisons that could not be performed
    if too_few_cells:
        print('')
        print('')
        print('----------------------------------------------')
        print(
            'the following comparisons from the design_matrix could not per performed due to too few cells'
        )
        display(design_matrix.iloc[too_few_cells, :])

    return None