def cell_cycle_correction(data, annoFile, annoDB='custom', regress_terms=None):
    """Regress cell-cycle related factors out of an expression matrix.

    A slalom factor-analysis model (Gaussian noise, 3 dense hidden factors,
    ``minGenes=10``) is fitted on ``data`` using the gene-set annotation in
    ``annoFile``; the selected annotated factors are then regressed out and
    the corrected values are written back into a copy of ``data``.

    Parameters
    ----------
    data
        Expression table. It is transposed before being handed to
        ``load_txt`` and later indexed by gene name on its columns, so it is
        presumably a cells x genes ``pandas.DataFrame`` -- TODO confirm
        against callers.
    annoFile
        Path to the annotation file; it must exist.
    annoDB
        Annotation database identifier forwarded to ``load_txt``.
    regress_terms
        Names of the annotated terms to regress out. Defaults to the four
        cell-cycle terms this function has always used.

    Returns
    -------
    A copy of ``data`` in which the columns of the annotated genes are
    replaced by the corrected values.

    Raises
    ------
    FileNotFoundError
        If ``annoFile`` does not exist.
    """
    # FileNotFoundError is a subclass of Exception, so callers that catch the
    # generic Exception raised previously keep working.
    if not os.path.exists(annoFile):
        raise FileNotFoundError("Annotation file not found")

    if regress_terms is None:
        regress_terms = ['M phase', 'Dna replication', 'Chromosome segregation',
                         'M phase of mitotic cell cycle']

    data_slalom = load_txt(df=data.T, annoFiles=annoFile, annoDBs=annoDB)
    print("Loaded {:d} cells, {:d} genes".format(data_slalom['Y'].shape[0], data_slalom['Y'].shape[1]))
    print("Annotation: {:d} terms".format(len(data_slalom['terms'])))

    # I: indicator matrix that assigns genes to pathways
    # (if loaded from the hdf file change to I = data['IMSigDB'])
    I = data_slalom['I']
    # Y: log expression values
    Y = data_slalom['Y']
    # terms: the names of the terms
    terms = data_slalom['terms']
    # gene_ids: the ids of the genes in Y
    gene_ids = data_slalom['genes']

    # Initialize the FA instance, here using a Gaussian noise model and
    # fitting 3 dense hidden factors.
    FA = slalom.initFA(Y, terms, I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=10)
    FA.train()
    FA.printDiagnostics()

    corrected_data = FA.regressOut(terms=list(regress_terms))

    # Only genes that belong to at least one annotated term are overwritten;
    # all other columns keep their original values.
    # NOTE(review): this mask is computed from the unfiltered indicator
    # matrix. If slalom drops genes after its own minGenes term filtering,
    # the number of columns in corrected_data may differ -- confirm.
    full_matrix = data.copy()
    annotated_genes = np.array(gene_ids)[np.sum(I, axis=1) != 0]
    full_matrix[annotated_genes] = corrected_data
    return full_matrix
def factor_analysis(norm_df, id_to_name, tsne, annoFile, annoDB):
    """Fit a slalom factor model and plot the first annotated factor.

    The columns of ``norm_df`` are relabelled from gene ids to gene names,
    the table is loaded together with the annotation, a slalom model
    (Gaussian noise, 3 dense hidden factors, ``minGenes=15``) is trained,
    and the factor of the first annotation term is plotted through
    ``plot_gene_expression``. The full factor matrix and its shape are
    printed at the end.

    Parameters
    ----------
    norm_df
        Expression table whose columns are gene ids; it is transposed before
        loading, so presumably cells x genes -- TODO confirm.
    id_to_name
        Mapping from each gene id in ``norm_df.columns`` to a gene name.
    tsne
        Object whose ``index`` is used to label the rows of the factor table
        and which is forwarded to ``plot_gene_expression``.
    annoFile
        Path to the annotation file; it must exist.
    annoDB
        Annotation database identifier forwarded to ``load_txt_2``.

    Returns
    -------
    None. Results are printed and shown as a plot.

    Raises
    ------
    FileNotFoundError
        If ``annoFile`` does not exist.
    KeyError
        If a column id of ``norm_df`` is missing from ``id_to_name`` (when
        ``id_to_name`` is a dict).
    """
    # FileNotFoundError is a subclass of Exception, so callers that catch the
    # generic Exception raised previously keep working.
    if not os.path.exists(annoFile):
        raise FileNotFoundError("Annotation file not found")

    # Work on a copy so the caller's frame keeps its original column labels.
    norm_df_copy = norm_df.copy()
    norm_df_copy.columns = np.array(
        [id_to_name[gene_id] for gene_id in norm_df_copy.columns.values])

    data = load_txt_2(data=norm_df_copy.T, annoFiles=annoFile, annoDBs=annoDB)
    print("Loaded {:d} cells, {:d} genes".format(data['Y'].shape[0], data['Y'].shape[1]))
    print("Annotation: {:d} terms".format(len(data['terms'])))

    # I: indicator matrix that assigns genes to pathways
    # (if loaded from the hdf file change to I = data['IMSigDB'])
    I = data['I']
    # Y: log expression values
    Y = data['Y']
    # terms: the names of the terms
    terms = data['terms']
    # gene_ids: the ids of the genes in Y
    gene_ids = data['genes']

    # Initialize the FA instance, here using a Gaussian noise model and
    # fitting 3 dense hidden factors.
    FA = slalom.initFA(Y, terms, I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=15)
    FA.train()
    FA.printDiagnostics()

    # Only the first annotation term is extracted and plotted.
    first_term = terms[0]
    print(first_term)
    X = FA.getX(terms=[first_term])
    df_factors = pd.DataFrame(X, columns=[first_term])
    df_factors.index = tsne.index
    factors = [first_term]

    plot_gene_expression(df_factors, tsne, factors)
    plt.show()

    # Analogous getters are implemented for relevance and weights (see the
    # slalom docs); here the complete factor matrix is dumped for inspection.
    all_factors = FA.getX()
    print(all_factors)
    print(all_factors.shape)
import time

import slalom as sl
import numpy as np
from anndata import read_h5ad

# Script: fit a slalom factor model on the Kang count data and store it.

print('reading')
ad = read_h5ad('kang_count.h5ad')

print('initializing')
# The timer covers both model initialisation and training.
t0 = time.time()
FA = sl.initFA(
    ad.X,                 # expression matrix
    ad.uns['terms'],      # annotation term names
    ad.varm['I'],         # gene-to-term indicator matrix
    list(ad.var_names),   # gene identifiers
    noise='gauss',
    nHidden=3,
    nHiddenSparse=0,
    minGenes=12,
    pruneGenes=True,
    do_preTrain=False,
)

print('training')
FA.train()
t1 = time.time()
# Elapsed wall-clock time in seconds.
print(t1 - t0)

sl.saveFA(FA, 'FA_kang_3_hidden.hdf5', saveF=True)
# Report how many annotation terms were loaded.
n_terms = len(data['terms'])
print ("Annotation: {:d} terms".format(n_terms))

# Pull the model inputs out of the loaded data:
#   I        - indicator matrix that assigns genes to pathways
#              (if loaded from the hdf file change to I = data['IMSigDB'])
#   Y        - log expression values
#   terms    - the names of the terms
#   gene_ids - the ids of the genes in Y; they are stored under 'sym_names',
#              'genes' is not the correct key for this data
I, Y, terms, gene_ids = (data[key] for key in ('I', 'Y', 'terms', 'sym_names'))

# Initialize the FA instance, here using a Gaussian noise model and fitting
# 3 dense hidden factors.
print('Initialising')
FA = slalom.initFA(
    Y, terms, I,
    gene_ids=gene_ids,
    noise='gauss',
    nHidden=3,
    minGenes=15,
)

# Model training
print('Training')
FA.train()

# Print diagnostics
FA.printDiagnostics()

# Plot results
fig = plotRelevance(FA)

# Get factors; analogous getters are implemented for relevance and weights
# (see docs). The term names must be spelled as below; 'G2m checkpoint' and
# 'P53 pathway' are not the correct names here.
terms_to_plot = ['G2M_CHECKPOINT', 'P53_PATHWAY']
X = FA.getX(terms=terms_to_plot)