def test_solver():
    """Compare the exact solver against the approximate solver."""
    exact_op = magic.MAGIC(t="auto", decay=20, knn=10, solver="exact", verbose=False, random_state=42)
    data_imputed_exact = exact_op.fit_transform(scdata_norm)
    # the exact solver stores the full gene matrix
    assert exact_op.X_magic.shape[1] == scdata_norm.shape[1]
    # imputed expression must be non-negative
    assert np.all(data_imputed_exact >= 0)

    approx_op = magic.MAGIC(
        t="auto",
        decay=20,
        knn=10,
        n_pca=150,
        solver="approximate",
        verbose=False,
        random_state=42,
    )
    data_imputed_apprx = approx_op.fit_transform(scdata_norm)
    # the approximate solver only stores the PCA-reduced matrix
    assert approx_op.X_magic.shape[1] == 150
    # the two solvers should agree approximately ...
    np.testing.assert_allclose(data_imputed_apprx, data_imputed_exact, atol=0.15)
    # ... but not be bit-for-bit identical
    assert np.any(data_imputed_apprx != data_imputed_exact)
def test_dremi():
    """knnDREMI on a fitted operator reproduces a known score."""
    op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    # DREMI needs numerical precision, so pin the random state before fitting
    op.set_params(random_state=42)
    op.fit(scdata_norm)
    dremi_score = op.knnDREMI("VIM", "ZEB1", plot=True)
    np.testing.assert_allclose(dremi_score, 1.466004, atol=0.0000005)
def main():
    """Impute an H5 expression matrix with MAGIC and write a new H5 file.

    The input path is the first positional argument; the output path comes
    from ``-o/--out_file``. All datasets other than ``expression`` are
    copied through unchanged.
    """
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_file", help="File to write output H5 file")
    (options, args) = parser.parse_args()
    dataset_f = args[0]
    out_f = options.out_file

    with h5py.File(dataset_f, 'r') as in_f:
        print('Loading expression matrix from {}...'.format(dataset_f))
        X = in_f['expression'][:]
        print('done.')
        print('Running MAGIC...')
        magic_operator = magic.MAGIC()
        magic_X = magic_operator.fit_transform(X)
        print('done.')
        print('Writing results to {}...'.format(out_f))
        # Fix: use a distinct name for the output handle; the original
        # re-bound `out_f` inside the `with`, shadowing the path variable.
        with h5py.File(out_f, 'w') as out_h5:
            out_h5.create_dataset('expression', data=magic_X, compression="gzip")
            # Copy other datasets to new H5 file
            for k in in_f.keys():
                if k != 'expression':
                    out_h5.create_dataset(k, data=in_f[k][:])
def test_all_genes():
    """Gene-index imputation matches a transform on the fitted operator."""
    op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False, random_state=42)
    last_two = op.fit_transform(scdata_norm, genes=[-2, -1])
    imputed_all = op.fit_transform(scdata_norm, genes="all_genes")
    # imputing all genes preserves the input shape
    assert scdata_norm.shape == imputed_all.shape
    # a plain transform on the fitted operator should agree closely
    last_two_again = op.transform(scdata_norm, genes=[-2, -1])
    np.testing.assert_allclose(last_two, last_two_again, rtol=0.015)
def test_genes_str_int():
    """Selecting genes by name or by index yields identical output."""
    op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    by_name = op.fit_transform(scdata_norm, genes=["VIM", "ZEB1"])
    # reuse the graph from the first fit so both runs share a kernel
    by_index = op.fit_transform(scdata_norm, graph=op.graph, genes=[-2, -1])
    assert by_name.shape[0] == scdata_norm.shape[0]
    np.testing.assert_array_equal(by_name, by_index)
def magic_process(matrix):
    """Impute `matrix` with MAGIC and report the remaining zero fraction."""
    op = magic.MAGIC(knn=10)
    imputed = op.fit_transform(matrix, genes="all_genes")
    # fraction of entries that are still exactly zero after smoothing
    zero_fraction = sum(imputed[imputed == 0].count(axis=1)) / sum(imputed.count())
    print("after MAGIC:", imputed.shape, zero_fraction)
    print(imputed.head())
    return imputed, op
def magic_impute(adata, knn=5, t=2, verbose=0, **kwargs):
    """Smooth spliced/unspliced layers with MAGIC into `Ms`/`Mu`."""
    logg.info(
        "To be used carefully. Magic has not yet been tested for this application."
    )
    import magic

    operator = magic.MAGIC(verbose=verbose, knn=knn, t=t, **kwargs)
    # fit on the spliced counts, then reuse the same operator for unspliced
    adata.layers["Ms"] = operator.fit_transform(adata.layers["spliced"])
    adata.layers["Mu"] = operator.transform(adata.layers["unspliced"])
def main(data_path, n_rows):
    """Load `n_rows` rows from `data_path`, impute with MAGIC, save as .npy.

    The output filename is derived from the sanitized data path and row count.
    """
    data = Data(data_path, n_rows)
    data.load_data()
    magic_operator = magic.MAGIC()
    X_magic = magic_operator.fit_transform(data.dataframe)
    # Fix: removed the unused `filename` template variable present in the
    # original; only `output_file` is ever used.
    output_file = 'magic_data_from_{}_{}_rows.npy'.format(
        data_path.replace('.', '').replace('/', ''), n_rows)
    np.save(output_file, X_magic)
    print('data saved in', output_file)
def test_anndata():
    """MAGIC on an AnnData input preserves obs/var names."""
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv("../data/test_data.csv")
    # Fix: `a=` and `k=` are deprecated/removed MAGIC constructor arguments;
    # `decay=` and `knn=` are the current equivalents (a=None -> decay=None).
    fast_magic_operator = magic.MAGIC(t='auto', decay=None, knn=10)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes="all_genes")
    assert np.all(sc_magic.var_names == scdata.var_names)
    assert np.all(sc_magic.obs_names == scdata.obs_names)
    sc_magic = fast_magic_operator.fit_transform(scdata, genes=['VIM', 'ZEB1'])
    assert np.all(sc_magic.var_names.values == np.array(['VIM', 'ZEB1']))
    assert np.all(sc_magic.obs_names == scdata.obs_names)
def run_MAGIC(train_adata, test_adata):
    """MAGIC-impute train and test AnnData objects, then z-score each."""
    import magic

    def _impute_and_scale(adata):
        # Densify sparse matrices before handing them to MAGIC.
        if scipy.sparse.issparse(adata.X):
            dense = adata.X.toarray()
        else:
            dense = adata.X
        # each dataset gets its own operator, exactly as before
        adata.X = magic.MAGIC().fit_transform(dense, genes='all_genes')
        ## standardize the input
        sc.pp.scale(adata, zero_center=True, max_value=6)

    _impute_and_scale(train_adata)
    _impute_and_scale(test_adata)
    return train_adata, test_adata
def MAGIC(data):
    """Adaptor method to call MAGIC to impute.

    For this manuscript, we used default parameters to call MAGIC
    developed by David van Dijk, et al., 2017.

    Parameter:
    ---------
    data: data frame, data to be imputed

    Return:
    ------
    Imputed gene expression as data frame.
    """
    # verbose=0 silences MAGIC's progress logging
    magic_operator = magic.MAGIC(verbose=0)
    return magic_operator.fit_transform(data)
def test_anndata():
    """AnnData round-trip keeps obs/var labels with the approximate solver."""
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    scdata = anndata.read_csv(data_path)
    op = magic.MAGIC(
        t="auto", solver="approximate", decay=None, knn=10, verbose=False
    )
    # imputing every gene preserves the AnnData axis labels
    imputed = op.fit_transform(scdata, genes="all_genes")
    assert np.all(imputed.var_names == scdata.var_names)
    assert np.all(imputed.obs_names == scdata.obs_names)
    # selecting two genes keeps only those var names
    imputed = op.fit_transform(scdata, genes=["VIM", "ZEB1"])
    assert np.all(imputed.var_names.values == np.array(["VIM", "ZEB1"]))
    assert np.all(imputed.obs_names == scdata.obs_names)
def test_scdata():
    """End-to-end smoke test: load, normalize, and impute in several modes."""
    raw = pd.read_csv("../data/test_data.csv")
    normed = magic.preprocessing.library_size_normalize(raw)
    # normalization does not change the matrix shape
    assert raw.shape == normed.shape
    op = magic.MAGIC(t='auto', a=20, k=10)
    # name-based and index-based gene selection must agree exactly
    by_name = op.fit_transform(normed, genes=['VIM', 'ZEB1'])
    by_index = op.fit_transform(normed, genes=[-2, -1])
    assert by_name.shape[0] == normed.shape[0]
    assert np.all(by_name == by_index)
    # "pca_only" returns smoothed principal components
    pca_out = op.fit_transform(normed, genes="pca_only")
    assert pca_out.shape[0] == normed.shape[0]
    assert pca_out.shape[1] == op.n_pca
    # "all_genes" returns the full smoothed matrix
    all_out = op.fit_transform(normed, genes="all_genes")
    assert normed.shape == all_out.shape
def impute_magic_expression(expression_matrix, meta_data, **kwargs):
    """
    Use MAGIC (van Dijk et al Cell, 2018, 10.1016/j.cell.2018.05.061) to impute data
    :param expression_matrix: pd.DataFrame
    :param meta_data: pd.DataFrame
    :return imputed, meta_data: pd.DataFrame, pd.DataFrame
    """
    kwargs, random_seed, output_file = process_impute_args(**kwargs)

    import magic

    utils.Debug.vprint('Imputing data with MAGIC ... ')
    magic_op = magic.MAGIC(random_state=random_seed, **kwargs)
    smoothed = magic_op.fit_transform(expression_matrix.values)
    # re-wrap the raw array with the original index/columns
    imputed = pd.DataFrame(smoothed,
                           index=expression_matrix.index,
                           columns=expression_matrix.columns)

    if output_file is not None:
        imputed.to_csv(output_file, sep="\t")

    return imputed, meta_data
def test_scdata():
    """Full pipeline test: scprep preprocessing, MAGIC modes, and knnDREMI."""
    raw = scprep.io.load_csv("../data/test_data.csv")
    raw = scprep.filter.remove_empty_cells(raw)
    raw = scprep.filter.remove_empty_genes(raw)
    normed = scprep.normalize.library_size_normalize(raw)
    normed = scprep.transform.sqrt(normed)
    # filtering happened before normalization, so shapes still match
    assert raw.shape == normed.shape
    np.random.seed(42)
    op = magic.MAGIC(t='auto', a=20, k=10)
    # gene selection by name and by index must agree exactly
    by_name = op.fit_transform(normed, genes=['VIM', 'ZEB1'])
    by_index = op.fit_transform(normed, genes=[-2, -1])
    assert by_name.shape[0] == normed.shape[0]
    assert np.all(by_name == by_index)
    # "pca_only" returns smoothed principal components
    pca_out = op.fit_transform(normed, genes="pca_only")
    assert pca_out.shape[0] == normed.shape[0]
    assert pca_out.shape[1] == op.n_pca
    # "all_genes" preserves the input shape
    all_out = op.fit_transform(normed, genes="all_genes")
    assert normed.shape == all_out.shape
    # DREMI score is reproducible to high precision
    dremi = op.knnDREMI("VIM", "ZEB1", plot=True)
    np.testing.assert_allclose(dremi, 1.5687165, atol=0.0000005)
def main(count_table, out_file):
    """Normalize a count table, then plot MAGIC's optimal-t search curve."""
    print("started main")
    data = scprep.io.load_csv(count_table, cell_axis='column', delimiter='\t')
    print("loaded csv")
    # normalize with our method
    gene_totals = data.apply(sum, 0)
    cell_totals = data.apply(sum, 1)
    # keep cells with >= 200 counts (the gene cutoff of 0 keeps every gene)
    filtered = data.loc[cell_totals >= 200, gene_totals >= 0]
    # per-cell totals, floored at 1 to avoid division by zero
    per_cell = filtered.apply(lambda c: max(1, sum(c)), 1)
    normalized = filtered.div(per_cell, axis=0) * 1000
    print(normalized.apply(sum, 0).head())
    print("normalized.")
    fig, ax = plt.subplots()
    # plot_optimal_t draws the Procrustes-disparity curve onto `ax`
    magic.MAGIC().fit_transform(normalized, plot_optimal_t=True, ax=ax)
    plt.savefig(out_file)
print('... full PHATE in {:.2f}-min'.format((time.time() - start)/60)) if True : # MELD adata.obs['res_sca1']=[1 if i=='SCA1' else -1 for i in adata.obs['genotype']] adata.obs['ees_sca1']=meld.MELD().fit_transform(G=G,RES=adata.obs['res_sca1']) adata.obs['ees_sca1']=adata.obs['ees_sca1']-adata.obs['ees_sca1'].mean() # mean center if True : # save adata obj with batch correction adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad')) print('\n... saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S')) if True : # MAGIC magic_op=magic.MAGIC().fit(X=adata.X,graph=G) # running fit_transform produces wrong shape adata.layers['imputed_bbknn']=magic_op.transform(adata.X,genes='all_genes') # adata.layers['imputed_bbknn']=sparse.csr_matrix(magic_op.transform(adata.X,genes='all_genes')) # causes memory spike if True : # save adata obj with batch correction & imputation adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad')) print('\n... saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S')) print('Pre-processing dataset took {:.2f}-min'.format((time.time() - total)/60)) elif False : # save data objects start=time.time() adata.write(os.path.join(pdfp,'mouse_MT_bbknn.h5ad')) print('saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
# Ensure the target directory exists; tolerate an already-existing one but
# re-raise any other OS error (e.g. permissions).
# NOTE(review): `dire_name` is defined elsewhere in this file — confirm.
try:
    os.makedirs(dire_name)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise e

args = parse_args()
# `t` is either the literal string "auto" or an integer diffusion time.
if args.t != "auto":
    args.t = int(args.t)
print("run with these parametres: %s" % str(args))

# Main Part
# Input CSV appears to be genes x cells; MAGIC expects cells x genes,
# hence the transpose before fitting and the transpose back before saving.
X = pd.read_csv(args.input, index_col=0)
X = X.transpose()
magic_operator = magic.MAGIC(k=args.k, a=args.a, t=args.t, n_pca=args.n_pca,
                             knn_dist=args.knn_dist, n_jobs=args.n_jobs)
X_magic = magic_operator.fit_transform(X, genes="all_genes")
X_magic = X_magic.transpose()
make_sure_dir_exists(args.outputdir)
X_magic.to_csv(os.path.join(args.outputdir, "magic_output.csv"))
import magic
import pandas as pd
import matplotlib.pyplot as plt

# Load the example dataset (cells x genes) and impute every gene with
# default MAGIC parameters.
X = pd.read_csv('/home/rohit/Desktop/MAGIC-master/data/test_data.csv')
magic_operator = magic.MAGIC()
X_magic = magic_operator.fit_transform(X, genes='all_genes')
# Optional visualizations, kept for reference:
# plt.scatter(X_magic['VIM'], X_magic['CDH1'], c=X_magic['ZEB1'], s=1, cmap='inferno')
# plt.show()
# magic.plot.animate_magic(X, gene_x='VIM', gene_y='CDH1', gene_color='ZEB1', operator=magic_operator)
X_magic.to_csv('~/Desktop/exampleOutput.csv', index=False)
def main(args):
    """Train and evaluate the scFEA flux model on a gene-expression matrix.

    Loads expression, module-gene membership, and the stoichiometry matrix;
    optionally MAGIC-imputes the expression; builds per-module masked
    expression blocks; trains the FLUX network; then writes per-cell flux
    and balance CSVs plus loss/time artifacts.
    """
    # set arguments
    data_path = args.data_dir
    input_path = args.input_dir
    res_dir = args.res_dir
    test_file = args.test_file
    moduleGene_file = args.moduleGene_file
    cm_file = args.stoichiometry_matrix
    sc_imputation = args.sc_imputation

    # choose cpu or gpu automatically
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # read data (input CSV is genes x cells; transpose to cells x genes)
    print("Starting load data...")
    geneExpr = pd.read_csv(input_path + '/' + test_file, index_col=0)
    geneExpr = geneExpr.T
    geneExpr = geneExpr * 1.0
    if sc_imputation == True:
        # optional MAGIC imputation; warnings from MAGIC are suppressed
        magic_operator = magic.MAGIC()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            geneExpr = magic_operator.fit_transform(geneExpr)
    # heuristic: large values suggest raw counts, so log2-transform
    if geneExpr.max().max() > 50:
        geneExpr = (geneExpr + 1).apply(np.log2)
    # per-cell totals scaled by the mean total -> per-cell scale factors
    geneExprSum = geneExpr.sum(axis=1)
    stand = geneExprSum.mean()
    geneExprScale = geneExprSum / stand
    geneExprScale = torch.FloatTensor(geneExprScale.values).to(device)
    BATCH_SIZE = geneExpr.shape[0]

    moduleGene = pd.read_csv(data_path + '/' + moduleGene_file,
                             sep=',',
                             index_col=0)
    # number of (non-NaN) genes per module
    moduleLen = [
        moduleGene.iloc[i, :].notna().sum() for i in range(moduleGene.shape[0])
    ]
    moduleLen = np.array(moduleLen)

    # stoichiometry matrix: compounds x modules
    cmMat = pd.read_csv(data_path + '/' + cm_file, sep=',', header=None)
    cmMat = cmMat.values
    cmMat = torch.FloatTensor(cmMat).to(device)
    print("Load data done.")

    print("Starting process data...")
    emptyNode = []
    gene_names = geneExpr.columns
    cell_names = geneExpr.index.astype(str)
    n_modules = moduleGene.shape[0]
    n_genes = len(gene_names)
    n_cells = len(cell_names)
    n_comps = cmMat.shape[0]
    # Build one expression block per module: a copy of the full matrix with
    # all genes outside the module zeroed, stacked row-wise.
    geneExprDf = pd.DataFrame(columns=['Module_Gene'] + list(cell_names))
    for i in range(n_modules):
        genes = moduleGene.iloc[i, :].values.astype(str)
        genes = [g for g in genes if g != 'nan']
        if not genes:
            # module has no genes; remember it and skip
            emptyNode.append(i)
            continue
        temp = geneExpr.copy()
        temp.loc[:, [g for g in gene_names if g not in genes]] = 0
        temp = temp.T
        temp['Module_Gene'] = ['%02d_%s' % (i, g) for g in gene_names]
        # NOTE(review): DataFrame.append was removed in pandas 2.0 —
        # migrate to pd.concat when upgrading pandas.
        geneExprDf = geneExprDf.append(temp, ignore_index=True, sort=False)
    geneExprDf.index = geneExprDf['Module_Gene']
    geneExprDf.drop('Module_Gene', axis='columns', inplace=True)
    X = geneExprDf.values.T
    X = torch.FloatTensor(X).to(device)

    # prepare data for constraint of module variation based on gene
    df = geneExprDf
    df.index = [i.split('_')[0] for i in df.index]
    df.index = df.index.astype(
        int
    )  # mush change type to ensure correct order, T column name order change!
    # module_scale = df.groupby(df.index).sum(axis=1).T  # pandas version update
    module_scale = df.groupby(df.index).sum().T
    # mean expression per module (sum divided by module gene count)
    module_scale = torch.FloatTensor(module_scale.values / moduleLen)
    print("Process data done.")

    # =========================================================================
    # NN
    torch.manual_seed(16)
    net = FLUX(X, n_modules, f_in=n_genes, f_out=1).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=LEARN_RATE)

    # Dataloader: full-batch training (batch_size == n_cells)
    dataloader_params = {
        'batch_size': BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0,
        'pin_memory': False
    }
    dataSet = MyDataset(X, geneExprScale, module_scale)
    train_loader = torch.utils.data.DataLoader(dataset=dataSet,
                                               **dataloader_params)
    # =========================================================================

    # =========================================================================
    print("Starting train neural network...")
    start = time.time()
    # training: track total loss and its four components per epoch
    loss_v = []
    loss_v1 = []
    loss_v2 = []
    loss_v3 = []
    loss_v4 = []
    net.train()
    timestr = time.strftime("%Y%m%d-%H%M%S")
    lossName = "./output/lossValue_" + timestr + ".txt"
    file_loss = open(lossName, "a")
    for epoch in tqdm(range(EPOCH)):
        loss, loss1, loss2, loss3, loss4 = 0, 0, 0, 0, 0
        # NOTE(review): the loop variable X shadows the full data tensor X
        # defined above; harmless here because X is not reused afterwards
        # except through the already-built datasets.
        for i, (X, X_scale, m_scale) in enumerate(train_loader):
            X_batch = Variable(X.float().to(device))
            X_scale_batch = Variable(X_scale.float().to(device))
            m_scale_batch = Variable(m_scale.float().to(device))

            out_m_batch, out_c_batch = net(X_batch, n_modules, n_genes,
                                           n_comps, cmMat)
            loss_batch, loss1_batch, loss2_batch, loss3_batch, loss4_batch = myLoss(
                out_m_batch,
                out_c_batch,
                lamb1=LAMB_BA,
                lamb2=LAMB_NG,
                lamb3=LAMB_CELL,
                lamb4=LAMB_MOD,
                geneScale=X_scale_batch,
                moduleScale=m_scale_batch)

            optimizer.zero_grad()
            loss_batch.backward()
            optimizer.step()

            loss += loss_batch.cpu().data.numpy()
            loss1 += loss1_batch.cpu().data.numpy()
            loss2 += loss2_batch.cpu().data.numpy()
            loss3 += loss3_batch.cpu().data.numpy()
            loss4 += loss4_batch.cpu().data.numpy()
        # print('epoch: %02d, loss1: %.8f, loss2: %.8f, loss3: %.8f, loss4: %.8f, loss: %.8f' % (epoch+1, loss1, loss2, loss3, loss4, loss))
        file_loss.write(
            'epoch: %02d, loss1: %.8f, loss2: %.8f, loss3: %.8f, loss4: %.8f, loss: %.8f. \n'
            % (epoch + 1, loss1, loss2, loss3, loss4, loss))

        loss_v.append(loss)
        loss_v1.append(loss1)
        loss_v2.append(loss2)
        loss_v3.append(loss3)
        loss_v4.append(loss4)
    # =========================================================================

    end = time.time()
    print("Training time: ", end - start)
    file_loss.close()

    # loss curves: total and the four weighted components
    plt.plot(loss_v, '--')
    plt.plot(loss_v1)
    plt.plot(loss_v2)
    plt.plot(loss_v3)
    plt.plot(loss_v4)
    plt.legend(['total', 'balance', 'negative', 'cellVar', 'moduleVar'])
    imgName = './' + res_dir + '/loss_' + timestr + ".png"
    plt.savefig(imgName)

    timeName = './' + res_dir + '/time_' + timestr + ".txt"
    f = open(timeName, "a")
    runTimeStr = str(end - start)
    f.write(runTimeStr)
    f.close()

    # Dataloader for inference: one cell at a time
    dataloader_params = {
        'batch_size': 1,
        'shuffle': False,
        'num_workers': 0,
        'pin_memory': False
    }
    dataSet = MyDataset(X, geneExprScale, module_scale)
    test_loader = torch.utils.data.DataLoader(dataset=dataSet,
                                              **dataloader_params)

    # testing
    fluxStatuTest = np.zeros((n_cells, n_modules), dtype='f')  # float32
    balanceStatus = np.zeros((n_cells, n_comps), dtype='f')
    net.eval()
    for epoch in range(1):
        loss, loss1, loss2 = 0, 0, 0
        for i, (X, X_scale, _) in enumerate(test_loader):
            X_batch = Variable(X.float().to(device))
            out_m_batch, out_c_batch = net(X_batch, n_modules, n_genes,
                                           n_comps, cmMat)

            # save data
            # NOTE(review): .detach().numpy() without .cpu() will fail when
            # device is CUDA — confirm whether GPU inference is intended.
            fluxStatuTest[i, :] = out_m_batch.detach().numpy()
            balanceStatus[i, :] = out_c_batch.detach().numpy()

    # save to file: encode the key hyperparameters into the filename
    fileName = "./" + res_dir + "/module" + str(n_modules) + "_cell" + str(n_cells) + "_batch" + str(BATCH_SIZE) + \
               "_LR" + str(LEARN_RATE) + "_epoch" + str(EPOCH) + "_SCimpute_" + str(sc_imputation)[0] + \
               "_lambBal" + str(LAMB_BA) + "_lambSca" + str(LAMB_NG) + "_lambCellCor" + str(LAMB_CELL) + "_lambModCor_1e-2" + \
               '_' + timestr + ".csv"
    setF = pd.DataFrame(fluxStatuTest)
    setF.columns = moduleGene.index
    setF.index = geneExpr.index.tolist()
    setF.to_csv(fileName)

    setB = pd.DataFrame(balanceStatus)
    # NOTE(review): rename is not in-place and its result is discarded —
    # the 1-based column labels are never applied; confirm intent.
    setB.rename(columns=lambda x: x + 1)
    setB.index = setF.index
    balanceName = "./output/balance_" + timestr + ".csv"
    setB.to_csv(balanceName)

    print("scFEA job finished. Check result in the desired output folder.")
    return
'MIXL1 (ENSG00000185155)', 'MYCBP (ENSG00000214114)', 'NANOG (ENSG00000111704)', 'NES (ENSG00000132688)', 'NKX2-1 (ENSG00000136352)', 'NKX2-5 (ENSG00000183072)', 'NKX2-8 (ENSG00000136327)', 'NPAS1 (ENSG00000130751)', 'NR2F1-AS1 (ENSG00000237187)', 'OLIG1 (ENSG00000184221)', 'OLIG3 (ENSG00000177468)', 'ONECUT1 (ENSG00000169856)', 'ONECUT2 (ENSG00000119547)', 'OTX2 (ENSG00000165588)', 'PAX3 (ENSG00000135903)', 'PAX6 (ENSG00000007372)', 'PDGFRA (ENSG00000134853)', 'PECAM1 (ENSG00000261371)', 'POU5F1 (ENSG00000204531)', 'SATB1 (ENSG00000182568)', 'SIX2 (ENSG00000170577)', 'SIX3-AS1 (ENSG00000236502)', 'SIX6 (ENSG00000184302)', 'SOX13 (ENSG00000143842)', 'SOX10 (ENSG00000100146)', 'SOX15 (ENSG00000129194)', 'SOX17 (ENSG00000164736)', 'SOX9 (ENSG00000125398)', 'TTLL10 (ENSG00000162571)', 'TAL1 (ENSG00000162367)', 'TBX15 (ENSG00000092607)', 'TBX18 (ENSG00000112837)', 'TBX5 (ENSG00000089225)', 'TNNT2 (ENSG00000118194)', 'WT1 (ENSG00000184937)', 'ZBTB16 (ENSG00000109906)', 'ZIC2 (ENSG00000043355)', 'ZIC5 (ENSG00000139800)', 'ACTB (ENSG00000075624)', 'HAND1 (ENSG00000113196)'] import magic data_magic = magic.MAGIC().fit_transform(data, genes=full_marker_genes) data_phate = phate.PHATE().fit_transform(data) # alternative: umap.UMAP(), sklearn.manifold.TSNE() data_phate = pd.DataFrame(data_phate, index=data.index) plt.figure(figsize=(10,10)) scprep.plot.scatter2d(data_phate, c=metadata['sample'], figsize=(12,8), cmap="Spectral", ticks=False, label_prefix="PHATE") plt.savefig("phatedata.pdf") home = os.path.expanduser('./') file_path = os.path.join(home, 'EBT_counts.pkl.gz') if not os.path.exists(file_path): scprep.io.download.download_google_drive(id='1Xz0ONnRWp2MLC_R6r74MzNwaZ4DkQPcM', destination=os.path.dirname(file_path)) data = pd.read_pickle(file_path)
def run_magic_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # magic params
        t_magic='auto',
        genes=None,
        # output params
        output='magic.csv',
        validate=False):
    """Run MAGIC on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing gene names, list gives an array of gene names,
          `False` means no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing gene names, list gives an array of gene names, or
          `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing cell names, list gives an array of cell names,
          `False` means no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing cell names, list gives an array of cell names, or
          `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns.
        If cell_axis=='row', data is of shape [n_cells, n_genes].
        If cell_axis=='column', data is of shape [n_genes, n_cells].
        Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length',
        'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`, library size
        filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`, genes are
        not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    t_magic : int, optional, default: 'auto'
        power to which the diffusion operator is powered for MAGIC.
        This sets the level of diffusion. If 'auto', t is selected
        according to the Procrustes disparity of the diffused data
    genes : list or {"all_genes", "pca_only"}, optional (default: None)
        List of genes to return from MAGIC, either as integer indices or
        column names if input data is a pandas DataFrame. If "all_genes",
        the entire smoothed matrix is returned. If "pca_only", PCA on the
        smoothed data is returned. If None, the entire matrix is also
        returned, but a warning may be raised if the resultant matrix is
        very large.
    output : str, optional (default: 'magic.csv')
        Output CSV file to save smoothed data matrix
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run MAGIC
    # https://magic.readthedocs.io/
    magic_op = magic.MAGIC(knn=knn,
                           decay=decay,
                           t=t_magic,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose)
    magic_data = magic_op.fit_transform(data, genes=genes)

    # save as csv, transposing back if the input had cells on columns
    magic_data = pd.DataFrame(magic_data)
    if cell_axis in ['col', 'column']:
        magic_data = magic_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    magic_data.to_csv(output)
    # NOTE(review): the .format(output) below is a no-op on a string with
    # no placeholders; harmless but likely unintended.
    tasklogger.log_info("Complete.".format(output))
    if validate:
        # compare against the reference output published in magic-docker
        correct_magic_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/magic-docker/'
            'master/magic-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(magic_data),
                                    scprep.utils.toarray(correct_magic_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            # fall back to a near-exact comparison before failing
            np.testing.assert_allclose(scprep.utils.toarray(magic_data),
                                       scprep.utils.toarray(correct_magic_data),
                                       atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
'count': { 'input': snakemake.input['cmat'], 'output': snakemake.output['cmat'] }, 'tpm': { 'input': snakemake.input['tpm'], 'output': snakemake.output['tpm'] } } for key in data_holder.keys(): # read in data data = pd.read_csv(data_holder[key]['input'], index_col=0) # impute with magic magic_op = magic.MAGIC() imputed = magic_op.fit_transform(data.T, genes='all_genes') # write data imputed.to_csv(data_holder[key]['output']) if not CLUSTER: # plot non-imputed data orig_heatmap = sns.clustermap(data.T, z_score=1, cmap='Blues') plt.savefig( os.path.join(snakemake.params['plot_dir'], '{}_heatmap.png'.format(key))) plt.cla() # plot imputed data imputed_heatmap = sns.clustermap(imputed,
def test_pca_only():
    """genes='pca_only' returns smoothed principal components, not genes."""
    op = magic.MAGIC(t="auto", decay=20, knn=10, verbose=False)
    pca_out = op.fit_transform(scdata_norm, genes="pca_only")
    # one row per cell, one column per principal component
    assert pca_out.shape[0] == scdata_norm.shape[0]
    assert pca_out.shape[1] == op.n_pca
# Save 2D PHATE coordinates, then recompute and save a 3D embedding.
np.save("{}Phate2d.npy".format(data_name), phate_data[pth_idx])
ph.set_params(n_components=3)
phate3_data = ph.transform()
np.save("{}Phate3d.npy".format(data_name), phate3_data[pth_idx])

# Build a batch-aware (MNN) graph over the selected cells; theta controls
# the symmetrization of the between-batch kernel.
mnn_graph = graphtools.Graph(data.iloc[pth_idx],
                             sample_idx=sample_labels[pth_idx],
                             n_pca=100,
                             knn=5,
                             random_state=42,
                             decay=15,
                             kernel_symm='theta',
                             theta=0.99)
# Mirror the graph's kernel parameters in the MAGIC operator
# (k = knn - 1 matches graphtools' knn convention for MAGIC).
mg = magic.MAGIC(random_state=42,
                 a=mnn_graph.decay,
                 k=mnn_graph.knn - 1,
                 n_pca=mnn_graph.n_pca)
data_magic = mg.fit_transform(data, graph=mnn_graph)
_ = mg.fit_transform(data)
# reduce memory footprint before pickling the operator
del mg.graph.data
del mg.graph.data_nu
del mg.graph._kernel
del mg.graph._diff_op
del mg.graph.subgraphs
del mg.graph.sample_idx
with open('magic.pickle', 'wb') as handle:
    pickle.dump(mg, handle, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Sweep MAGIC hyperparameters with molecular cross-validation.

    Splits UMI counts into two halves, denoises one half with MAGIC over a
    grid of (n_pca, knn, t), and records reconstruction MSE against both
    halves. Results are pickled to the output directory.
    """
    parser = argparse.ArgumentParser()

    run_group = parser.add_argument_group("run", description="Per-run parameters")
    run_group.add_argument("--seed", type=int, required=True)
    run_group.add_argument("--data_split",
                           type=float,
                           default=0.9,
                           help="Split for self-supervision")
    run_group.add_argument("--n_trials",
                           type=int,
                           default=10,
                           help="Number of times to resample")
    run_group.add_argument("--median_scale", action="store_true")

    data_group = parser.add_argument_group(
        "data", description="Input and output parameters")
    data_group.add_argument("--dataset", type=pathlib.Path, required=True)
    data_group.add_argument("--output_dir", type=pathlib.Path, required=True)
    data_group.add_argument("--genes",
                            type=int,
                            nargs="+",
                            required=True,
                            help="Genes to smooth (indices)")

    model_group = parser.add_argument_group(
        "model",
        description=
        "Model parameters. [max] or [min, max] or [min, max, interval]",
    )
    model_group.add_argument(
        "--neighbors",
        type=int,
        nargs="+",
        default=(1, 11),
        metavar="K",
        help="Number of neighbors in kNN graph",
    )
    model_group.add_argument(
        "--components",
        type=int,
        nargs="+",
        default=(5, 51, 5),
        metavar="PC",
        help="Maximum number of components to compute",
    )
    model_group.add_argument(
        "--time",
        type=int,
        nargs="+",
        default=(1, 6),
        metavar="T",
        help="Number of time steps for diffusion",
    )

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())

    dataset_name = args.dataset.parent.name
    output_file = args.output_dir / f"{dataset_name}_magic_mse_{args.seed}.pickle"
    logger.info(f"writing output to {output_file}")

    # derive a numeric seed deterministically from the CLI seed
    seed = sum(map(ord, f"biohub_{args.seed}"))
    random_state = np.random.RandomState(seed)

    with open(args.dataset, "rb") as f:
        true_means, true_counts, umis = pickle.load(f)

    # each CLI range argument is splatted into np.arange
    k_range = np.arange(*args.neighbors)
    pc_range = np.arange(*args.components)
    t_range = np.arange(*args.time)

    rec_loss = dict()
    mcv_loss = dict()

    # run n_trials for self-supervised sweep
    for i in range(args.n_trials):
        # split molecules into a training half (X) and validation half (Y)
        umis_X, umis_Y = ut.split_molecules(umis, args.data_split, 0.0,
                                            random_state)

        if args.median_scale:
            # rescale both halves to the median library size
            median_count = np.median(umis.sum(axis=1))
            umis_X = umis_X / umis_X.sum(axis=1, keepdims=True) * median_count
            umis_Y = umis_Y / umis_Y.sum(axis=1, keepdims=True) * median_count
        else:
            # rescale Y so its expected depth matches X
            umis_Y = umis_Y * args.data_split / (1 - args.data_split)

        for n_pcs in pc_range:
            for k in k_range:
                for t in t_range:
                    magic_op = magic.MAGIC(n_pca=n_pcs, verbose=0)
                    magic_op.set_params(knn=k, t=t)
                    denoised = magic_op.fit_transform(umis_X, genes=args.genes)
                    # clamp negative smoothed values to zero
                    denoised = np.maximum(denoised, 0)

                    # reconstruction loss vs the half we trained on, and
                    # cross-validation loss vs the held-out half
                    rec_loss[i, n_pcs, k, t] = mean_squared_error(
                        denoised, umis_X[:, args.genes])
                    mcv_loss[i, n_pcs, k, t] = mean_squared_error(
                        denoised, umis_Y[:, args.genes])

    results = {
        "dataset": dataset_name,
        "method": "magic",
        "loss": "mse",
        "normalization": "sqrt",
        "param_range": [pc_range, k_range, t_range],
        "rec_loss": rec_loss,
        "mcv_loss": mcv_loss,
    }

    with open(output_file, "wb") as out:
        pickle.dump(results, out)
def magic_impute(self, data): import magic model = magic.MAGIC(n_jobs=self.ncores) imputed = model.fit_transform(data.values) return pd.DataFrame(imputed)
denoised_name = f"{outputDir}/{experiment_name}_denoised_{algorithm}.csv" print(denoised_name) if os.path.isfile(denoised_name): continue adata = sc.read(f"{inputDir}/{f}") adata = adata.transpose() adata.X = np.expm1(adata.X) sc.pp.sqrt(adata) n = find_pca_comp( adata, figName=f"{outputDir}/figures/{experiment_name}_variance.png", figTitle=f'{experiment_name} Explained Variance') magic_op = magic.MAGIC(t=6, n_pca=n) start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF) (mem_registered, adata_denoised) = memory_usage( (magic_op.fit_transform, (adata, ), { 'genes': 'all_genes' }), retval=True, max_usage=True, include_children=True) end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp() real = end_time - start_time systime = end_resources.ru_stime - start_resources.ru_stime usertime = end_resources.ru_utime - start_resources.ru_utime cpu_time = systime + usertime
def main(): usage = "" # TODO parser = OptionParser(usage=usage) #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat") parser.add_option("-o", "--out_file", help="File to write H5 dataset") (options, args) = parser.parse_args() dataset_f = args[0] resolution = float(args[1]) out_f = options.out_file with h5py.File(dataset_f, 'r') as in_f: print('Loading expression matrix from {}...'.format(dataset_f)) X = in_f['expression'][:] cell_ids = [ str(x)[2:-1] for x in in_f['experiment'][:] ] gene_ids = [ str(x)[2:-1] for x in in_f['gene_id'][:] ] ad = AnnData( X=X, obs=pd.DataFrame( data=cell_ids, columns=['cell'] ), var=gene_ids ) sc.pp.neighbors(ad) sc.tl.leiden(ad, resolution=resolution) new_X = None new_cell_ids = [] clusters = [] for clust in sorted(set(ad.obs['leiden'])): print('Processing cluster {}'.format(clust)) indices = [ int(x) for x in ad.obs.loc[ad.obs['leiden'] == clust].index ] X = ad.X[indices] print('Shape of cluster matrix: {}'.format(X.shape)) magic_operator = magic.MAGIC() magic_X = magic_operator.fit_transform(X) if new_X is None: new_X = magic_X else: new_X= np.concatenate([new_X, magic_X]) print('Current shape of final matrix: {}'.format(new_X.shape)) clusters += [clust for i in indices] new_cell_ids += list(np.array(cell_ids)[indices]) clusters = [ x.encode('utf-8') for x in clusters ] new_cell_ids = [ x.encode('utf-8') for x in new_cell_ids ] print('Writing results to {}...'.format(out_f)) with h5py.File(out_f, 'w') as out_f: out_f.create_dataset( 'expression', data=new_X, compression="gzip" ) out_f.create_dataset('cluster', data=clusters) out_f.create_dataset('experiment', data=new_cell_ids) # Copy other datasets to new H5 file for k in in_f.keys(): if k != 'expression' and k != 'experiment': out_f.create_dataset(k,data=in_f[k][:])
import magic import numpy as np import pandas as pd bmmsc_data = magic.io.load_csv('MATRIX.txt') libsize = bmmsc_data.sum(axis=1) bmmsc_data = magic.preprocessing.library_size_normalize(bmmsc_data) bmmsc_data = np.sqrt(bmmsc_data) bmmsc_data.head() magic_op = magic.MAGIC(t=4, k=5) bmmsc_magic = magic_op.fit_transform(bmmsc_data, genes='all_genes') bmmsc_magic.head() bmmsc_magic.to_csv('magic.csv', sep='\t')