def main(mode: Literal['train', 'evaluate'], fn_filter=lambda job: True):
  # expand all combinations of the configurations into a list of dictionaries
  jobs = product(*[[(k, i) for i in v] for k, v in configs.items()])
  jobs = [dict(j) for j in jobs]
  jobs = [j for j in jobs if fn_filter(j)]
  print(f'Start {mode} {len(jobs)} jobs ...')
  ######## training mode
  if mode == 'train':
    for _ in MPI(jobs, partial(run_task, evaluation=False),
                 ncpu=N_CPU, batch=1):
      pass
  ######## evaluation mode
  elif mode == 'evaluate':
    for j in jobs:
      if j['ds'] == 'celeba' and j['coef'].vmin == 0.1:
        run_task(j, evaluation=True)
  ######## others
  else:
    raise NotImplementedError(f'No support for mode="{mode}"')
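
# Usage sketch (illustration only): `configs`, `run_task`, and `N_CPU` are
# assumed to be defined earlier in this script; the filter below is a
# hypothetical example restricting the job grid to the celeba configurations.
if __name__ == '__main__':
  main('train', fn_filter=lambda job: job['ds'] == 'celeba')
  main('evaluate')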
  corr_count = sorted(corr_count.items(), key=lambda x: x[-1], reverse=True)
  return ds.X.shape[0], var_names, corr, corr_count


# ===========================================================================
# Extract the minimum correlation
# ===========================================================================
dsname = list(get_dataset_meta().keys())
correlation = defaultdict(float)
count = defaultdict(float)
occurrence = defaultdict(int)
total_cell = 0
for results in MPI(jobs=dsname, func=get_corr, ncpu=4, batch=1):
  if results is None:
    continue
  n, var_names, corr, corr_count = results
  total_cell += n
  ## occurrence
  for v in var_names:
    occurrence[v] += 1
  ## correlation
  for i, j in corr:
    if i[0] == i[1]:  # skip self-correlation
      continue
    correlation[i] += j
  ## count
  for i, j in corr_count:
    if i[0] == i[1]:  # skip self-correlation
      continue
    count[i] += j  # truncated in the original; mirrored from the loop above
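
## A minimal sketch of extracting the minimum correlation from the
## accumulated statistics; normalizing each pair's score by its `count`
## is an assumption about how the averages are meant to be taken.
avg_corr = {
    pair: correlation[pair] / max(count[pair], 1e-8) for pair in correlation
}
min_pairs = sorted(avg_corr.items(), key=lambda x: x[-1])[:10]
print('Pairs with minimum average correlation:', min_pairs)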
if __name__ == '__main__':
  config = ArgumentParser()
  config.add_argument('mode', type=int)
  config.add_argument('--overwrite', action='store_true')
  config.add_argument('-ncpu', type=int, default=1)
  config = config.parse_args()
  jobs = [Arguments(beta=b, gamma=1, zdim=ZDIM, finetune=True,
                    overwrite=config.overwrite) for b in BETA] + \
         [Arguments(beta=b, gamma=1, zdim=ZDIM, finetune=False,
                    overwrite=config.overwrite) for b in BETA]
  mode = config.mode
  # === 1. train
  if mode == 0:
    for r in MPI(jobs=jobs, func=train, ncpu=config.ncpu):
      pass
  # === 2. eval
  elif mode == 1:
    cache_path = get_cache_path()
    if os.path.exists(cache_path) and config.overwrite:
      os.remove(cache_path)
    if not os.path.exists(cache_path):
      df = []
      for r in MPI(jobs=jobs, func=evaluate, ncpu=config.ncpu):
        if r is not None:
          df.append(r)
      df = sorted(df, key=lambda x: x['beta'])
      df = pd.DataFrame(df)
      with open(cache_path, 'wb') as f:
        pickle.dump(df, f)
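
# Typical invocation (a sketch; the script name `train.py` is hypothetical):
#   python train.py 0 -ncpu 4              # train all (beta, finetune) jobs
#   python train.py 1 -ncpu 4 --overwrite  # re-evaluate, rebuild the cache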
parser.add_argument('-ncpu', type=int, default=1)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--no-anno', action='store_true')
# === 1. prepare
args = parser.parse_args()
ncpu = args.ncpu
if ncpu <= 0:
  ncpu = cpu_count() - 1
jobs = [
    Job(beta=b, gamma=g, zdim=z)
    for b, g, z in itertools.product(BETA, GAMMA, ZDIM)
]
OVERWRITE = args.overwrite
# === 2. training
if args.mode == 0:
  for _ in MPI(jobs, training, ncpu=ncpu):
    pass
# === 3. evaluating reconstruction and sampling
elif args.mode == 2:
  path = get_cache_path(suffix='_reconstruction')
  if not os.path.exists(path):
    progress = tqdm(total=len(jobs), desc='Evaluating Reconstruction')
    df = []
    for results in MPI(jobs, evaluate_reconstruction, ncpu=ncpu):
      progress.update(1)
      if results is None:
        continue
      df.append(results)
    progress.close()
    df = pd.DataFrame(df)
    with open(path, 'wb') as f:
      pickle.dump(df, f)  # truncated in the original; mirrors the cache pattern above
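
# Reading the cache back (a minimal sketch mirroring the dump above):
#   with open(get_cache_path(suffix='_reconstruction'), 'rb') as f:
#     df = pickle.load(f)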
os.mkdir(wav_path)
# sph2pipe requires options to precede the file arguments
cmds = [
    "sph2pipe -f rif %s %s" % (path, os.path.join(wav_path, get_name(path)))
    for path in audio_files
]

def mpi_fn(cmd):
  # `cmd` is a batch of commands (batch=12 below)
  exec_commands(cmd, print_progress=False)
  yield len(cmd)

prog = Progbar(target=len(cmds),
               print_report=True,
               print_summary=True,
               name='Converting .sph to .wav')
# run the MPI tasks
mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12)
for i in mpi:
  prog.add(i)
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
jobs = get_all_files(wav_path, filter_func=lambda x: '.wav' == x[-4:])
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
if not os.path.exists(outpath) or args.ds:
  extractors = pp.make_pipeline(steps=[
      pp.speech.AudioReader(sr=None,
                            sr_new=8000,
                            best_resample=True,
                            remove_dc=True),
      pp.base.Converter(
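# Note on `batch=12` (sketch): MPI hands each worker a list of up to 12
# commands at a time, so `mpi_fn` receives a list and yields its length,
# letting the progress bar advance by the number of converted files, e.g.:
#   next(mpi_fn(cmds[:12]))  # executes 12 conversions, yields 12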
      gamma=gamma,
      beta=beta)
  gym.plot_correlation()
  gym.plot_latents_stats()
  gym.plot_latents_tsne()
  gym.save_figures(save_path + '.pdf', verbose=True)
  return results

results_path = os.path.join(root_path, 'results')
jobs = list(
    itertools.product(np.linspace(0.1, 100, num=30),
                      np.linspace(0.1, 100, num=30)))
if not os.path.exists(results_path):
  data = []
  for results in MPI(jobs, func=test_vae_y, ncpu=cpu_count() - 1):
    data.append(results)
  df = pd.DataFrame(data)
  with open(results_path, 'wb') as f:
    pickle.dump(df, f)
else:
  with open(results_path, 'rb') as f:
    df = pickle.load(f)
df: pd.DataFrame
print(df)
for name in ['acc', 'llk', 'kl', 'au']:
  plt.figure(figsize=(9, 8), dpi=150)
  splot = sns.scatterplot(x='beta',
                          y='gamma',
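# The truncated call above presumably colors each (beta, gamma) point by the
# metric; `hue`, `data`, and the save path below are assumptions, not the
# original arguments:
#   splot = sns.scatterplot(x='beta', y='gamma', hue=name, data=df,
#                           palette='coolwarm')
#   plt.savefig(os.path.join(root_path, f'{name}.png'), dpi=150)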
def make_halfmoons(n_samples_per_factors=200,
                   image_size=64,
                   marker_size=12.,
                   seed=1,
                   n_cpu=4):
  from matplotlib import pyplot as plt
  from odin.utils import MPI
  from tqdm import tqdm
  rand = np.random.RandomState(seed=seed)
  shapes = ['o', 's', '^', 'p']
  shapes_to_idx = {v: k for k, v in enumerate(shapes)}
  colors = np.linspace(0.0, 1.0, num=10)
  n_factors = len(shapes) * len(colors)
  n_samples = n_samples_per_factors * n_factors
  shapes = np.tile(shapes, [int(n_samples / len(shapes))])
  colors = np.tile(colors, [int(n_samples / len(colors))])
  rand.shuffle(shapes)
  rand.shuffle(colors)
  # === 1. Generate data
  x, y = datasets.make_moons(n_samples=n_samples,
                             shuffle=True,
                             noise=.05,
                             random_state=rand.randint(1e8))
  x_min = np.min(x, 0, keepdims=True)
  x_max = np.max(x, 0, keepdims=True)
  x = (x - x_min) / (x_max - x_min) * 2. - 1.
  # === 2. Helper
  dpi = 200
  cmap = plt.get_cmap('coolwarm')

  def create_image(ids: List[int]):
    all_x = []
    all_y = []
    for i in ids:
      fig = plt.figure(figsize=(image_size / dpi, image_size / dpi),
                       dpi=dpi,
                       facecolor="black",
                       frameon=True)
      ax = plt.gca()
      ax.set_facecolor('black')
      ax.scatter(x[i, 0],
                 x[i, 1],
                 s=marker_size,
                 marker=shapes[i],
                 color=cmap(colors[i]),
                 antialiased=True,
                 edgecolors='none')
      ax.set_xlim([-1.2, 1.2])
      ax.set_ylim([-1.2, 1.2])
      ax.axis('off')
      ax.margins(0)
      fig.tight_layout(pad=0)
      # convert the rendered figure to a numpy array
      fig.canvas.draw()
      img = np.frombuffer(fig.canvas.tostring_rgb(), np.uint8)
      img = np.reshape(img, (image_size, image_size, 3))
      # img = np.asarray(fig.canvas.buffer_rgba())[:, :, :3]
      plt.close(fig)
      # save data
      all_x.append(np.expand_dims(img, 0))
      all_y.append([
          x[i, 0], x[i, 1], y[i], colors[i] * 2. - 1.,
          shapes_to_idx[shapes[i]]
      ])
    return np.concatenate(all_x, 0), np.vstack(all_y)

  # === 3. Generate images
  jobs = list(range(n_samples))
  progress = tqdm(total=n_samples, unit='images')
  X = []
  Y = []
  for img, lab in MPI(jobs, create_image, ncpu=n_cpu, batch=100):
    progress.update(img.shape[0])
    X.append(img)
    Y.append(lab)
  progress.clear()
  progress.close()
  return np.concatenate(X, 0), np.concatenate(Y, 0)
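
# Usage sketch: generate a small version of the dataset and check the outputs.
# With 4 shapes x 10 colors and 5 samples per factor combination this yields
# 200 images.
if __name__ == '__main__':
  X, Y = make_halfmoons(n_samples_per_factors=5, image_size=64, n_cpu=2)
  print(X.shape)  # (200, 64, 64, 3) uint8 images
  print(Y.shape)  # (200, 5): x0, x1, moon label, color in [-1, 1], shape index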
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
  r""" Predefined procedure for downloading and preprocessing a 10x dataset
  into `SingleCellOMIC`, i.e. a scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html
  """
  ### prepare the URL
  name = str(name).lower().strip()
  spec = 'filtered' if filtered_cells else 'raw'
  flatten_datasets = [(exp, version, dsname)
                      for exp, i in all_datasets.items()
                      for version, j in i.items()
                      for dsname in j]
  found = []
  for exp, version, dsname in flatten_datasets:
    if name == dsname:
      found.append((exp, version, dsname))
  if not found:
    raise ValueError(f"Cannot find data with name {name}, "
                     f"all available datasets are: {flatten_datasets}")
  if len(found) > 1:
    raise RuntimeError(f"Found multiple datasets {found} with name='{name}'")
  exp, version, name = found[0]
  dataset_name = name + '_' + spec
  url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
  ### prepare the output path
  filename = os.path.basename(url)
  # download path
  download_path = os.path.join(DOWNLOAD_DIR, exp, version)
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  # preprocessing path
  preprocessed_path = os.path.join(DATA_DIR,
                                   f'10x_{exp}_{name}_{spec}_preprocessed')
  if override and os.path.exists(preprocessed_path):
    if verbose:
      print("Overriding path: %s" % preprocessed_path)
    shutil.rmtree(preprocessed_path)
  if not os.path.exists(preprocessed_path):
    os.mkdir(preprocessed_path)
  # ******************** preprocessed ******************** #
  if len(os.listdir(preprocessed_path)) == 0:
    if verbose:
      print("Dataset10X:")
      print(" Meta :", found)
      print(" File :", filename)
      print(" URL :", url)
      print(" Download :", download_path)
      print(" Preprocess :", preprocessed_path)
    ### download the tar file
    path = download_file(url=url,
                         filename=os.path.join(download_path, filename),
                         override=False,
                         md5=_MD5.get(f"{exp}*{version}*{name}*{spec}", None))
    if not tarfile.is_tarfile(path):
      raise RuntimeError("Expecting tarfile but received: %s" % path)
    contents = {}
    with tarfile.open(path, mode="r:gz") as f:
      all_files = [(path, info.name, info.size, verbose)
                   for info in f
                   if info.isfile()]
    for name, data in MPI(jobs=all_files, func=_read_tarinfo, batch=1, ncpu=4):
      contents[name] = data
    # cell barcodes
    barcodes = contents['barcodes']
    ### cell-atac
    if exp == 'cell-atac':
      n_top_genes = 20000  # this is an ad-hoc value
      X = contents['matrix'].T.todense()
      peaks = contents['peaks']
      X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(np.float32)
      X_col_name = np.array([':'.join(i) for i in peaks])
      save_data = [(OMIC.atac.name, X)]
      save_metadata = dict(main_omic=OMIC.atac.name,
                           barcodes=barcodes,
                           chromatin_var=X_col_name)
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.atac,
                           name=name)
    ### cell-exp and cell-vdj
    elif exp in ('cell-exp', 'cell-vdj'):
      n_top_genes = 2000
      # feature (Id, Name, Type(antibody or gene-expression))
      X_col = contents['features'] if 'features' in contents else contents['genes']
      # data matrix
      X = contents['matrix'].T
      if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
        X = X.tocsr()
      X = X.astype('float32')
      assert X.shape[0] == barcodes.shape[0] and X.shape[1] == X_col.shape[0]
      # antibody and gene are provided
      prot_ids = []
      pmhc_ids = []
      gene_ids = []
      if X_col.shape[1] == 3:
        for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
          if feat_type == 'Antibody Capture':
            if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
              pmhc_ids.append(idx)
            else:
              prot_ids.append(idx)
          elif feat_type == 'Gene Expression':
            gene_ids.append(idx)
          else:
            raise ValueError(
                f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}")
      elif X_col.shape[1] == 2:
        gene_ids = slice(None, None)
      else:
        raise ValueError(f"No support for features matrix\n{X_col}")
      # Antibody ID, Antibody Name
      y = X[:, prot_ids]
      y_col = X_col[prot_ids][:, 0]  # the id
      y_col_name = X_col[prot_ids][:, 1]  # the name
      # pMHC peptide
      if len(pmhc_ids) > 0:
        z = X[:, pmhc_ids]
        z_col = X_col[pmhc_ids][:, 0]  # the id
        z_col_name = X_col[pmhc_ids][:, 1]  # the name
      # Gene ID, Gene Name
      X = X[:, gene_ids].todense()
      X_col_name = X_col[gene_ids][:, 1]  # the name
      X_col = X_col[gene_ids][:, 0]  # the id
      assert np.min(X) >= 0 and np.max(X) < 65000, \
          f"Only support uint16 data type, given data with max={np.max(X)}"
      # data and metadata
      sco = SingleCellOMIC(X,
                           cell_id=barcodes,
                           gene_id=X_col_name,
                           omic=OMIC.transcriptomic,
                           name=name)
      save_data = [(OMIC.transcriptomic.name, X), (OMIC.proteomic.name, y)]
      save_metadata = {
          'main_omic': OMIC.transcriptomic.name,
          'barcodes': barcodes,
          f"{OMIC.transcriptomic.name}_var": X_col_name,
          f"{OMIC.proteomic.name}_var": y_col_name
      }
      if len(pmhc_ids) > 0:
        save_data.append((OMIC.pmhc.name, z))
        save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
    ### others
    else:
      raise NotImplementedError(f"No support for experiment: {exp}")
    ### save data and metadata
    for name, data in save_data:
      outpath = os.path.join(preprocessed_path, name)
      n_samples, n_features = data.shape
      if n_samples == 0 or n_features == 0:
        continue
      with MmapArrayWriter(outpath,
                           shape=(0, n_features),
                           dtype=np.uint16,
                           remove_exist=True) as f:
        if verbose:
          prog = tqdm(f"Saving {outpath}", total=n_samples, unit='samples')
        for s, e in batching(batch_size=5120, n=n_samples):
          x = data[s:e]
          if hasattr(x, 'todense'):
            x = x.todense()
          f.write(x)
          if verbose:
            prog.update(e - s)
        if verbose:
          prog.clear()
          prog.close()
    # save metadata
    outpath = os.path.join(preprocessed_path, 'metadata')
    with open(outpath, 'wb') as f:
      pickle.dump(save_metadata, f)
    if verbose:
      print(f"Saved metadata to path {outpath}")
    ### filter genes, following 10x and the Cell Ranger recipe,
    # this is copied from Scanpy
    n_genes = sco.shape[1]
    sc.pp.filter_genes(sco, min_counts=1)
    # normalize with total UMI count per cell
    sc.pp.normalize_total(sco, key_added='n_counts_all')
    filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                  flavor='cell_ranger',
                                                  n_top_genes=n_top_genes,
                                                  log=False)
    gene_subset = filter_result.gene_subset
    indices = sco.get_var_indices()
    markers = (MARKER_GENES
               if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
    for name in markers:
      idx = indices.get(name, None)
      if idx is not None:
        gene_subset[idx] = True
    sco._inplace_subset_var(gene_subset)  # filter genes
    if verbose:
      print(f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
    with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
      pickle.dump(sco.var_names.values, f)
  # ******************** load and return the dataset ******************** #
  omics = [
      name for name in os.listdir(preprocessed_path)
      if name not in ('metadata', 'top_genes') and '_' not in name
  ]
  with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
    metadata = pickle.load(f)
  with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
    top_genes = pickle.load(f)
  data = {
      name: MmapArray(os.path.join(preprocessed_path, name)).astype(np.float32)
      for name in omics
  }
  main_omic = metadata['main_omic']
  X = data[main_omic]
  var_names = metadata[f'{main_omic}_var']
  if filtered_genes:
    var_ids = {j: i for i, j in enumerate(var_names)}
    ids = [var_ids[i] for i in top_genes]
    X = X[:, ids]
    var_names = var_names[ids]
  sco = SingleCellOMIC(
      X,
      cell_id=metadata['barcodes'],
      gene_id=var_names,
      omic=main_omic,
      name=f"{dataset_name}{'' if filtered_genes else 'all'}")
  for o in omics:
    if o != main_omic:
      sco.add_omic(omic=o,
                   X=data[o],
                   var_names=np.asarray(metadata[f'{o}_var']))
  return sco
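
# Usage sketch: the name must match an entry in the `all_datasets` registry;
# 'neuron_10k_v3' is an assumed example, not guaranteed to be registered.
if __name__ == '__main__':
  sco = read_dataset10x('neuron_10k_v3', filtered_genes=True, verbose=True)
  print(sco)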
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
  r""" Integrates highly multiplexed protein quantification, transcriptome
  profiling, and chromatin accessibility analysis. Using this approach,
  we establish a normal epigenetic baseline for healthy blood development,
  which we then use to deconvolve aberrant molecular features within blood
  from mixed-phenotype acute leukemia (MPAL) patients.

  scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
  peripheral blood, and MPAL donors

  References:
    Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
    https://github.com/GreenleafLab/MPAL-Single-Cell-2019
  """
  ### prepare the path
  download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
  # guard against removing a non-existent path
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Override preprocessed data at {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### download
  files = {}
  for name, (url, md5) in _URL.items():
    path = download_file(url=url,
                         filename=os.path.join(download_dir,
                                               os.path.basename(url)),
                         override=False,
                         md5=md5)
    files[name] = path
  ### read the files
  if omic == 'atac':
    del files['rna']
    del files['adt']
  elif omic == 'rna':
    del files['atac']
  else:
    raise NotImplementedError(f"No support for omic type: {omic}")
  all_data = {}
  for name, data in MPI(jobs=list(files.items()),
                        func=partial(_read_data,
                                     verbose=True,
                                     preprocessed_path=preprocessed_path),
                        batch=1,
                        ncpu=4):
    all_data[name] = data.load()
  ### load scRNA and ADT
  if omic == 'rna':
    rna = all_data['rna']
    adt = all_data['adt']
    # cells present in both the RNA and ADT assays
    cell_id = list(set(rna.celldata['Barcode']) & set(adt.celldata['Barcode']))
    #
    barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
    ids = [barcode2ids[i] for i in cell_id]
    X_rna = rna.X[ids].astype(np.float32)
    classification = rna.celldata['ProjectClassification'][ids].values
    #
    barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
    X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
    #
    if filtered_genes:
      top_genes_path = os.path.join(preprocessed_path, 'top_genes')
      if os.path.exists(top_genes_path):
        with open(top_genes_path, 'rb') as f:
          top_genes = set(pickle.load(f))
        ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
        sco = SingleCellOMIC(X_rna[:, ids],
                             cell_id=cell_id,
                             gene_id=rna.genenames[ids],
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
      else:
        sco = SingleCellOMIC(X_rna,
                             cell_id=cell_id,
                             gene_id=rna.genenames,
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        # make sure all marker genes are included
        gene_subset = result.gene_subset
        gene_indices = sco.get_var_indices()
        for gene in MARKER_GENES:
          idx = gene_indices.get(gene, None)
          if idx is not None:
            gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)
        with open(top_genes_path, 'wb') as f:
          pickle.dump(sco.var_names.values, f)
    else:
      sco = SingleCellOMIC(X_rna,
                           cell_id=cell_id,
                           gene_id=rna.genenames,
                           omic=OMIC.transcriptomic,
                           name='mpalRNAall')
    # loading dataset
    if ignore_na:
      ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
      sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
    else:
      sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
    y, labels = _celltypes(classification)
    sco.add_omic(OMIC.celltype, y, labels)
    exon = {i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values}
    sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                     dtype=np.float32)
  ### load ATAC
  else:
    atac = all_data['atac']
    sco = SingleCellOMIC(atac.X.astype(np.float32),
                         cell_id=atac.celldata['Barcode'],
                         gene_id=atac.genenames,
                         omic=OMIC.atac,
                         name='mpalATAC')
    y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
    sco.add_omic(OMIC.celltype, y, labels)
    sco.obs['clusters'] = atac.celldata['Clusters'].values
    sco.var['score'] = atac.genedata['score'].values
  return sco
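
# Usage sketch: load the MPAL scRNA-seq data with matched ADT measurements;
# the proteomic and celltype omics are attached via `add_omic` above.
if __name__ == '__main__':
  sco = read_leukemia_MixedPhenotypes(filtered_genes=True, omic='rna')
  print(sco)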