Example #1
def main(mode: Literal['train', 'evaluate'], fn_filter=lambda job: True):
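    # one job per combination of config values: `configs` maps each key to a
    # list of options; take the Cartesian product of (key, value) pairs and
    # turn every combination into a job dict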
    jobs = product(*[[(k, i) for i in v] for k, v in configs.items()])
    jobs = [dict(j) for j in jobs]
    jobs = [j for j in jobs if fn_filter(j)]
    print(f'Start {mode} {len(jobs)} jobs ...')
    ######## training mode
    if mode == 'train':
        for _ in MPI(jobs,
                     partial(run_task, evaluation=False),
                     ncpu=N_CPU,
                     batch=1):
            pass
    ######## evaluation mode
    elif mode == 'evaluate':
        for j in jobs:
            if j['ds'] == 'celeba' and j['coef'].vmin == 0.1:
                # print(j)
                run_task(j, evaluation=True)
    ######## others
    else:
        raise NotImplementedError(f'No support for mode="{mode}"')


def get_corr(dsname):
    # truncated helper: only its tail survives in this snippet; it computes
    # the per-dataset correlation statistics consumed by the MPI loop below
    ...
    corr_count = sorted(corr_count.items(),
                        key=lambda x: x[-1],
                        reverse=True)
    return ds.X.shape[0], var_names, corr, corr_count
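

# All of these examples share one pattern: MPI(jobs, func, ncpu, batch) fans
# the job list out to `ncpu` worker processes and yields results as they
# complete. A rough stdlib sketch of that contract (not odin's actual
# implementation):
from itertools import islice
from multiprocessing import Pool


def mpi_sketch(jobs, func, ncpu=2, batch=1):
    def chunks(seq, size):
        it = iter(seq)
        while True:
            block = list(islice(it, size))
            if not block:
                return
            # batch=1 hands workers single jobs; batch>1 hands them lists
            yield block[0] if size == 1 else block

    # (odin's MPI also flattens results when func is a generator; omitted here)
    with Pool(ncpu) as pool:
        yield from pool.imap_unordered(func, chunks(jobs, batch))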


# ===========================================================================
# Extract the minimum correlation
# ===========================================================================
dsname = list(get_dataset_meta().keys())
correlation = defaultdict(float)
count = defaultdict(float)
occurrence = defaultdict(int)
total_cell = 0

for results in MPI(jobs=dsname, func=get_corr, ncpu=4, batch=1):
    if results is None:
        continue
    n, var_names, corr, corr_count = results
    total_cell += n
    ## occurrence
    for v in var_names:
        occurrence[v] += 1
    ## correlation
    for i, j in corr:
        if i[0] == i[1]:
            continue
        correlation[i] += j
    ## count
    for i, j in corr_count:
        if i[0] == i[1]:
            continue
        count[i] += j  # inferred completion, mirroring the correlation loop above


Example #2
if __name__ == '__main__':
    config = ArgumentParser()
    config.add_argument('mode', type=int)
    config.add_argument('--overwrite', action='store_true')
    config.add_argument('-ncpu', type=int, default=1)
    config = config.parse_args()
    jobs = [Arguments(beta=b, gamma=1, zdim=ZDIM, finetune=True,
                      overwrite=config.overwrite) for b in BETA] + \
           [Arguments(beta=b, gamma=1, zdim=ZDIM, finetune=False,
                      overwrite=config.overwrite) for b in BETA]
    mode = config.mode
    # === 1. train
    if mode == 0:
        for r in MPI(jobs=jobs, func=train, ncpu=config.ncpu):
            pass
    # === 2. eval
    elif mode == 1:
        cache_path = get_cache_path()
        if os.path.exists(cache_path) and config.overwrite:
            os.remove(cache_path)
        if not os.path.exists(cache_path):
            df = []
            for r in MPI(jobs=jobs, func=evaluate, ncpu=config.ncpu):
                if r is not None:
                    df.append(r)
            df = sorted(df, key=lambda x: x['beta'])
            df = pd.DataFrame(df)
            with open(cache_path, 'wb') as f:
                pickle.dump(df, f)
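
# Usage sketch for this script (the script name is assumed):
#   python script.py 0 --overwrite -ncpu 8   # mode 0: train every job
#   python script.py 1                       # mode 1: evaluate and cache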
Example #4
parser.add_argument('-ncpu', type=int, default=1)
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--no-anno', action='store_true')
# === 1. prepare
args = parser.parse_args()
ncpu = args.ncpu
if ncpu <= 0:
    ncpu = cpu_count() - 1
jobs = [
    Job(beta=b, gamma=g, zdim=z)
    for b, g, z in itertools.product(BETA, GAMMA, ZDIM)
]
OVERWRITE = args.overwrite
# === 2. training
if args.mode == 0:
    for _ in MPI(jobs, training, ncpu=ncpu):
        pass
# === 3. evaluating reconstruction and sampling
elif args.mode == 2:
    path = get_cache_path(suffix='_reconstruction')
    if not os.path.exists(path):
        progress = tqdm(total=len(jobs), desc='Evaluating Reconstruction')
        df = []
        for results in MPI(jobs, evaluate_reconstruction, ncpu=ncpu):
            progress.update(1)
            if results is None:
                continue
            df.append(results)
        progress.close()
        df = pd.DataFrame(df)
        with open(path, 'wb') as f:
            pickle.dump(df, f)  # inferred completion; the snippet cut off here

Example #5
    # (fragment starts mid-function; `wav_path` and `audio_files` are defined
    #  earlier in the original script)
    os.mkdir(wav_path)
    cmds = [
        "sph2pipe %s %s -f rif" %
        (path, os.path.join(wav_path, get_name(path))) for path in audio_files
    ]

    def mpi_fn(cmd):
        exec_commands(cmd, print_progress=False)
        yield len(cmd)

    prog = Progbar(target=len(cmds),
                   print_report=True,
                   print_summary=True,
                   name='Converting .sph to .wav')
    # run the MPI tasks
    mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12)
    for i in mpi:
        prog.add(i)
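
    # `mpi_fn` above is a generator: with batch=12, MPI hands it a list of up
    # to 12 commands per call, and every yielded value (the batch size) is
    # forwarded to this consumer loop to advance the progress bar. A sketch of
    # the same counting worker, assuming per-command execution:
    #
    #     def mpi_fn(cmd_batch):
    #         for c in cmd_batch:
    #             exec_commands(c, print_progress=False)
    #         yield len(cmd_batch)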
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
jobs = get_all_files(wav_path, filter_func=lambda x: '.wav' == x[-4:])
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
if not os.path.exists(outpath) or args.ds:
    extractors = pp.make_pipeline(steps=[
        pp.speech.AudioReader(sr=None,
                              sr_new=8000,
                              best_resample=True,
                              remove_dc=True),
        pp.base.Converter(
Example #6
                       gamma=gamma,
                       beta=beta)
        gym.plot_correlation()
        gym.plot_latents_stats()
        gym.plot_latents_tsne()
    gym.save_figures(save_path + '.pdf', verbose=True)
    return results


results_path = os.path.join(root_path, 'results')
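# a 30 x 30 grid of (beta, gamma) values in [0.1, 100]: 900 jobs in total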
jobs = list(
    itertools.product(np.linspace(0.1, 100, num=30),
                      np.linspace(0.1, 100, num=30)))
if not os.path.exists(results_path):
    data = []
    for results in MPI(jobs, func=test_vae_y, ncpu=cpu_count() - 1):
        data.append(results)
    df = pd.DataFrame(data)
    with open(results_path, 'wb') as f:
        pickle.dump(df, f)
else:
    with open(results_path, 'rb') as f:
        df = pickle.load(f)

df: pd.DataFrame
print(df)

for name in ['acc', 'llk', 'kl', 'au']:
    plt.figure(figsize=(9, 8), dpi=150)
    splot = sns.scatterplot(x='beta',
                            y='gamma',
Example #7
def make_halfmoons(n_samples_per_factors=200,
                   image_size=64,
                   marker_size=12.,
                   seed=1,
                   n_cpu=4):
    from matplotlib import pyplot as plt
    from odin.utils import MPI
    from tqdm import tqdm

    rand = np.random.RandomState(seed=seed)
    shapes = ['o', 's', '^', 'p']
    shapes_to_idx = {v: k for k, v in enumerate(shapes)}
    colors = np.linspace(0.0, 1.0, num=10)
    n_factors = len(shapes) * len(colors)
    n_samples = n_samples_per_factors * n_factors

    shapes = np.tile(shapes, [int(n_samples / len(shapes))])
    colors = np.tile(colors, [int(n_samples / len(colors))])
    rand.shuffle(shapes)
    rand.shuffle(colors)
    # === 1. Generate data
    x, y = datasets.make_moons(n_samples=n_samples,
                               shuffle=True,
                               noise=.05,
                               random_state=rand.randint(10**8))
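    # min-max scale each coordinate into [-1, 1] so every point fits inside
    # the fixed [-1.2, 1.2] axes used when rendering below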
    x_min = np.min(x, 0, keepdims=True)
    x_max = np.max(x, 0, keepdims=True)
    x = (x - x_min) / (x_max - x_min) * 2. - 1.

    # === 2. Helper
    dpi = 200
    cmap = plt.get_cmap('coolwarm')

    def create_image(ids: List[int]):
        all_x = []
        all_y = []
        for i in ids:
            fig = plt.figure(figsize=(image_size / dpi, image_size / dpi),
                             dpi=dpi,
                             facecolor="black",
                             frameon=True)
            ax = plt.gca()
            ax.set_facecolor('black')
            ax.scatter(x[i, 0],
                       x[i, 1],
                       s=marker_size,
                       marker=shapes[i],
                       color=cmap(colors[i]),
                       antialiased=True,
                       edgecolors='none')
            ax.set_xlim([-1.2, 1.2])
            ax.set_ylim([-1.2, 1.2])
            ax.axis('off')
            ax.margins(0)
            fig.tight_layout(pad=0)
            # convert to array
            fig.canvas.draw()
            img = np.frombuffer(fig.canvas.tostring_rgb(), np.uint8)
            img = np.reshape(img, (image_size, image_size, 3))
            # img = np.asarray(fig.canvas.buffer_rgba())[:, :, :3]
            plt.close(fig)
            # save data
            all_x.append(np.expand_dims(img, 0))
            all_y.append([
                x[i, 0], x[i, 1], y[i], colors[i] * 2. - 1.,
                shapes_to_idx[shapes[i]]
            ])
        return np.concatenate(all_x, 0), np.vstack(all_y)

    # === 3. Generate images
    jobs = list(range(n_samples))
    progress = tqdm(total=n_samples, unit='images')
    X = []
    Y = []
    for img, lab in MPI(jobs, create_image, ncpu=n_cpu, batch=100):
        progress.update(img.shape[0])
        X.append(img)
        Y.append(lab)
    progress.clear()
    progress.close()
    return np.concatenate(X, 0), np.concatenate(Y, 0)
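
# Usage sketch: with the defaults (200 samples per factor, 4 shapes x 10
# colors = 40 factor combinations) this renders 8000 images, e.g.
#   X, Y = make_halfmoons(n_samples_per_factors=2, n_cpu=2)
#   X.shape == (80, 64, 64, 3); Y.shape == (80, 5)
# where Y's columns are x0, x1, moon label, color in [-1, 1], shape index.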
Example #8
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
  `SingleCellOMIC` i.e. scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

  """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = []
    for exp, version, dsname in flatten_datasets:
        if name == dsname:
            found.append((exp, version, dsname))
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        # use a distinct loop variable: `name` already holds the dataset name
        # resolved above and is passed to SingleCellOMIC below
        for key, data in MPI(jobs=all_files,
                             func=_read_tarinfo,
                             batch=1,
                             ncpu=4):
            contents[key] = data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is an ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for name, data in save_data:
            outpath = os.path.join(preprocessed_path, name)
            n_samples, n_features = data.shape
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    prog = tqdm(f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, following the 10x Cell Ranger recipe
        # (this step is copied from Scanpy)
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        for name in markers:
            idx = indices.get(name, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(f"Filtered genes from {n_genes} down to "
                  f"{sco.shape[1]} highly-variable genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    omics = [
        name for name in os.listdir(preprocessed_path)
        if name not in ('metadata', 'top_genes') and '_' not in name
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        name: MmapArray(os.path.join(preprocessed_path,
                                     name)).astype(np.float32)
        for name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
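

# A plausible minimal implementation of the `batching` helper used above
# (assumed semantics: yield (start, end) slice bounds covering n samples):
def batching(batch_size, n):
    for start in range(0, n, batch_size):
        yield start, min(start + batch_size, n)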


def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
  r""" Integrates highly multiplexed protein quantification, transcriptome
  profiling, and chromatin accessibility analysis. Using this approach,
  we establish a normal epigenetic baseline for healthy blood development,
  which we then use to deconvolve aberrant molecular features within blood
  from mixed-phenotype acute leukemia (MPAL) patients.

  scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
  peripheral blood, and MPAL donors

  References:
    Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory  programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
    https://github.com/GreenleafLab/MPAL-Single-Cell-2019
  """
  ### prepare the path
  download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Overriding preprocessed data at {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### download
  files = {}
  for name, (url, md5) in _URL.items():
    path = download_file(url=url,
                         filename=os.path.join(download_dir,
                                               os.path.basename(url)),
                         override=False,
                         md5=md5)
    files[name] = path
  ### read the files
  if omic == 'atac':
    del files['rna']
    del files['adt']
  elif omic == 'rna':
    del files['atac']
  else:
    raise NotImplementedError(f"No support for omic type: {omic}")
  all_data = {}
  for name, data in MPI(jobs=list(files.items()),
                        func=partial(_read_data,
                                     verbose=True,
                                     preprocessed_path=preprocessed_path),
                        batch=1,
                        ncpu=4):
    all_data[name] = data.load()
  ### load scRNA and ADT
  if omic == 'rna':
    rna = all_data['rna']
    adt = all_data['adt']
    cell_id = list(set(rna.celldata['Barcode']) & set(adt.celldata['Barcode']))
    #
    barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
    ids = [barcode2ids[i] for i in cell_id]
    X_rna = rna.X[ids].astype(np.float32)
    classification = rna.celldata['ProjectClassification'][ids].values
    #
    barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
    X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
    #
    if filtered_genes:
      top_genes_path = os.path.join(preprocessed_path, 'top_genes')
      if os.path.exists(top_genes_path):
        with open(top_genes_path, 'rb') as f:
          top_genes = set(pickle.load(f))
        ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
        sco = SingleCellOMIC(X_rna[:, ids],
                             cell_id=cell_id,
                             gene_id=rna.genenames[ids],
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
      else:
        sco = SingleCellOMIC(X_rna,
                             cell_id=cell_id,
                             gene_id=rna.genenames,
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        # make sure all marker genes are included
        gene_subset = result.gene_subset
        gene_indices = sco.get_var_indices()
        for gene in MARKER_GENES:
          idx = gene_indices.get(gene, None)
          if idx is not None:
            gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)
        with open(top_genes_path, 'wb') as f:
          pickle.dump(sco.var_names.values, f)
    else:
      sco = SingleCellOMIC(X_rna,
                           cell_id=cell_id,
                           gene_id=rna.genenames,
                           omic=OMIC.transcriptomic,
                           name='mpalRNAall')
    # loading dataset
    if ignore_na:
      ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
      sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
    else:
      sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
    y, labels = _celltypes(classification)
    sco.add_omic(OMIC.celltype, y, labels)
    exon = {i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values}
    sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                     dtype=np.float32)
  ### load ATAC
  else:
    atac = all_data['atac']
    sco = SingleCellOMIC(atac.X.astype(np.float32),
                         cell_id=atac.celldata['Barcode'],
                         gene_id=atac.genenames,
                         omic=OMIC.atac,
                         name='mpalATAC')
    y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
    sco.add_omic(OMIC.celltype, y, labels)
    sco.obs['clusters'] = atac.celldata['Clusters'].values
    sco.var['score'] = atac.genedata['score'].values
  return sco
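

# Usage sketch (the dataset name below is illustrative, not verified):
#   sco = read_dataset10x('neuron_10k_v3', filtered_genes=True)
#   sco = read_leukemia_MixedPhenotypes(filtered_genes=True, omic='rna')
# Both return a SingleCellOMIC holding the main omic as X, with extra omics
# (proteomic, celltype, ...) attached via add_omic.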