Ejemplo n.º 1
0
def read_PBMC_crossdataset(name,
                           filtered_genes=True,
                           override=False,
                           verbose=True) -> SingleCellOMIC:
    r"""Load one dataset restricted to the genes shared by all `_DATASETS`.

    On the first call (or when the cache folder is empty / its MD5 differs
    from `_MD5`), every dataset in `_DATASETS` is loaded, the intersection of
    their gene names is computed, and a set of highly variable genes is
    selected on the concatenated expression matrix. The result is pickled to
    `<DATA_DIR>/PBMC_crossdataset_preprocessed/gene_indices` and reused by
    later calls.

    Shapes recorded by the original author (before -> after gene matching):

      - 'pbmc8k' (6290, 17870)->(6290, 11299) genes
      - 'pbmcecc' (2941, 15634)->(2941, 11299) genes
      - 'pbmcciteseq' (7985, 17006)->(7985, 11299) genes
      - 'cbmcciteseq' (8617, 20400)->(8617, 11299) genes
      - 'call' (37552, 33694)->(37552, 11299) genes
      - 'mpal' (52396, 20287)->(52396, 11299) genes
      - 'pbmc5k' (5247, 33538)->(5247, 11299) genes
      - 'vdj1' (55206, 33538)->(55206, 11299) genes
      - 'vdj4' (36619, 33538)->(36619, 11299) genes

    Total transcriptomic data: 212853(cells) 11299(genes)

    Highly variable genes: 2000

    Arguments:
      name : a key of `_DATASETS` (e.g. 'pbmc8k', 'pbmcecc', 'call', 'mpal',
        'pbmc5k', 'vdj1', 'vdj4').
      filtered_genes : if True, additionally keep only the selected highly
        variable genes (plus any gene listed in `MARKER_GENES`).
      override : if True, delete the cached preprocessing folder first.
      verbose : if True, print progress information.

    Returns:
      The requested dataset as a `SingleCellOMIC`, with 'x' appended to its
      name.

    Raises:
      AssertionError : if `name` is not a key of `_DATASETS`.
    """
    assert name in _DATASETS, \
      (f"Invalid dataset name='{name}', "
       f"available datasets are: {list(_DATASETS.keys())}")
    preprocessed_path = os.path.join(DATA_DIR,
                                     'PBMC_crossdataset_preprocessed')
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessing ******************** #
    if not os.listdir(preprocessed_path) or \
      md5_folder(preprocessed_path) != _MD5:
        datasets = {}
        for ds_name, reader in _DATASETS.items():
            ds = reader(verbose=verbose)
            datasets[ds_name] = ds
            if verbose:
                print(f"Read dataset='{ds_name}' shape={ds.shape}")
        gene_names = sorted(
            reduce(lambda x, y: x & y,
                   (set(ds.var_names.values) for ds in datasets.values())))
        # sorting + fixed-seed shuffle: the gene order is random but
        # identical on every machine
        rand = np.random.RandomState(seed=1)
        rand.shuffle(gene_names)
        # some debugging
        if verbose:
            omics = reduce(lambda x, y: x | y,
                           (ds.omics for ds in datasets.values()))
            n_samples = {k: v.shape[0] for k, v in datasets.items()}
            print(f"Select {len(gene_names)} common genes "
                  f"among {', '.join(datasets.keys())}.")
            print(f"All available OMICs are: {omics}")
            print(f"Amount of samples: {n_samples}")
        # read data from all available OMICs
        indices = {}
        mRNA = []
        # NOTE: the loop variable must not be called `name`, otherwise the
        # requested dataset name is overwritten and the wrong dataset is
        # loaded at the end of this function.
        for ds_name, ds in datasets.items():
            X, ids = _match_genes(ds, gene_names)
            indices[ds_name] = ids
            mRNA.append(X)
            if verbose:
                print(f"Matching genes for dataset '{ds_name}' "
                      f"{ds.X.shape}->{X.shape} genes")
        mRNA = np.concatenate(mRNA, axis=0)
        if verbose:
            print("Total transcriptomic data:",
                  f"{mRNA.shape[0]}(cells) {mRNA.shape[1]}(genes)")
        # filter genes seurat
        sco = SingleCellOMIC(mRNA, gene_id=gene_names)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        gene_subset = result.gene_subset
        # make sure all marker genes are included; `gene_subset` is aligned
        # with the genes remaining in `sco` after `filter_genes`, which may
        # be fewer than `gene_names`, so index against `sco.var_names`
        for i, gene in enumerate(sco.var_names):
            if gene in MARKER_GENES:
                gene_subset[i] = True
        sco._inplace_subset_var(gene_subset)
        top_genes = set(sco.var_names.values)
        if verbose:
            print(f"Filtered highly variable genes: {len(top_genes)}")
        del sco
        # save the indices and top_genes
        with open(os.path.join(preprocessed_path, 'gene_indices'), 'wb') as f:
            pickle.dump([gene_names, indices, top_genes], f)
        print(f"Preprocessed MD5: {md5_folder(preprocessed_path)}")
    # ******************** load the dataset ******************** #
    # the pickle is a cache written by this very function
    with open(os.path.join(preprocessed_path, 'gene_indices'), 'rb') as f:
        gene_names, indices, top_genes = pickle.load(f)
    sco = _DATASETS[name](verbose=verbose)
    sco._inplace_subset_var(indices[name])
    if filtered_genes:
        top_indices = [gene in top_genes for gene in sco.var_names]
        sco._inplace_subset_var(top_indices)
    sco._name += 'x'
    return sco
Ejemplo n.º 2
0
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r"""Download, preprocess and load a predefined 10x dataset as a
    `SingleCellOMIC` i.e. scanpy.AnnData object.

    The dataset is looked up by name in `all_datasets`; its tar archive is
    downloaded, the count matrices are written to memory-mapped arrays
    (stored as uint16) under
    `<DATA_DIR>/10x_<exp>_<name>_<spec>_preprocessed`, and a list of highly
    variable genes (plus known markers) is pickled next to them. Later calls
    reuse that folder as long as it is not empty.

    Arguments:
      name : dataset name, matched case-insensitively after stripping.
      filtered_cells : use the 'filtered' (True) or 'raw' (False) matrix.
      filtered_genes : if True, keep only the saved top genes in the main OMIC.
      override : if True, delete the preprocessed folder first.
      verbose : if True, print progress information.

    Returns:
      A `SingleCellOMIC` holding the main OMIC, with every other saved OMIC
      attached via `add_omic`.

    Raises:
      ValueError : unknown dataset name, unknown feature type, or an
        unsupported features matrix.
      RuntimeError : several datasets share the name, or the downloaded file
        is not a tar archive.
      NotImplementedError : the experiment type is not supported.
      AssertionError : shape mismatch, or counts outside the uint16 range.

    Reference:
      https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

    """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = [entry for entry in flatten_datasets if entry[2] == name]
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.listdir(preprocessed_path):
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        # NOTE: loop variables below deliberately avoid the identifier
        # `name`, which still holds the dataset name used further down.
        for member, member_data in MPI(jobs=all_files,
                                       func=_read_tarinfo,
                                       batch=1,
                                       ncpu=4):
            contents[member] = member_data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            # the loader below reads f"{main_omic}_var", so the key must be
            # derived from the OMIC name exactly like the other branch does
            save_metadata = {
                'main_omic': OMIC.atac.name,
                'barcodes': barcodes,
                f"{OMIC.atac.name}_var": X_col_name,
            }
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                # only (Id, Name) given: every column is a gene
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for omic_name, omic_data in save_data:
            outpath = os.path.join(preprocessed_path, omic_name)
            n_samples, n_features = omic_data.shape
            # empty OMICs (e.g. no antibody columns) are not written
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    # `desc=` is required: tqdm's first positional argument
                    # is the iterable, not the description
                    prog = tqdm(desc=f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = omic_data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        # force-keep every known marker that is present in this dataset
        for marker in markers:
            idx = indices.get(marker, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    # every file that is neither a pickle nor contains '_' is an OMIC array
    omics = [
        fname for fname in os.listdir(preprocessed_path)
        if fname not in ('metadata', 'top_genes') and '_' not in fname
    ]
    # both pickles are caches written by this very function
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        fname: MmapArray(os.path.join(preprocessed_path,
                                      fname)).astype(np.float32)
        for fname in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_key = f'{main_omic}_var'
    if var_key not in metadata and 'chromatin_var' in metadata:
        # caches written before the key was derived from the OMIC name
        var_key = 'chromatin_var'
    var_names = metadata[var_key]
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
Ejemplo n.º 3
0
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    r"""Load the CBMC CITE-seq dataset (gene counts plus protein counts).

    If the preprocessed folder `_CITEseq_CBMC_PREPROCESSED` does not yet
    contain 'X', the encrypted archive is downloaded, extracted, stripped of
    all-zero gene columns and saved; a set of highly variable genes is then
    pickled as 'top_genes' in the same folder.

    Arguments:
      filtered_genes : if True, keep only the genes stored in 'top_genes'.
      override : if True and the preprocessed folder exists, recreate it.
      verbose : if True, print progress information.

    Returns:
      A `SingleCellOMIC` with the 'transcriptomic' OMIC as main data and the
      'proteomic' OMIC attached.
    """
    raw_dir = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(raw_dir):
        os.mkdir(raw_dir)
    cache_dir = _CITEseq_CBMC_PREPROCESSED
    if os.path.exists(cache_dir):
        if override:
            if verbose:
                print("Overriding path: %s" % cache_dir)
            shutil.rmtree(cache_dir)
            os.mkdir(cache_dir)
    else:
        os.mkdir(cache_dir)
    top_genes_file = os.path.join(cache_dir, 'top_genes')
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(cache_dir, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        zip_path = os.path.join(raw_dir, os.path.basename(url))
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        archive = {}
        for member, payload in crypto.unzip_aes(zip_path,
                                                password=_PASSWORD,
                                                verbose=False):
            if '.npz' in member:
                parsed = sp.sparse.load_npz(BytesIO(payload)).todense()
            elif '.csv' in member:
                parsed = np.loadtxt(StringIO(str(payload, 'utf-8')),
                                    dtype=str,
                                    delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % member)
            # entries are keyed by the member name without its extension
            archive[os.path.splitext(member)[0]] = parsed
        # ====== post-processing ====== #
        genes = np.array(archive['X'].astype('float32'))
        cell_ids = archive['X_row']
        genes, gene_ids = remove_allzeros_columns(matrix=genes,
                                                  colname=archive['X_col'])
        assert len(cell_ids) == genes.shape[0] and \
          len(gene_ids) == genes.shape[1]
        proteins = archive['y'].astype('float32')
        prot_cells = archive['y_row']
        prot_ids = archive['y_col']
        assert len(prot_cells) == proteins.shape[0] and \
          len(prot_ids) == proteins.shape[1]
        assert np.all(cell_ids == prot_cells), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {cache_dir} ...")
        save_to_dataset(cache_dir,
                        genes,
                        gene_ids,
                        proteins,
                        prot_ids,
                        rowname=cell_ids,
                        print_log=verbose)
        # select highly variable genes on a throw-away object
        tmp = SingleCellOMIC(genes, cell_id=cell_ids, gene_id=gene_ids)
        sc.pp.filter_cells(tmp, min_genes=200)
        sc.pp.filter_genes(tmp, min_cells=3)
        sc.pp.normalize_total(tmp, target_sum=1e4)
        dispersion = sc.pp.filter_genes_dispersion(tmp.X,
                                                   min_mean=0.0125,
                                                   max_mean=3,
                                                   min_disp=0.5,
                                                   log=False,
                                                   n_top_genes=2000)
        tmp._inplace_subset_var(dispersion.gene_subset)
        with open(top_genes_file, 'wb') as f:
            pickle.dump(set(tmp.var_names.values), f)
        del tmp
    # ====== read preprocessed data ====== #
    ds = Dataset(cache_dir, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    )
    sco = sco.add_omic('proteomic', ds['y'], ds['y_col'])
    if not filtered_genes:
        return sco
    with open(top_genes_file, 'rb') as f:
        top_genes = pickle.load(f)
    keep = [gene in top_genes for gene in sco.var_names]
    sco._inplace_subset_var(keep)
    return sco
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
  r""" Integrates highly multiplexed protein quantification, transcriptome
  profiling, and chromatin accessibility analysis. Using this approach,
  we establish a normal epigenetic baseline for healthy blood development,
  which we then use to deconvolve aberrant molecular features within blood
  from mixed-phenotype acute leukemia (MPAL) patients.

  scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
  peripheral blood, and MPAL donors

  Arguments:
    filtered_genes : (only for `omic='rna'`) if True, keep only the highly
      variable genes (plus `MARKER_GENES`); the selection is computed once
      and cached as 'top_genes' in the preprocessed folder.
    omic : 'rna' (scRNA with ADT attached as the proteomic OMIC) or 'atac'.
    ignore_na : (only for `omic='rna'`) if True, drop ADT columns whose
      column-wise maximum is NaN.
    override : if True, delete the preprocessed folder first.
    verbose : if True, print progress information.

  Returns:
    A `SingleCellOMIC` containing the raw counts of the cells shared by the
    RNA and ADT data (in RNA order), or the ATAC data.

  Raises:
    NotImplementedError : if `omic` is neither 'rna' nor 'atac'.

  References:
    Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory  programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
    https://github.com/GreenleafLab/MPAL-Single-Cell-2019
  """
  # validate first, so an invalid request fails before anything is
  # downloaded or deleted
  if omic == 'atac':
    unused = ('rna', 'adt')
  elif omic == 'rna':
    unused = ('atac',)
  else:
    raise NotImplementedError(f"No support for omic type: {omic}")
  ### prepare the path
  download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
  # guard with `exists`: rmtree raises if there is no cache to remove
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Override preprocessed data at {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### download
  files = {}
  for name, (url, md5) in _URL.items():
    path = download_file(url=url,
                         filename=os.path.join(download_dir,
                                               os.path.basename(url)),
                         override=False,
                         md5=md5)
    files[name] = path
  ### read the files
  for name in unused:
    del files[name]
  all_data = {}
  for name, data in MPI(jobs=list(files.items()),
                        func=partial(_read_data,
                                     verbose=verbose,
                                     preprocessed_path=preprocessed_path),
                        batch=1,
                        ncpu=4):
    all_data[name] = data.load()
  ### load scRNA and ADT
  if omic == 'rna':
    rna = all_data['rna']
    adt = all_data['adt']
    # shared barcodes, in RNA order: a plain set intersection would give a
    # different cell order on every run (string hashing is randomized)
    adt_barcodes = set(adt.celldata['Barcode'])
    cell_id = [
        barcode for barcode in dict.fromkeys(rna.celldata['Barcode'])
        if barcode in adt_barcodes
    ]
    #
    barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
    rna_ids = [barcode2ids[i] for i in cell_id]
    X_rna = rna.X[rna_ids].astype(np.float32)
    classification = rna.celldata['ProjectClassification'][rna_ids].values
    #
    barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
    X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
    #
    if filtered_genes:
      top_genes_path = os.path.join(preprocessed_path, 'top_genes')
      if not os.path.exists(top_genes_path):
        # Select the genes on a throw-away object built from a copy:
        # `filter_cells` and `normalize_total` alter the object they are
        # given, while the returned dataset must keep the raw counts and
        # all cells so it stays aligned with `X_adt` and `classification`.
        tmp = SingleCellOMIC(X_rna.copy(),
                             cell_id=cell_id,
                             gene_id=rna.genenames,
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
        sc.pp.filter_cells(tmp, min_genes=200)
        sc.pp.filter_genes(tmp, min_cells=3)
        sc.pp.normalize_total(tmp, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(tmp.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        # make sure all marker genes are included
        gene_subset = result.gene_subset
        gene_indices = tmp.get_var_indices()
        for gene in MARKER_GENES:
          idx = gene_indices.get(gene, None)
          if idx is not None:
            gene_subset[idx] = True
        tmp._inplace_subset_var(gene_subset)
        with open(top_genes_path, 'wb') as f:
          pickle.dump(tmp.var_names.values, f)
        del tmp
      # the pickle is a cache written by this very function
      with open(top_genes_path, 'rb') as f:
        top_genes = set(pickle.load(f))
      gene_ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
      sco = SingleCellOMIC(X_rna[:, gene_ids],
                           cell_id=cell_id,
                           gene_id=rna.genenames[gene_ids],
                           omic=OMIC.transcriptomic,
                           name='mpalRNA')
    else:
      sco = SingleCellOMIC(X_rna,
                           cell_id=cell_id,
                           gene_id=rna.genenames,
                           omic=OMIC.transcriptomic,
                           name='mpalRNAall')
    # loading dataset
    if ignore_na:
      # keep only the ADT columns whose maximum is not NaN
      not_na = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
      sco.add_omic(OMIC.proteomic, X_adt[:, not_na], adt.genenames[not_na])
    else:
      sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
    y, labels = _celltypes(classification)
    sco.add_omic(OMIC.celltype, y, labels)
    exon = {i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values}
    sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                     dtype=np.float32)
  ### load ATAC
  else:
    atac = all_data['atac']
    sco = SingleCellOMIC(atac.X.astype(np.float32),
                         cell_id=atac.celldata['Barcode'],
                         gene_id=atac.genenames,
                         omic=OMIC.atac,
                         name='mpalATAC')
    y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
    sco.add_omic(OMIC.celltype, y, labels)
    sco.obs['clusters'] = atac.celldata['Clusters'].values
    sco.var['score'] = atac.genedata['score'].values
  return sco