Example #1
0
def read_scale_dataset(dsname="leukemia",
                       filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Datasets provided by (Xiong et al. 2019), four datasets are supported:

    - 'breast_tumor'
    - 'forebrain'
    - 'leukemia'
    - 'insilico'

  Reference:
    Xiong, L. et al. SCALE method for single-cell ATAC-seq analysis via latent
      feature extraction. Nat Commun 10, 4576 (2019).

  """
    datasets = {'breast_tumor', 'forebrain', 'leukemia', 'insilico'}
    assert dsname in datasets, \
      f"Cannot find dataset with name {dsname}, available datasets are: {datasets}"
    download_path = os.path.join(DOWNLOAD_DIR, f"scale_dataset")
    preprocessed_path = os.path.join(DATA_DIR, f"scale_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### Download data
    url = str(base64.decodebytes(_URL), 'utf-8')
    path = os.path.join(download_path, os.path.basename(url))
    download_file(url, path, override=False, md5=_MD5)
    ### extract the data
    if len(os.listdir(preprocessed_path)) == 0:
        with zipfile.ZipFile(path, "r") as f:
            for info in f.filelist:
                name = os.path.basename(info.filename)
                if len(name) == 0:
                    continue
                with open(os.path.join(preprocessed_path, name), 'wb') as fout:
                    fout.write(f.read(info))
    ### load the data
    cell = np.load(os.path.join(preprocessed_path, f"{dsname}_cell"))
    labels = np.load(os.path.join(preprocessed_path, f"{dsname}_labels"))
    peak = np.load(os.path.join(preprocessed_path, f"{dsname}_peak"))
    x = sparse.load_npz(os.path.join(preprocessed_path, f"{dsname}_x"))
    sco = SingleCellOMIC(X=x,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name=dsname)
    ids = {key: i for i, key in enumerate(sorted(set(labels)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([ids[i] for i in labels]), len(ids)),
                 var_names=list(ids.keys()))
    return sco
Example #2
0
def read_mouse_ATLAS(filtered_genes=True,
                     override=False,
                     verbose=True) -> SingleCellOMIC:
    r""" sci-ATAC-seq, to profile genome-wide chromatin accessibility in ∼100,000
  single cells from 13 adult mouse tissues:

    - The regulatory landscape of adult mouse tissues mapped by single-cell
      chromatin assay
    - Characterization of 85 distinct chromatin patterns across 13 different
      tissues
    - Annotation of key regulators and regulatory sequences in diverse
      mammalian cell types
    - Dataset allows resolution of cell types underlying common human traits
      and diseases

  References:
    Cusanovich, D. A. et al. A Single-Cell Atlas of In Vivo Mammalian Chromatin
      Accessibility. Cell 174, 1309-1324.e18 (2018).
    Link https://atlas.gs.washington.edu/mouse-atac/
  """
    download_path = os.path.join(DOWNLOAD_DIR, f"mouse_atac")
    preprocessed_path = os.path.join(DATA_DIR, f"mouse_atac_preprocessed")
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### Download data
    files = {}
    for name, (url, md5) in _URLs.items():
        filepath = os.path.join(download_path, os.path.basename(url))
        files[name] = download_file(url, filepath, override=False, md5=md5)
    ### save counts matrix
    path = os.path.join(preprocessed_path, 'counts')
    if not os.path.exists(path):
        print("Reading counts matrix ...")
        counts = mmread(files['counts'])
        counts: sparse.coo_matrix
        counts = counts.astype(np.unit8)
        with open(path, 'wb') as f:
            sparse.save_npz(f, counts, compressed=False)
    ### save metadata
    path = os.path.join(preprocessed_path, 'metadata')
    if not os.path.exists(path):
        with open(files['cellids'], 'r') as f:
            cell = np.array([i for i in f.read().split('\n') if len(i) > 0])
        with open(files['peakids'], 'r') as f:
            peak = np.array([i for i in f.read().split('\n') if len(i) > 0])
        metadata = pd.read_csv(files['metadata'], sep="\t")
        assert metadata.shape[0] == len(cell)
        tissue = metadata['tissue'].to_numpy()
        celltype = metadata['cell_label'].to_numpy()
        with open(path, 'wb') as f:
            np.savez(f, cell=cell, peak=peak, tissue=tissue, celltype=celltype)
    ### Read all data and create SCO
    counts = sparse.csr_matrix(
        sparse.load_npz(os.path.join(preprocessed_path, 'counts')))
    metadata = np.load(os.path.join(preprocessed_path, 'metadata'),
                       allow_pickle=True)
    cell = metadata['cell']
    peak = metadata['peak']
    tissue = metadata['tissue']
    celltype = metadata['celltype']
    # need to transpose here, counts matrix is [peaks, cells]
    sco = SingleCellOMIC(X=counts.T,
                         cell_id=cell,
                         gene_id=peak,
                         omic=OMIC.atac,
                         name="mouse_atlas")
    # add celltype
    labels = {name: i for i, name in enumerate(sorted(set(celltype)))}
    sco.add_omic(OMIC.celltype,
                 X=one_hot(np.array([labels[i] for i in celltype]),
                           len(labels)),
                 var_names=list(labels.keys()))
    # add tissue type
    labels = {name: i for i, name in enumerate(sorted(set(tissue)))}
    sco.add_omic(OMIC.tissue,
                 X=one_hot(np.array([labels[i] for i in tissue]), len(labels)),
                 var_names=list(labels.keys()))
    return sco
Example #3
0
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== full ====== #
        if subset == 'full':
            raise NotImplementedError
        # ====== ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # ====== extract the data ====== #
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_var']
                X_col = data['X_var_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
def read_human_embryos(filtered_genes=True,
                       override=False,
                       verbose=True) -> SingleCellOMIC:
    r""" Transcriptional map of human embryo development, including the sequenced
    transcriptomes of 1529 individual cells from 88 human preimplantation
    embryos. These data show that cells undergo an intermediate state of
    co-expression of lineage-specific genes, followed by a concurrent
    establishment of the trophectoderm, epiblast, and primitive endoderm
    lineages, which coincide with blastocyst formation.

  References:
    Petropoulos S, Edsgärd D, Reinius B, et al. Single-Cell RNA-Seq Reveals
      Lineage and X Chromosome Dynamics in Human Preimplantation Embryos.
      Cell. 2016 Sep

  Note:
    Gene expression levels (RefSeq annotations) were estimated in terms of
      reads per kilobase exon model and per million mapped reads (RPKM)
      using rpkmforgenes
    Genes were filtered, keeping 15633/26178 genes that
      * were expressed in at least 5 out of 1919 sequenced cells (RPKM >= 10).
        and
      * for which cells with expression came from at least two
        different embryos.
    Cells were quality-filtered based on 4 criteria, keeping 1529/1919 cells.
      * First, Spearman correlations, using the RPKM expression levels of
        all genes, for every possible pair of cells were calculated and a
        histogram of the maximum correlation obtained for each cell,
        corresponding to the most similar cell, was used to identify 305
        outlier cells with a maximum pair-wise correlations below 0.63.
      * Second, a histogram of the number of expressed genes per cell was
        used to identify 330 outlier cells with less than 5000 expressed
        genes.
      * Third, a histogram of the total transcriptional expression output
        from the sex chromosomes (RPKM sum) was used to identify 33 cells
        with indeterminable sex, or a called sex that was inconsistent with
        other cells of that embryo
      * Fourth, 13 outlier cells were identified using PCA and t-SNE
        dimensionality reduction.

  """
    download_dir = os.path.join(DOWNLOAD_DIR, 'human_embryos')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    preprocessed_path = os.path.join(DATA_DIR, 'human_embryos_preprocessed')
    if override:
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at {preprocessed_path}")
    if not os.path.exists(preprocessed_path):
        os.makedirs(preprocessed_path)
    ### download data
    files = []
    for url, md5 in zip(_URLs, _MD5s):
        path = download_file(url=url,
                             filename=os.path.join(download_dir,
                                                   os.path.basename(url)),
                             override=False,
                             md5=md5)
        files.append(path)
    ### preprocessing
    if len(os.listdir(preprocessed_path)) == 0:
        data_map = {}
        for f in files:
            zipname = os.path.basename(f)
            with zipfile.ZipFile(f, mode="r") as f:
                for dat_file in f.filelist:
                    filename = dat_file.filename
                    dat = str(f.read(filename), 'utf-8')
                    x = []
                    for line in dat.split('\n'):
                        if len(line) == 0:
                            continue
                        line = line.split('\t')
                        x.append(line)
                    x = np.asarray(x).T
                    row_name = x[1:, 0]
                    col_name = x[0, 1:]
                    x = x[1:, 1:].astype(np.float32)
                    x = sparse.coo_matrix(x)
                    data_map[filename] = (x, row_name, col_name)
                    print(f"Read: {zipname} - {filename}")
                    print(f" * Matrix: {x.shape}")
                    print(f" * Row   : {row_name.shape}-{row_name[:3]}")
                    print(f" * Col   : {col_name.shape}-{col_name[:3]}")
        # save loaded data to disk
        for name, (x, row, col) in data_map.items():
            with open(os.path.join(preprocessed_path, f"{name}:x"), "wb") as f:
                sparse.save_npz(f, x)
            with open(os.path.join(preprocessed_path, f"{name}:row"),
                      "wb") as f:
                np.save(f, row)
            with open(os.path.join(preprocessed_path, f"{name}:col"),
                      "wb") as f:
                np.save(f, col)
        del data_map
    ### read the data
    # counts.txt (1529, 26178)
    # ercc.counts.txt (1529, 92)
    # rpkm.txt (1529, 26178)
    # ercc.rpkm.txt (1529, 92)
    data = {}
    genes_path = os.path.join(preprocessed_path, "filtered_genes")
    for path in os.listdir(preprocessed_path):
        if path == os.path.basename(genes_path):
            continue
        name, ftype = os.path.basename(path).split(':')
        with open(os.path.join(preprocessed_path, path), 'rb') as f:
            if ftype == 'x':
                x = sparse.load_npz(f).tocsr()
            else:
                x = np.load(f)
        data[f"{name}_{ftype}"] = x
    rpkm = data['rpkm.txt_x']
    counts = data['counts.txt_x']
    genes = data['counts.txt_col']
    cells = data['counts.txt_row']
    ### filter genes
    if not os.path.exists(genes_path):
        # filter genes by rpkm
        ids = np.asarray(np.sum(rpkm, axis=0) >= 10).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter genes by min 5 cells
        ids = np.asarray(np.sum(counts > 0, axis=0) >= 5).ravel()
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
        # filter highly variable genes
        sco = SingleCellOMIC(X=counts, cell_id=cells, gene_id=genes)
        sco.normalize(omic=OMIC.transcriptomic, log1p=True)
        sco.filter_highly_variable_genes(n_top_genes=2000)
        filtered = sco.var_names.to_numpy()
        with open(genes_path, 'wb') as f:
            pickle.dump([genes, filtered], f)
        del sco
    else:
        with open(genes_path, 'rb') as f:
            ids, filtered = pickle.load(f)
        ids = set(ids)
        ids = np.asarray([i in ids for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    # last filtering
    if filtered_genes:
        filtered = set(filtered)
        ids = np.asarray([i in filtered for i in genes])
        rpkm = rpkm[:, ids]
        counts = counts[:, ids]
        genes = genes[ids]
    ### create the SingleCellOMIC
    sco = SingleCellOMIC(X=counts,
                         cell_id=cells,
                         gene_id=genes,
                         omic=OMIC.transcriptomic,
                         name="HumanEmbryos")
    sco.add_omic(omic=OMIC.rpkm, X=rpkm, var_names=genes)
    labels = ['.'.join(i.split('.')[:-2]) for i in sco.obs_names]
    labels = ['E7' if i == 'E7.4' else i for i in labels]
    labels_name = {j: i for i, j in enumerate(sorted(set(labels)))}
    labels = np.array([labels_name[i] for i in labels])
    sco.add_omic(omic=OMIC.celltype,
                 X=one_hot(labels, len(labels_name)),
                 var_names=list(labels_name.keys()))
    sco.add_omic(omic=OMIC.ercc,
                 X=data['ercc.counts.txt_x'],
                 var_names=data['ercc.counts.txt_col'])
    return sco
Example #5
0
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        # ====== pbmc 8k ====== #
        if subset == 'full':
            ly = read_PBMC8k('ly',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # merge all genes from my and ly subset
            all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
            all_genes = sorted([X_col.index(i) for i in all_genes])
            # same for protein
            all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
            all_proteins = sorted([y_col.index(i) for i in all_proteins])
            #
            X = X[:, all_genes]
            y = y[:, all_proteins]
            X_col = np.array(X_col)[all_genes]
            y_col = np.array(y_col)[all_proteins]
            cell_types = np.array(
                ['ly' if i in ly['X_row'] else 'my' for i in X_row])
        # ====== pbmc ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_filt']
                X_col = data['X_filt_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array([subset] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
Example #6
0
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
        os.mkdir(_CITEseq_CBMC_PREPROCESSED)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
        # ====== post-processing ====== #
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
Example #7
0
def read_dataset10x(name,
                    filtered_cells=True,
                    filtered_genes=True,
                    override=False,
                    verbose=True) -> SingleCellOMIC:
    r""" Predefined procedure for download and preprocessing 10x dataset into
  `SingleCellOMIC` i.e. scanpy.AnnData object

  Reference:
    https://artyomovlab.wustl.edu/publications/supp_materials/4Oleg/2019_sc_ATAC_seq_DT1634_Denis/sc-atacseq-explorer-Denis-121119.html

  """
    ### prepare the URL
    name = str(name).lower().strip()
    spec = 'filtered' if filtered_cells else 'raw'
    flatten_datasets = [(exp, version, dsname)
                        for exp, i in all_datasets.items()
                        for version, j in i.items() for dsname in j]
    found = []
    for exp, version, dsname in flatten_datasets:
        if name == dsname:
            found.append((exp, version, dsname))
    if not found:
        raise ValueError(f"Cannot find data with name {name}, "
                         f"all available datasets are: {flatten_datasets}")
    if len(found) > 1:
        raise RuntimeError(
            f"Found multiple datasets {found} with name='{name}'")
    exp, version, name = found[0]
    dataset_name = name + '_' + spec
    url = group_to_url_skeleton[exp][version].format(version, name, name, spec)
    ### prepare the output path
    filename = os.path.basename(url)
    # download path
    download_path = os.path.join(DOWNLOAD_DIR, exp, version)
    if not os.path.exists(download_path):
        os.makedirs(download_path)
    # preprocessing path
    preprocessed_path = os.path.join(DATA_DIR,
                                     f'10x_{exp}_{name}_{spec}_preprocessed')
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % preprocessed_path)
        shutil.rmtree(preprocessed_path)
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    # ******************** preprocessed ******************** #
    if len(os.listdir(preprocessed_path)) == 0:
        if verbose:
            print("Dataset10X:")
            print(" Meta       :", found)
            print(" File       :", filename)
            print(" URL        :", url)
            print(" Download   :", download_path)
            print(" Preprocess :", preprocessed_path)
        ### download the tar file
        path = download_file(url=url,
                             filename=os.path.join(download_path, filename),
                             override=False,
                             md5=_MD5.get(f"{exp}*{version}*{name}*{spec}",
                                          None))
        if not tarfile.is_tarfile(path):
            raise RuntimeError("Expecting tarfile but received: %s" % path)
        contents = {}
        with tarfile.open(path, mode="r:gz") as f:
            all_files = [(path, info.name, info.size, verbose) for info in f
                         if info.isfile()]
        for name, data in MPI(jobs=all_files,
                              func=_read_tarinfo,
                              batch=1,
                              ncpu=4):
            contents[name] = data
        # cell barcodes
        barcodes = contents['barcodes']
        ### cell-atac
        if exp == 'cell-atac':
            n_top_genes = 20000  # this is ad-hoc value
            X = contents['matrix'].T.todense()
            peaks = contents['peaks']
            X_peaks = peaks[:, 2].astype(np.float32) - peaks[:, 1].astype(
                np.float32)
            X_col_name = np.array([':'.join(i) for i in peaks])
            save_data = [(OMIC.atac.name, X)]
            save_metadata = dict(main_omic=OMIC.atac.name,
                                 barcodes=barcodes,
                                 chromatin_var=X_col_name)
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.atac,
                                 name=name)
        ### cell-exp and cell-vdj
        elif exp in ('cell-exp', 'cell-vdj'):
            n_top_genes = 2000
            # feature (Id, Name, Type(antibody or gene-expression))
            X_col = contents[
                'features'] if 'features' in contents else contents['genes']
            # data matrix
            X = contents['matrix'].T
            if not isinstance(X, csr_matrix) and hasattr(X, 'tocsr'):
                X = X.tocsr()
            X = X.astype('float32')
            assert X.shape[0] == barcodes.shape[0] and X.shape[
                1] == X_col.shape[0]
            # antibody and gene are provided
            prot_ids = []
            pmhc_ids = []
            gene_ids = []
            if X_col.shape[1] == 3:
                for idx, (feat_id, feat_name, feat_type) in enumerate(X_col):
                    if feat_type == 'Antibody Capture':
                        if exp == "cell-vdj" and "_TotalSeqC" not in feat_name:
                            pmhc_ids.append(idx)
                        else:
                            prot_ids.append(idx)
                    elif feat_type == 'Gene Expression':
                        gene_ids.append(idx)
                    else:
                        raise ValueError(
                            f"Unknown feature type:{feat_id}-{feat_name}-{feat_type}"
                        )
            elif X_col.shape[1] == 2:
                gene_ids = slice(None, None)
            else:
                raise ValueError(f"No support for features matrix\n{X_col}")
            # Antibody ID, Antibody Name
            y = X[:, prot_ids]
            y_col = X_col[prot_ids][:, 0]  # the id
            y_col_name = X_col[prot_ids][:, 1]  # the name
            # pMHC peptide
            if len(pmhc_ids) > 0:
                z = X[:, pmhc_ids]
                z_col = X_col[pmhc_ids][:, 0]  # the id
                z_col_name = X_col[pmhc_ids][:, 1]  # the name
            # Gene ID, Gene Name
            X = X[:, gene_ids].todense()
            X_col_name = X_col[gene_ids][:, 1]  # the name
            X_col = X_col[gene_ids][:, 0]  # the id
            assert np.min(X) >= 0 and np.max(X) < 65000, \
              f"Only support uint16 data type, given data with max={np.max(X)}"
            # data and metadata
            sco = SingleCellOMIC(X,
                                 cell_id=barcodes,
                                 gene_id=X_col_name,
                                 omic=OMIC.transcriptomic,
                                 name=name)
            save_data = [(OMIC.transcriptomic.name, X),
                         (OMIC.proteomic.name, y)]
            save_metadata = {
                'main_omic': OMIC.transcriptomic.name,
                'barcodes': barcodes,
                f"{OMIC.transcriptomic.name}_var": X_col_name,
                f"{OMIC.proteomic.name}_var": y_col_name
            }
            if len(pmhc_ids) > 0:
                save_data.append((OMIC.pmhc.name, z))
                save_metadata[f"{OMIC.pmhc.name}_var"] = z_col_name
        ### others
        else:
            raise NotImplementedError(f"No support for experiment: {exp}")
        ### save data and metadata
        for name, data in save_data:
            outpath = os.path.join(preprocessed_path, name)
            n_samples, n_features = data.shape
            if n_samples == 0 or n_features == 0:
                continue
            with MmapArrayWriter(outpath,
                                 shape=(0, n_features),
                                 dtype=np.uint16,
                                 remove_exist=True) as f:
                if verbose:
                    prog = tqdm(f"Saving {outpath}",
                                total=n_samples,
                                unit='samples')
                for s, e in batching(batch_size=5120, n=n_samples):
                    x = data[s:e]
                    if hasattr(x, 'todense'):
                        x = x.todense()
                    f.write(x)
                    if verbose:
                        prog.update(e - s)
                if verbose:
                    prog.clear()
                    prog.close()
        # save metadata
        outpath = os.path.join(preprocessed_path, 'metadata')
        with open(outpath, 'wb') as f:
            pickle.dump(save_metadata, f)
        if verbose:
            print(f"Saved metadata to path {outpath}")
        ### filter genes, follow 10x and use Cell Ranger recipe,
        # this is copied from Scanpy
        n_genes = sco.shape[1]
        sc.pp.filter_genes(sco, min_counts=1)
        # normalize with total UMI count per cell
        sc.pp.normalize_total(sco, key_added='n_counts_all')
        filter_result = sc.pp.filter_genes_dispersion(sco.X,
                                                      flavor='cell_ranger',
                                                      n_top_genes=n_top_genes,
                                                      log=False)
        gene_subset = filter_result.gene_subset
        indices = sco.get_var_indices()
        markers = (MARKER_GENES
                   if sco.current_omic == OMIC.transcriptomic else MARKER_ATAC)
        for name in markers:
            idx = indices.get(name, None)
            if idx is not None:
                gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)  # filter genes
        if verbose:
            print(
                f"Filtering genes {n_genes} to {sco.shape[1]} variated genes.")
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(sco.var_names.values, f)
    # ******************** load and return the dataset ******************** #
    omics = [
        name for name in os.listdir(preprocessed_path)
        if name not in ('metadata', 'top_genes') and '_' not in name
    ]
    with open(os.path.join(preprocessed_path, 'metadata'), 'rb') as f:
        metadata = pickle.load(f)
    with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
        top_genes = pickle.load(f)
    data = {
        name: MmapArray(os.path.join(preprocessed_path,
                                     name)).astype(np.float32)
        for name in omics
    }
    main_omic = metadata['main_omic']
    X = data[main_omic]
    var_names = metadata[f'{main_omic}_var']
    if filtered_genes:
        var_ids = {j: i for i, j in enumerate(var_names)}
        ids = [var_ids[i] for i in top_genes]
        X = X[:, ids]
        var_names = var_names[ids]
    sco = SingleCellOMIC(
        X,
        cell_id=metadata['barcodes'],
        gene_id=var_names,
        omic=main_omic,
        name=f"{dataset_name}{'' if filtered_genes else 'all'}")
    for o in omics:
        if o != main_omic:
            sco.add_omic(omic=o,
                         X=data[o],
                         var_names=np.asarray(metadata[f'{o}_var']))
    return sco
Example #8
0
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  if override:
    shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, X_row, X_col = [], None, None
    y, y_row, y_col = [], None, None
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        for name in f.namelist():
          data = str(f.read(name), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            if 'Protein' in name:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
    "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = [True if "HUMAN_" in i else False for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)

    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
    "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
def read_leukemia_MixedPhenotypes(filtered_genes=True,
                                  omic='rna',
                                  ignore_na=True,
                                  override=False,
                                  verbose=True) -> SingleCellOMIC:
  r""" Integrates highly multiplexed protein quantification, transcriptome
  profiling, and chromatin accessibility analysis. Using this approach,
  we establish a normal epigenetic baseline for healthy blood development,
  which we then use to deconvolve aberrant molecular features within blood
  from mixed-phenotype acute leukemia (MPAL) patients.

  scATAC-seq and CITE-seq performed on healthy bone marrow, CD34+ bone marrow,
  peripheral blood, and MPAL donors

  References:
    Granja JM et al., 2019. "Single-cell multiomic analysis identifies
      regulatory  programs in mixed-phenotype acute leukemia".
      Nature Biotechnology.
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139369
    https://github.com/GreenleafLab/MPAL-Single-Cell-2019
  """
  ### prepare the path
  download_dir = os.path.join(DOWNLOAD_DIR, 'mpal')
  if not os.path.exists(download_dir):
    os.makedirs(download_dir)
  preprocessed_path = os.path.join(DATA_DIR, 'mpal_preprocessed')
  if override:
    shutil.rmtree(preprocessed_path)
    if verbose:
      print(f"Override preprocessed data at {preprocessed_path}")
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  ### download
  files = {}
  for name, (url, md5) in _URL.items():
    path = download_file(url=url,
                         filename=os.path.join(download_dir,
                                               os.path.basename(url)),
                         override=False,
                         md5=md5)
    files[name] = path
  ### read the files
  if omic == 'atac':
    del files['rna']
    del files['adt']
  elif omic == 'rna':
    del files['atac']
  else:
    raise NotImplementedError(f"No support for omic type: {omic}")
  all_data = {}
  for name, data in MPI(jobs=list(files.items()),
                        func=partial(_read_data,
                                     verbose=True,
                                     preprocessed_path=preprocessed_path),
                        batch=1,
                        ncpu=4):
    all_data[name] = data.load()
  ### load scRNA and ADT
  if omic == 'rna':
    rna = all_data['rna']
    adt = all_data['adt']
    cell_id = list(set(rna.celldata['Barcode']) & set(adt.celldata['Barcode']))
    #
    barcode2ids = {j: i for i, j in enumerate(rna.celldata['Barcode'])}
    ids = [barcode2ids[i] for i in cell_id]
    X_rna = rna.X[ids].astype(np.float32)
    classification = rna.celldata['ProjectClassification'][ids].values
    #
    barcode2ids = {j: i for i, j in enumerate(adt.celldata['Barcode'])}
    X_adt = adt.X[[barcode2ids[i] for i in cell_id]].astype(np.float32)
    #
    if filtered_genes:
      top_genes_path = os.path.join(preprocessed_path, 'top_genes')
      if os.path.exists(top_genes_path):
        with open(top_genes_path, 'rb') as f:
          top_genes = set(pickle.load(f))
        ids = [i for i, j in enumerate(rna.genenames) if j in top_genes]
        sco = SingleCellOMIC(X_rna[:, ids],
                             cell_id=cell_id,
                             gene_id=rna.genenames[ids],
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
      else:
        sco = SingleCellOMIC(X_rna,
                             cell_id=cell_id,
                             gene_id=rna.genenames,
                             omic=OMIC.transcriptomic,
                             name='mpalRNA')
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        # make sure all marker genes are included
        gene_subset = result.gene_subset
        gene_indices = sco.get_var_indices()
        for gene in MARKER_GENES:
          idx = gene_indices.get(gene, None)
          if idx is not None:
            gene_subset[idx] = True
        sco._inplace_subset_var(gene_subset)
        with open(top_genes_path, 'wb') as f:
          pickle.dump(sco.var_names.values, f)
    else:
      sco = SingleCellOMIC(X_rna,
                           cell_id=cell_id,
                           gene_id=rna.genenames,
                           omic=OMIC.transcriptomic,
                           name='mpalRNAall')
    # loading dataset
    if ignore_na:
      ids = np.logical_not(np.isnan(np.max(X_adt, axis=0)))
      sco.add_omic(OMIC.proteomic, X_adt[:, ids], adt.genenames[ids])
    else:
      sco.add_omic(OMIC.proteomic, X_adt, adt.genenames)
    y, labels = _celltypes(classification)
    sco.add_omic(OMIC.celltype, y, labels)
    exon = {i: j for i, j in rna.genedata[['gene_name', 'exonLength']].values}
    sco.var['exonlength'] = np.array([exon[i] for i in sco.var_names],
                                     dtype=np.float32)
  ### load ATAC
  else:
    atac = all_data['atac']
    sco = SingleCellOMIC(atac.X.astype(np.float32),
                         cell_id=atac.celldata['Barcode'],
                         gene_id=atac.genenames,
                         omic=OMIC.atac,
                         name='mpalATAC')
    y, labels = _celltypes(atac.celldata['ProjectClassification'].values)
    sco.add_omic(OMIC.celltype, y, labels)
    sco.obs['clusters'] = atac.celldata['Clusters'].values
    sco.var['score'] = atac.genedata['score'].values
  return sco