Esempio n. 1
0
def read_PBMCeec(subset='ly',
                 override=False,
                 verbose=True,
                 filtered_genes=True) -> SingleCellOMIC:
    """Load the PBMC "ecc" data set as a `SingleCellOMIC`.

    When no preprocessed copy exists yet, the archive is fetched with
    `download_file`, its arrays are extracted, all-zero gene columns are
    removed and the result is stored under `DATA_DIR`. The stored copy is
    then read back and wrapped in a `SingleCellOMIC`.

    Parameters
    ----------
    subset : str
        One of 'ly' (lymphoid), 'my' (myeloid) or 'full'; compared
        case-insensitively after stripping whitespace. Only 'ly' is
        currently supported.
    override : bool
        If True, delete an existing preprocessed folder and rebuild it.
    verbose : bool
        If True, print a message on override and forward logging to the
        helper functions through `print_log`.
    filtered_genes : bool
        If True, use the `X_var` / `X_var_col` arrays of the archive,
        otherwise `X_full` / `X_full_col`.

    Returns
    -------
    SingleCellOMIC
        Transcriptomic data with two extra OMICs attached: 'proteomic'
        (from `y` / `y_col`) and a one-hot 'progenitor' matrix with columns
        ('myeloid', 'lymphoid').

    Raises
    ------
    ValueError
        If `subset` is not one of the three known names.
    NotImplementedError
        If `subset` is 'my' or 'full'.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    if subset in ('my', 'full'):
        raise NotImplementedError("No support for subset: %s - PBMCecc" %
                                  subset)
    download_path = os.path.join(DOWNLOAD_DIR, "PBMCecc_%s_original" % subset)
    # makedirs also creates missing parent folders and tolerates an
    # already-existing directory
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMCecc_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
        if verbose:
            print(f"Override preprocessed data at path {preprocessed_path}")
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # Only 'ly' can reach this point ('my' and 'full' were rejected
        # above); the myeloid URL is kept for when that subset is enabled.
        url = str(
            base64.decodebytes(_URL_LYMPHOID if subset ==
                               'ly' else _URL_MYELOID), 'utf-8')
        base_name = os.path.basename(url)
        path = os.path.join(download_path, base_name)
        download_file(filename=path, url=url, override=False)
        # ====== extract the data ====== #
        data = np.load(path)
        X_row = data['X_row']
        y = data['y']
        y_col = data['y_col']
        if filtered_genes:
            X = data['X_var']
            X_col = data['X_var_col']
        else:
            X = data['X_full']
            X_col = data['X_full_col']
        # every cell of this archive is labelled lymphoid
        cell_types = np.array(['ly'] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"ecc{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    # one-hot encoding: (1, 0) for 'my', (0, 1) for anything else
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
Esempio n. 2
0
def read_PBMC8k(subset='full',
                override=False,
                verbose=True,
                filtered_genes=True,
                return_arrays=False) -> SingleCellOMIC:
    """Load the PBMC 8k data set (lymphoid, myeloid or both).

    When the preprocessed folder is empty, the archive is fetched with
    `download_file`, all-zero gene columns are removed and the result is
    stored under `DATA_DIR`. For 'full', the 'ly' and 'my' subsets are
    loaded first (recursively) and the full matrix is restricted to the
    genes and proteins that appear in either subset.

    Parameters
    ----------
    subset : str
        One of 'ly' (lymphoid), 'my' (myeloid) or 'full'; compared
        case-insensitively after stripping whitespace.
    override : bool
        If True, delete an existing preprocessed folder and rebuild it.
        Not forwarded to the recursive 'ly'/'my' loads.
    verbose : bool
        Forwarded to the helper functions through `print_log` and to the
        recursive 'ly'/'my' loads.
    filtered_genes : bool
        For 'ly'/'my': if True use the `X_filt` / `X_filt_col` arrays of
        the archive, otherwise `X_full` / `X_full_col`.
    return_arrays : bool
        If True, return the read-only `Dataset` instead of building a
        `SingleCellOMIC` (note that the return annotation does not cover
        this case).

    Returns
    -------
    SingleCellOMIC or Dataset
        Transcriptomic data with 'proteomic' and one-hot 'progenitor'
        (columns 'myeloid', 'lymphoid') OMICs attached, or the raw
        `Dataset` when `return_arrays` is True.

    Raises
    ------
    ValueError
        If `subset` is not one of the three known names.
    """
    subset = str(subset).strip().lower()
    if subset not in ('ly', 'my', 'full'):
        raise ValueError(
            "subset can only be 'ly'-lymphoid and 'my'-myeloid or 'full'")
    # prepare the path
    download_path = os.path.join(DOWNLOAD_DIR, f"PBMC8k_{subset}_original")
    # makedirs also creates missing parent folders and tolerates an
    # already-existing directory
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = os.path.join(
        DATA_DIR,
        f"PBMC8k_{subset}_{'filtered' if filtered_genes else 'all'}_preprocessed"
    )
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed ******************** #
    if not os.listdir(preprocessed_path):
        # ====== pbmc 8k ====== #
        if subset == 'full':
            # forward `verbose` so a quiet caller stays quiet in sub-loads
            ly = read_PBMC8k('ly',
                             verbose=verbose,
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            my = read_PBMC8k('my',
                             verbose=verbose,
                             filtered_genes=filtered_genes,
                             return_arrays=True)
            url = str(base64.decodebytes(_URL_PBMC8k), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # load data
            data = np.load(path)
            X = data['X']
            X_row = data['X_row']
            X_col = data['X_col'].tolist()
            y = data['y']
            y_col = data['y_col'].tolist()
            # name -> first column index (same answer as list.index, but a
            # single pass instead of one linear scan per name)
            gene_index = {}
            for idx, gene in enumerate(X_col):
                gene_index.setdefault(gene, idx)
            protein_index = {}
            for idx, protein in enumerate(y_col):
                protein_index.setdefault(protein, idx)
            # merge all genes from my and ly subset
            all_genes = set(ly['X_col'].tolist() + my['X_col'].tolist())
            all_genes = sorted(gene_index[i] for i in all_genes)
            # same for protein
            all_proteins = set(ly['y_col'].tolist() + my['y_col'].tolist())
            all_proteins = sorted(protein_index[i] for i in all_proteins)
            #
            X = X[:, all_genes]
            y = y[:, all_proteins]
            X_col = np.array(X_col)[all_genes]
            y_col = np.array(y_col)[all_proteins]
            # a cell is lymphoid if its id is in the 'ly' subset, else myeloid
            ly_rows = ly['X_row']
            cell_types = np.array(
                ['ly' if i in ly_rows else 'my' for i in X_row])
        # ====== pbmc ly and my ====== #
        else:
            url = str(
                base64.decodebytes(_URL_LYMPHOID if subset ==
                                   'ly' else _URL_MYELOID), 'utf-8')
            base_name = os.path.basename(url)
            path = os.path.join(download_path, base_name)
            download_file(filename=path, url=url, override=False)
            # extract the data
            data = np.load(path)
            X_row = data['X_row']
            y = data['y']
            y_col = data['y_col']
            if filtered_genes:
                X = data['X_filt']
                X_col = data['X_filt_col']
            else:
                X = data['X_full']
                X_col = data['X_full_col']
            cell_types = np.array([subset] * X.shape[0])
        # ====== save everything ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert X.shape == (len(X_row), len(X_col))
        assert len(X) == len(y)
        assert y.shape[1] == len(y_col)
        with open(os.path.join(preprocessed_path, 'cell_types'), 'wb') as f:
            pickle.dump(cell_types, f)
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    if return_arrays:
        return ds
    sco = SingleCellOMIC(X=ds['X'],
                         cell_id=ds['X_row'],
                         gene_id=ds['X_col'],
                         omic='transcriptomic',
                         name=f"8k{subset}{'' if filtered_genes else 'all'}")
    sco.add_omic('proteomic', X=ds['y'], var_names=ds['y_col'])
    progenitor = ds['cell_types']
    # one-hot encoding: (1, 0) for 'my', (0, 1) for anything else
    sco.add_omic(
        'progenitor',
        X=np.array([(1, 0) if i == 'my' else (0, 1) for i in progenitor],
                   dtype=np.float32),
        var_names=np.array(['myeloid', 'lymphoid']),
    )
    return sco
Esempio n. 3
0
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    """Load the CITE-seq CBMC data set as a `SingleCellOMIC`.

    When no preprocessed copy exists, the encrypted archive is downloaded
    (with an MD5 check), its `.npz` / `.csv` members are decoded, all-zero
    gene columns are removed and the arrays are stored at
    `_CITEseq_CBMC_PREPROCESSED`. A set of highly variable gene names is
    computed once with the `sc.pp` pipeline (presumably scanpy) and pickled
    next to the data as 'top_genes'.

    Parameters
    ----------
    filtered_genes : bool
        If True, keep only the genes stored in 'top_genes'. The same
        preprocessed folder is used for both settings.
    override : bool
        If True, delete an existing preprocessed folder and rebuild it.
    verbose : bool
        Print progress messages and forward logging to the helpers.

    Returns
    -------
    SingleCellOMIC
        Transcriptomic data with a 'proteomic' OMIC attached.

    Raises
    ------
    RuntimeError
        If the archive contains a member that is neither `.npz` nor `.csv`.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    # makedirs also creates missing parent folders and tolerates an
    # already-existing directory
    os.makedirs(download_path, exist_ok=True)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if override and os.path.exists(preprocessed_path):
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8')
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
        # ====== post-processing ====== #
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        # pass `print_log` like every other loader in this file so that
        # `verbose=False` is honoured here too
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        # Temporary object used only to select the top variable genes; the
        # stored arrays above are not affected by this filtering.
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        # 'top_genes' is a pickle written by this function itself
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
Esempio n. 4
0
def read_CITEseq_PBMC(override=False,
                      verbose=True,
                      filtered_genes=False) -> SingleCellOMIC:
  """Load the CITE-seq PBMC data set as a `SingleCellOMIC`.

  When no preprocessed copy exists, the gene-count and protein-count
  archives are downloaded, decrypted, MD5-checked and parsed from CSV.
  Columns whose name lacks the "HUMAN_" prefix are dropped (reported as
  MOUSE genes), all-zero gene columns are removed and the arrays are saved
  to the preprocessed folder.

  Parameters
  ----------
  override : bool
    If True, delete an existing preprocessed folder and rebuild it. A
    missing folder is not an error.
  verbose : bool
    Print progress messages and forward logging to the helpers.
  filtered_genes : bool
    If True, use the `_URL_5000` archive and the `_5000_PBMC_PREPROCESSED`
    folder, otherwise `_URL_FULL` and `_CITEseq_PBMC_PREPROCESSED`.

  Returns
  -------
  SingleCellOMIC
    Transcriptomic data with a 'proteomic' OMIC attached.
  """
  download_path = os.path.join(
      DOWNLOAD_DIR,
      "PBMC_%s_original" % ('5000' if filtered_genes else 'CITEseq'))
  if not os.path.exists(download_path):
    os.makedirs(download_path)
  preprocessed_path = (_5000_PBMC_PREPROCESSED
                       if filtered_genes else _CITEseq_PBMC_PREPROCESSED)
  # rmtree raises FileNotFoundError on a missing folder, so only remove the
  # cache when it is actually there (e.g. override=True on a first run)
  if override and os.path.exists(preprocessed_path):
    shutil.rmtree(preprocessed_path)
  # make sure the folder exists in both the override and the normal case
  if not os.path.exists(preprocessed_path):
    os.makedirs(preprocessed_path)
  # ******************** preprocessed data NOT found ******************** #
  if not os.path.exists(os.path.join(preprocessed_path, 'X')):
    X, y = [], []
    # ====== download the data ====== #
    download_files = {}
    for url, md5 in zip(
        [_URL_5000 if filtered_genes else _URL_FULL, _URL_PROTEIN],
        [_MD5_5000 if filtered_genes else _MD5_FULL, _MD5_PROTEIN]):
      url = str(base64.decodebytes(url), 'utf-8')
      base_name = os.path.basename(url)
      path = os.path.join(download_path, base_name)
      download_file(filename=path, url=url, override=False)
      download_files[base_name] = (path, md5)
    # ====== extract the data ====== #
    n = set()  # distinct numbers of fields per CSV line, over all files
    for name, (path, md5) in sorted(download_files.items()):
      if verbose:
        print(f"Extracting {name} ...")
      binary_data = decrypt_aes(path, password=_PASSWORD)
      # the checksum is taken on the decrypted payload
      md5_ = md5_checksum(binary_data)
      assert md5_ == md5, f"MD5 checksum mismatch for file: {name}"
      with zipfile.ZipFile(file=BytesIO(binary_data), mode='r') as f:
        for member in f.namelist():
          data = str(f.read(member), 'utf8')
          for line in data.split('\n'):
            if len(line) == 0:
              continue
            line = line.strip().split(',')
            n.add(len(line))
            # the zip member name decides where the line belongs
            if 'Protein' in member:
              y.append(line)
            else:
              X.append(line)
    # ====== post-processing ====== #
    assert len(n) == 1, \
    "Number of samples inconsistent between raw count and protein count"
    if verbose:
      print("Processing gene count ...")
    # after transposing, row 0 holds the first field of every CSV line and
    # column 0 holds the first CSV line
    X = np.array(X).T
    X_row, X_col = X[1:, 0], X[0, 1:]
    X = X[1:, 1:].astype('float32')
    # ====== filter mouse genes ====== #
    human_cols = ["HUMAN_" in i for i in X_col]
    if verbose:
      print(f"Removing {np.sum(np.logical_not(human_cols))} MOUSE genes ...")
    X = X[:, human_cols]
    X_col = np.array([i.replace('HUMAN_', '') for i in X_col[human_cols]])
    X, X_col = remove_allzeros_columns(matrix=X,
                                       colname=X_col,
                                       print_log=verbose)

    # ====== protein ====== #
    if verbose:
      print("Processing protein count ...")
    y = np.array(y).T
    y_row, y_col = y[1:, 0], y[0, 1:]
    y = y[1:, 1:].astype('float32')
    assert np.all(X_row == y_row), \
    "Cell order mismatch between gene count and protein count"
    # save data
    if verbose:
      print(f"Saving data to {preprocessed_path} ...")
    save_to_dataset(preprocessed_path,
                    X,
                    X_col,
                    y,
                    y_col,
                    rowname=X_row,
                    print_log=verbose)
  # ====== read preprocessed data ====== #
  ds = Dataset(preprocessed_path, read_only=True)
  return SingleCellOMIC(
      X=ds['X'],
      cell_id=ds['X_row'],
      gene_id=ds['X_col'],
      omic='transcriptomic',
      name=f"pbmcCITEseq{'' if filtered_genes else 'all'}",
  ).add_omic('proteomic', ds['y'], ds['y_col'])
Esempio n. 5
0
def read_full_FACS(override=False, verbose=False):
    """Load the full FACS data set (GSE75478) as a read-only `Dataset`.

    Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE75478
    This is the full FACS data of 2 individuals with 7 protein markers.

    The four gzipped CSV files (raw transcriptomics and FACS indices for
    individuals I1 and I2) are downloaded when missing. For each individual
    only the entries present in both files are kept; the two individuals
    are then restricted to their shared genes and shared '_cd' markers and
    concatenated. Marker columns with a negative minimum are shifted so
    their minimum becomes 0, and all-zero gene columns are removed.

    Parameters
    ----------
    override : bool
        If True, delete an existing preprocessed folder and rebuild it.
    verbose : bool
        Print download messages and forward logging to the helpers.

    Returns
    -------
    Dataset
        Read-only dataset opened at `_FACS_PREPROCESSED % 7`.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_full")
    # makedirs also creates missing parent folders and tolerates an
    # already-existing directory
    os.makedirs(download_path, exist_ok=True)
    # ====== download the data ====== #
    file_url = [
        ('GSE75478_transcriptomics_facs_indeces_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_facs_indeces_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Ffacs%5Findeces%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I1.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI1%2Ecsv%2Egz'
         ),
        ('GSE75478_transcriptomics_raw_filtered_I2.csv.gz',
         'https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE75478&format=file&file=GSE75478%5Ftranscriptomics%5Fraw%5Ffiltered%5FI2%2Ecsv%2Egz'
         ),
    ]
    for name, url in file_url:
        filename = os.path.join(download_path, name)
        if not os.path.exists(filename):
            if verbose:
                # f-string so the actual file path is shown
                print(f"Downloading file '{filename}' ...")
            urlretrieve(url=url, filename=filename)
    # ====== extract the data ====== #
    preprocessed_path = _FACS_PREPROCESSED % 7
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        data_map = {}
        for name, _ in file_url:
            zip_path = os.path.join(download_path, name)
            with gzip.open(zip_path, 'rb') as f:
                # Each CSV is transposed after parsing: column 0 then holds
                # the file's first line, row 0 the first field of each line.
                data_map[name.split('.')[0]] = np.array(
                    [str(line, 'utf-8').strip().split(',') for line in f]).T

        i1 = data_map['GSE75478_transcriptomics_raw_filtered_I1']
        f1 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I1']

        i2 = data_map['GSE75478_transcriptomics_raw_filtered_I2']
        f2 = data_map['GSE75478_transcriptomics_facs_indeces_filtered_I2']
        # Matching duplicated row in `i` and `f`; the leading [True] keeps
        # row 0 of each transposed table
        row_name = set(i1[1:, 0]) & set(f1[1:, 0])
        i1 = i1[[True] + [i in row_name for i in i1[1:, 0]], :]
        f1 = f1[[True] + [i in row_name for i in f1[1:, 0]], :]
        assert np.all(i1[:, 0] == f1[:, 0])

        row_name = set(i2[1:, 0]) & set(f2[1:, 0])
        i2 = i2[[True] + [i in row_name for i in i2[1:, 0]], :]
        f2 = f2[[True] + [i in row_name for i in f2[1:, 0]], :]
        assert np.all(i2[:, 0] == f2[:, 0])

        # Matching the genes and protein among individuals
        gene_name = set(i1[0][1:]) & set(i2[0][1:])
        i1 = i1[:, [True] + [i in gene_name for i in i1[0][1:]]]
        i2 = i2[:, [True] + [i in gene_name for i in i2[0][1:]]]
        assert np.all(i1[0] == i2[0])
        gene = np.concatenate((i1, i2[1:]), axis=0)

        # shared markers whose name contains '_cd', in sorted order
        prot_name = sorted(
            i for i in set(f1[0][1:]) & set(f2[0][1:]) if '_cd' in i)
        f1_header = f1[0].tolist()
        f2_header = f2[0].tolist()
        f1 = f1[:, [0] + [f1_header.index(i) for i in prot_name]]
        f2 = f2[:, [0] + [f2_header.index(i) for i in prot_name]]
        assert np.all(f1[0] == f2[0])
        prot = np.concatenate((f1, f2[1:]), axis=0)

        # ====== save data to disk ====== #
        X = gene[1:, 1:].astype('float32')
        X_row = gene[1:, 0]
        X_col = gene[0, 1:]
        X_col = np.array([i.replace('"', '') for i in X_col])

        y = prot[1:, 1:].astype('float32')
        y_row = prot[1:, 0]
        # strip quotes, keep the part after the last '_' and upper-case it
        y_col = np.array(
            [i.replace('"', '').split('_')[-1].upper() for i in prot[0, 1:]])

        assert np.all(X_row == y_row)
        X_row = np.array([i.replace('"', '') for i in X_row])

        # ====== the protein marker can be smaller than zero ====== #
        # shift only the columns whose minimum is negative
        min_values = np.min(y, axis=0, keepdims=True)
        min_values = np.where(min_values > 0, 0, min_values)
        y = y + np.abs(min_values)
        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)
        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds
Esempio n. 6
0
def read_FACS(n_protein, override=False, verbose=False):
    """Load the FACS data set with 2 or 5 protein markers as a `Dataset`.

    When no preprocessed copy exists, the encrypted archive for the chosen
    marker count is downloaded, its `.npz` / `.csv` members are decoded,
    all-zero gene columns are removed and the arrays are saved to
    `_FACS_PREPROCESSED % n_protein`.

    Parameters
    ----------
    n_protein : int
        Number of protein markers; converted with `int()` and must be 2
        or 5 (checked with an `assert`).
    override : bool
        If True, delete an existing preprocessed folder and rebuild it.
    verbose : bool
        Print the shape of every extracted member and forward logging to
        the helpers.

    Returns
    -------
    Dataset
        Read-only dataset opened at the preprocessed folder.

    Raises
    ------
    RuntimeError
        If the archive contains a member that is neither `.npz` nor `.csv`.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
    # makedirs also creates missing parent folders and tolerates an
    # already-existing directory
    os.makedirs(download_path, exist_ok=True)

    n_protein = int(n_protein)
    assert n_protein in (2, 5)

    preprocessed_path = _FACS_PREPROCESSED % n_protein
    if override and os.path.exists(preprocessed_path):
        shutil.rmtree(preprocessed_path)
    os.makedirs(preprocessed_path, exist_ok=True)
    # ******************** preprocessed data NOT found ******************** #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        # ====== download the data ====== #
        url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        # NOTE: the archive is fetched every time preprocessing runs, even
        # if a copy is already in `download_path`
        urlretrieve(url=url, filename=zip_path)
        # ====== extract the data ====== #
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
            if verbose:
                print('%-12s' % base_name, ':', data.shape)
        # ====== post-processing ====== #
        X = data_dict['X'].astype('float32')
        X = np.array(X)
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]

        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]

        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"

        # ====== filter zero columns ====== #
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)

        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ******************** read preprocessed data ******************** #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds