Example #1
0
 def get_dataset(clazz, ext='', override=False):
     """Download, decrypt and unzip the dataset belonging to `clazz`, then open it.

     A non-empty, already-extracted dataset folder is reused as-is unless
     `override` is True.  The downloaded zip archive is removed once the
     extraction succeeded.  Returns a read-only `Dataset`.
     """
     # Resolve all relevant paths; the remote origin is stored
     # base64-encoded on DataLoader.ORIGIN.
     name = clazz.get_name(ext) + '.zip'
     path = base64.decodebytes(DataLoader.ORIGIN).decode() + name
     zip_path = clazz.get_zip_path(ext)
     out_path = clazz.get_ds_path(ext)
     # A plain file at the output location cannot hold the unzipped folder.
     if os.path.isfile(out_path):
         raise RuntimeError("Found a file at path: %s, we need a folder "
                            "to unzip downloaded files." % out_path)
     if os.path.isdir(out_path):
         # Reuse the cache unless it is empty or a refresh was requested.
         if not override and len(os.listdir(out_path)) > 0:
             return Dataset(out_path, read_only=True)
         shutil.rmtree(out_path)
     # Fetch the archive, forcing a re-download when overriding.
     if override and os.path.exists(zip_path):
         os.remove(zip_path)
     if not os.path.exists(zip_path):
         get_file(name, path, DataLoader.BASE_DIR)
     # Decrypt + extract, open the dataset, then drop the archive.
     unzip_aes(in_path=zip_path, out_path=out_path)
     ds = Dataset(out_path, read_only=True)
     if os.path.exists(zip_path):
         os.remove(zip_path)
     return ds
Example #2
0
 def get_dataset(clazz, ext='', override=False):
   """Fetch, decrypt and open the zipped dataset associated with `clazz`.

   When the dataset folder already exists and is non-empty (and `override`
   is False) the cached copy is opened directly; otherwise the archive is
   (re)downloaded, decrypted/extracted, and deleted afterwards.
   """
   zip_name = clazz.get_name(ext) + '.zip'
   # The download origin is kept base64-encoded on DataLoader.ORIGIN.
   url = base64.decodebytes(DataLoader.ORIGIN).decode() + zip_name
   zip_path = clazz.get_zip_path(ext)
   out_path = clazz.get_ds_path(ext)
   # The extraction target must be a directory (or not exist yet).
   if os.path.isfile(out_path):
     raise RuntimeError("Found a file at path: %s, we need a folder "
                        "to unzip downloaded files." % out_path)
   elif os.path.isdir(out_path):
     keep_cache = not override and len(os.listdir(out_path)) != 0
     if keep_cache:
       return Dataset(out_path, read_only=True)
     shutil.rmtree(out_path)
   # Download the archive unless a previously fetched copy can be reused.
   if override and os.path.exists(zip_path):
     os.remove(zip_path)
   if not os.path.exists(zip_path):
     get_file(zip_name, url, DataLoader.BASE_DIR)
   # Decrypt + extract, open read-only, then clean up the archive.
   unzip_aes(in_path=zip_path, out_path=out_path)
   ds = Dataset(out_path, read_only=True)
   if os.path.exists(zip_path):
     os.remove(zip_path)
   return ds
Example #3
0
def read_CITEseq_CBMC(filtered_genes=True, override=False, verbose=True):
    """Load the CITE-seq CBMC dataset as a `SingleCellOMIC` object.

    On first use this downloads (md5-checked) and decrypts the archive,
    caches the processed arrays plus a pickled set of up to 2000
    high-dispersion genes, then all later calls read from the cache.

    Parameters
    ----------
    filtered_genes : bool
        If True, subset the returned object to the cached top-gene set.
    override : bool
        If True, wipe and rebuild the preprocessed cache.
    verbose : bool
        Print progress messages.

    Returns
    -------
    `SingleCellOMIC` with transcriptomic `X` plus an added 'proteomic' omic.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "CBMC_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    preprocessed_path = _CITEseq_CBMC_PREPROCESSED
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        if verbose:
            print("Overriding path: %s" % _CITEseq_CBMC_PREPROCESSED)
        shutil.rmtree(_CITEseq_CBMC_PREPROCESSED)
        os.mkdir(_CITEseq_CBMC_PREPROCESSED)
    # ******************** preprocessed data NOT found ******************** #
    # Presence of the saved 'X' array marks a completed preprocessing run.
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # ====== download the data ====== #
        # The URL is stored base64-encoded in the module constant _URL.
        url = str(base64.decodebytes(_URL), 'utf-8')
        base_name = os.path.basename(url)
        zip_path = os.path.join(download_path, base_name)
        download_file(filename=zip_path,
                      url=url,
                      override=False,
                      md5=r"beb76d01a67707c61c21bfb188e1b69f")
        # ====== extract the data ====== #
        # Decrypt the AES-protected zip; members are either sparse .npz
        # matrices or .csv label lists, stored keyed by name w/o extension.
        data_dict = {}
        for name, data in crypto.unzip_aes(zip_path,
                                           password=_PASSWORD,
                                           verbose=False):
            base_name = os.path.splitext(name)[0]
            if '.npz' in name:
                data = sp.sparse.load_npz(BytesIO(data)).todense()
            elif '.csv' in name:
                data = np.loadtxt(StringIO(str(data, 'utf-8')),
                                  dtype=str,
                                  delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % name)
            data_dict[base_name] = data
        # ====== post-processing ====== #
        # Gene-count matrix: densify, drop all-zero gene columns, then
        # check that row/column label lengths match the matrix shape.
        X = np.array(data_dict['X'].astype('float32'))
        X_row, X_col = data_dict['X_row'], data_dict['X_col']
        X, X_col = remove_allzeros_columns(matrix=X, colname=X_col)
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]
        # Protein-count matrix and its labels.
        y = data_dict['y'].astype('float32')
        y_row, y_col = data_dict['y_row'], data_dict['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]
        # Both matrices must describe the same cells in the same order.
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"
        # save data
        if verbose:
            print(f"Saving data to {preprocessed_path} ...")
        save_to_dataset(preprocessed_path,
                        X,
                        X_col,
                        y,
                        y_col,
                        rowname=X_row,
                        print_log=verbose)
        # Select up to 2000 highly-variable genes with scanpy and cache the
        # resulting gene-name set.  NOTE(review): this filtering only
        # determines 'top_genes' — the full matrix saved above is untouched.
        sco = SingleCellOMIC(X, cell_id=X_row, gene_id=X_col)
        sc.pp.filter_cells(sco, min_genes=200)
        sc.pp.filter_genes(sco, min_cells=3)
        sc.pp.normalize_total(sco, target_sum=1e4)
        result = sc.pp.filter_genes_dispersion(sco.X,
                                               min_mean=0.0125,
                                               max_mean=3,
                                               min_disp=0.5,
                                               log=False,
                                               n_top_genes=2000)
        sco._inplace_subset_var(result.gene_subset)
        with open(os.path.join(preprocessed_path, 'top_genes'), 'wb') as f:
            pickle.dump(set(sco.var_names.values), f)
        del sco
    # ====== read preprocessed data ====== #
    ds = Dataset(preprocessed_path, read_only=True)
    sco = SingleCellOMIC(
        X=ds['X'],
        cell_id=ds['X_row'],
        gene_id=ds['X_col'],
        omic='transcriptomic',
        name=f"cbmcCITEseq{'' if filtered_genes else 'all'}",
    ).add_omic('proteomic', ds['y'], ds['y_col'])
    if filtered_genes:
        # Restrict the returned object to the cached high-dispersion genes.
        with open(os.path.join(preprocessed_path, 'top_genes'), 'rb') as f:
            top_genes = pickle.load(f)
        sco._inplace_subset_var([i in top_genes for i in sco.var_names])
    return sco
Example #4
0
def read_FACS(n_protein, override=False, verbose=False):
    """Load the FACS single-cell dataset with `n_protein` (2 or 5) markers.

    The encrypted archive is downloaded and preprocessed once; later calls
    reuse the cached folder unless `override` is True.  Returns a
    read-only `Dataset`.
    """
    download_path = os.path.join(DOWNLOAD_DIR, "FACS_original")
    if not os.path.exists(download_path):
        os.mkdir(download_path)

    n_protein = int(n_protein)
    assert n_protein in (2, 5)

    preprocessed_path = _FACS_PREPROCESSED % n_protein
    if not os.path.exists(preprocessed_path):
        os.mkdir(preprocessed_path)
    elif override:
        # Throw away the cached preprocessing and start from scratch.
        shutil.rmtree(preprocessed_path)
        os.mkdir(preprocessed_path)
    # ---- preprocess only when the cached 'X' array is absent ---- #
    if not os.path.exists(os.path.join(preprocessed_path, 'X')):
        X, X_row, X_col = [], None, None
        y, y_row, y_col = [], None, None
        # Download the encrypted archive (URL is kept base64-encoded).
        url = str(base64.decodebytes(_URL), 'utf-8') % n_protein
        zip_path = os.path.join(download_path, os.path.basename(url))
        urlretrieve(url=url, filename=zip_path)
        # Decrypt and load every archive member: sparse .npz matrices are
        # densified, .csv label files are parsed as string arrays.
        arrays = {}
        for member, payload in crypto.unzip_aes(zip_path,
                                                password=_PASSWORD,
                                                verbose=False):
            key = os.path.splitext(member)[0]
            if '.npz' in member:
                payload = sp.sparse.load_npz(BytesIO(payload)).todense()
            elif '.csv' in member:
                payload = np.loadtxt(StringIO(str(payload, 'utf-8')),
                                     dtype=str,
                                     delimiter=',')
            else:
                raise RuntimeError("Unknown format: %s" % member)
            arrays[key] = payload
            if verbose:
                print('%-12s' % key, ':', payload.shape)
        # Sanity-check label lengths against matrix shapes.
        X = np.array(arrays['X'].astype('float32'))
        X_row, X_col = arrays['X_row'], arrays['X_col']
        assert len(X_row) == X.shape[0] and len(X_col) == X.shape[1]

        y = arrays['y'].astype('float32')
        y_row, y_col = arrays['y_row'], arrays['y_col']
        assert len(y_row) == y.shape[0] and len(y_col) == y.shape[1]

        # Both matrices must describe the same cells in the same order.
        assert np.all(X_row == y_row), \
        "Cell order mismatch between gene count and protein count"

        # Genes counted in no cell at all carry no information.
        X, X_col = remove_allzeros_columns(matrix=X,
                                           colname=X_col,
                                           print_log=verbose)

        save_to_dataset(path=preprocessed_path,
                        X=X,
                        X_col=X_col,
                        y=y,
                        y_col=y_col,
                        rowname=X_row,
                        print_log=verbose)
    # ---- open the cached, preprocessed dataset ---- #
    ds = Dataset(preprocessed_path, read_only=True)
    return ds