Example #1
 def testMaxVarGenes(self):
     """
     test max variance genes for dense and sparse matrices
     """
     n_genes = self.data_sparse.shape[0]
     genes1 = uncurl.max_variance_genes(self.data_dense, nbins=1, frac=0.5)
     genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=1, frac=0.5)
     self.assertEqual(set(genes1), set(genes2))
     self.assertEqual(len(genes1), int(0.5*n_genes))
     genes1 = uncurl.max_variance_genes(self.data_dense, nbins=5, frac=0.2)
     genes2 = uncurl.max_variance_genes(self.data_sparse, nbins=5, frac=0.2)
     self.assertEqual(set(genes1), set(genes2))
     self.assertEqual(len(genes1), 5*int((n_genes/5)*0.2))
 def setUp(self):
     dat = scipy.io.loadmat('data/10x_pooled_400.mat')
     self.data = scipy.sparse.csc_matrix(dat['data'])
     self.labels = dat['labels'].flatten()
     # 2. gene selection
     genes = uncurl.max_variance_genes(self.data)
     self.data_subset = self.data[genes, :]
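The assertions in testMaxVarGenes above encode how max_variance_genes sizes its output: genes are binned by mean expression into nbins bins, and the top frac of each bin is kept by variance, giving nbins * int((n_genes / nbins) * frac) indices in total. A self-contained sketch of the same checks on hypothetical random data (it mirrors the unit test rather than documenting new API):

import numpy as np
import scipy.sparse
import uncurl

rng = np.random.default_rng(0)
dense = rng.poisson(2.0, size=(1000, 40)).astype(float)  # (genes, cells)
sparse_mat = scipy.sparse.csc_matrix(dense)
genes_d = uncurl.max_variance_genes(dense, nbins=5, frac=0.2)
genes_s = uncurl.max_variance_genes(sparse_mat, nbins=5, frac=0.2)
assert set(genes_d) == set(genes_s)                # dense and sparse agree
assert len(genes_d) == 5 * int((1000 / 5) * 0.2)   # 200 selected genes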
Example #3
def preproc_data(data):
    """
    basic data preprocessing
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]
    tsvd = TruncatedSVD(8)
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T)
    return data_tsvd
 def test_real_data_pairwise(self):
     mat = scipy.io.loadmat('data/10x_pooled_400.mat')
     data = mat['data']
     # do uncurl, followed by update_m
     selected_genes = uncurl.max_variance_genes(data)
     data_subset = data[selected_genes, :]
     m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20, inner_max_iters=50)
     m = uncurl.update_m(data, m, w, selected_genes)
     # test pairwise
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_pairwise(m, w, mode='counts')
     self.assertEqual(all_pvs.shape, (data.shape[0], 8, 8))
     self.assertEqual(all_ratios.shape, (data.shape[0], 8, 8))
     self.assertTrue((all_pvs < 0.001).sum() < data.shape[0])
     self.assertTrue((all_pvs < 0.01).sum() > 100)
Example #5
def preproc_data(data, gene_subset=False, **kwargs):
    """
    basic data preprocessing before running gap score

    Assumes that data is a matrix of shape (genes, cells).

    Returns a matrix of shape (cells, 8), using the first 8 SVD
    components. Why 8? It's an arbitrary selection...
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    data_subset = data
    if gene_subset:
        gene_subset = uncurl.max_variance_genes(data)
        data_subset = data[gene_subset, :]
    tsvd = TruncatedSVD(min(8, data_subset.shape[0] - 1))
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T)
    return data_tsvd
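A minimal usage sketch for this helper on hypothetical random counts (the output shape assumes uncurl's default gene-selection fraction):

import numpy as np
import scipy.sparse

rng = np.random.default_rng(0)
counts = scipy.sparse.csc_matrix(rng.poisson(2.0, size=(200, 50)).astype(float))
embedding = preproc_data(counts, gene_subset=True)
print(embedding.shape)  # (50, 8): one row per cell, 8 SVD components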
Example #6
def run_partition(data, smallk, largek, method, max_depth):
    """
    Very simple recursive partitioning-based state estimation system.

    Args:
        data: array of shape (genes, cells)
        smallk (int): k for each individual clustering
        largek (int): k for the whole global clustering
        method: state-estimation runner whose run(data) returns ([w, m], ll)
        max_depth (int): maximum recursion depth
    """
    # what if some cell subsets have zero gene expression values?
    # we reduce the gene subset and then re-position m
    print('run partition: data shape={0}, smallk={1}, largek={2}'.format(
        data.shape, smallk, largek))
    genes = uncurl.max_variance_genes(data, nbins=1, frac=1.0)
    results, ll = method.run(data[genes, :])
    w = results[0]
    m_ = results[1]
    m = np.zeros((data.shape[0], smallk))
    m[genes, :] = m_
    clusters_0 = w.argmax(0)
    if max_depth == 0:
        print('return at depth 0')
        return m, w
    m_new = np.zeros((m.shape[0], largek))
    w_new = np.zeros((largek, w.shape[1]))
    # the number of global clusters allotted to each sub-cluster
    n_k = largek // smallk  # integer division: range() below needs ints
    for i in range(smallk):
        # TODO: how to deal with uncertain (high entropy) cells?
        # soft-cluster n percentile of the cells with the highest entropy
        # (include them in both subsets),
        # after returning, use the sub-cluster with lower entropy.
        data_c0 = data[:, clusters_0 == i]
        m_s1, w_s1 = run_partition(data_c0, smallk, largek // 2, method,
                                   max_depth - 1)
        print(m_s1.shape)
        print(w_s1.shape)
        # place the sub-results for m and w back into the big one
        k_range = range(i * n_k, (i + 1) * n_k)
        m_new[:, k_range] = m_s1
        w_new[np.ix_(k_range, clusters_0 == i)] = w_s1
    return m_new, w_new
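The np.ix_ call above is what scatters each child's W block back into the global matrix: it forms an open mesh so the rows in k_range and the columns of cluster i are indexed jointly. A self-contained illustration of that indexing pattern:

import numpy as np

A = np.zeros((4, 6))
rows = [2, 3]                                              # stands in for k_range
cols = np.array([True, False, True, False, True, False])   # clusters_0 == i
A[np.ix_(rows, cols)] = 1.0                                # fills the 2 x 3 sub-block
print(A)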
Example #7
    def setUp(self):
        data = scipy.io.loadmat('data/10x_pooled_400.mat')

        data_csc = data['data']
        self.labels = data['labels'].flatten()
        #gene_names = data['gene_names']

        # 2. gene selection
        genes = uncurl.max_variance_genes(data_csc)
        self.data_subset = data_csc[genes, :]
        #gene_names_subset = gene_names[genes]

        # 3. run uncurl
        m, w, ll = uncurl.run_state_estimation(self.data_subset,
                                               8,
                                               max_iters=20,
                                               inner_max_iters=50)
        print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
        self.m = m
        self.w = w
Example #8
 def test_10x_update_m(self):
     """
     Test after updating M
     """
     from uncurl.state_estimation import update_m
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     M, W, ll = uncurl.run_state_estimation(data_subset,
                                            clusters=0,
                                            max_iters=10,
                                            inner_max_iters=50)
     new_M = update_m(self.data, M, W, genes)
     self.assertEqual(new_M.shape, (self.data.shape[0], W.shape[0]))
     self.assertFalse(np.isnan(new_M).any())
     # test RMSE
     test_data = np.dot(new_M, W)
     error = self.data.toarray() - test_data
     error = np.sqrt(np.mean(error**2))
     print('M update RMSE:', error)
     self.assertTrue(error < 2.0)
 def test_real_data_1_vs_rest(self):
     mat = scipy.io.loadmat('data/10x_pooled_400.mat')
     data = mat['data']
     # do uncurl, followed by update_m
     selected_genes = uncurl.max_variance_genes(data)
     data_subset = data[selected_genes, :]
     m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20, inner_max_iters=50)
     m = uncurl.update_m(data, m, w, selected_genes)
     # TODO: how should the p-values be tested?
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w)
     all_pvs = np.array(all_pvs)
     all_ratios = np.array(all_ratios)
     self.assertTrue((all_pvs < 0.05).sum() > 100)
     self.assertTrue((all_ratios > 10).sum() > 100)
     self.assertEqual(all_pvs.shape, (data.shape[0], 8))
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w, mode='counts')
     all_pvs = np.array(all_pvs)
     all_ratios = np.array(all_ratios)
     self.assertEqual(all_pvs.shape, (data.shape[0], 8))
     self.assertTrue((all_pvs < 0.01).sum() > 100)
     self.assertTrue((all_pvs < 0.01).sum() < data.shape[0])
     self.assertTrue((all_ratios > 10).sum() > 100)
Example #10
 def test_Zeisel(self):
     # gene selection
     genes = uncurl.max_variance_genes(self.data_z)
     data_subset = self.data_z[genes, :]
     # smaller # of iterations than default so it finishes faster...
     se = uncurl.experiment_runner.PoissonSE(clusters=7,
                                             max_iters=10,
                                             inner_max_iters=80)
     argmax = uncurl.experiment_runner.Argmax(n_classes=7)
     km = uncurl.experiment_runner.KM(n_classes=7)
     methods = [(se, [argmax, km])]
     results, names, other = uncurl.experiment_runner.run_experiment(
         methods,
         data_subset,
         7,
         self.labs_z,
         n_runs=1,
         use_purity=False,
         use_nmi=True)
     print(results)
     # NMI should be > 0.75 on Zeisel subset as well
     self.assertTrue(results[0][0] > 0.75)
     self.assertTrue(results[0][1] > 0.75)
Example #11
 def test_10x_auto_cluster(self):
     """
     Test using automatic cluster size determination
     """
     from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
     # gene selection
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     M, W, ll = uncurl.run_state_estimation(data_subset,
                                            clusters=0,
                                            max_iters=10,
                                            inner_max_iters=80)
     labels = W.argmax(0)
     # NMI should be > 0.75 on 10x_pure_pooled with default iterations;
     # assert a looser 0.6 to account for the reduced iteration count
     self.assertTrue(nmi(self.labs, labels) > 0.6)
     # test RMSE
     test_data = np.dot(M, W)
     error = data_subset.toarray() - test_data
     error = np.sqrt(np.mean(error**2))
     print('data subset RMSE:', error)
     self.assertTrue(error < 2.0)
Example #12
 def test_10xSE(self):
     # gene selection
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     se = uncurl.experiment_runner.PoissonSE(clusters=8,
                                             max_iters=10,
                                             inner_max_iters=80)
     argmax = uncurl.experiment_runner.Argmax(n_classes=8)
     km = uncurl.experiment_runner.KM(n_classes=8)
     methods = [(se, [argmax, km])]
     results, names, other = uncurl.experiment_runner.run_experiment(
         methods,
         data_subset,
         8,
         self.labs,
         n_runs=1,
         use_purity=False,
         use_nmi=True)
     print(results)
     # NMI should be > 0.75 on 10x_pure_pooled
     # (accounting for lower than default iter count)
     self.assertTrue(results[0][0] > 0.75)
     self.assertTrue(results[0][1] > 0.75)
Example #13
import numpy as np
import scipy.io
from uncurl_analysis import clustering_methods
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
import uncurl


# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')

data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes, :]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))


# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T, n_neighbors=n_neighbors, metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
Example #14
import time

import numpy as np
from uncurl_analysis import poisson_diffexp
import scipy.io
import uncurl

mat = scipy.io.loadmat('data/10x_pooled_400.mat')
data = mat['data']
# do uncurl, followed by update_m
selected_genes = uncurl.max_variance_genes(data)
data_subset = data[selected_genes, :]
m, w, ll = uncurl.run_state_estimation(data_subset,
                                       8,
                                       max_iters=20,
                                       inner_max_iters=50)
m = uncurl.update_m(data, m, w, selected_genes)

t0 = time.time()
all_pvs, all_ratios = poisson_diffexp.uncurl_poisson_test_1_vs_rest(
    m, w, mode='counts')
print('diffexp time: ', time.time() - t0)

t0 = time.time()
all_pvs_2, all_ratios_2 = poisson_diffexp.uncurl_poisson_test_pairwise(
    m, w, mode='counts')
print('pairwise diffexp time: ', time.time() - t0)

# test on simulated data
# plotting mw
import matplotlib.pyplot as plt
Example #15
    l3 = np.copy(l1)
    np.random.shuffle(l3)
    print(m_ndcg(l1, l2, l3))
    print(m_ndcg(l1, l1, l1))
    print(m_ndcg(l1, l2, l1))
    print(m_ndcg(l1, l2, l2))
    print(m_ndcg(l1, l3, l3))
    print(m_ndcg(l1, l1, l3))
    exit(0)
    X1 = scipy.io.mmread('data_8000_cells.mtx')
    X1 = X1.tocsc()
    true_labels1 = np.loadtxt('labels_8000_cells.txt').astype(int).flatten()

    k = 8
    frac = 0.2
    genes = uncurl.max_variance_genes(X1, nbins=5, frac=frac)
    data_subset = X1[genes, :]
    n_genes = data_subset.shape[0]

    # TODO: run uncurl

    se_mw = uncurl.experiment_runner.PoissonSE(clusters=k, return_m=True)
    se_mw_2 = uncurl.experiment_runner.PoissonSE(clusters=2, return_m=True)

    # layer 1:
    t0 = time.time()
    print('starting recursive uncurl')
    m, w = run_partition(data_subset, 2, 8, se_mw_2, 2)
    print('time elapsed: {0}'.format(time.time() - t0))
    print('nmi: {0}'.format(nmi(w.argmax(0), true_labels1)))
Example #16
        mw = torch.matmul(m, w)
        return mw.numpy()


if __name__ == '__main__':
    import uncurl
    from uncurl.state_estimation import objective
    from uncurl.preprocessing import cell_normalize, log1p
    import scipy.io
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    actual_labels = mat['labels'].squeeze()
    X = mat['data'].toarray().astype(np.float32)
    genes = uncurl.max_variance_genes(X, 5, 0.2)
    X_subset = X[genes, :]

    X_log_norm = log1p(cell_normalize(X_subset)).astype(np.float32)
    uncurl_net = UncurlNet(X_log_norm,
                           8,
                           use_reparam=False,
                           use_decoder=False,
                           use_batch_norm=True,
                           hidden_layers=2,
                           hidden_units=200,
                           loss='mse')
    m_init = torch.tensor(uncurl_net.M)

    uncurl_net.pre_train_encoder(None, lr=1e-3, n_epochs=20, log_interval=10)
    uncurl_net.train_model(None, lr=1e-3, n_epochs=50, log_interval=10)
Example #17
from scipy import sparse
from scipy.io import loadmat

import uncurl
from uncurl.sparse_utils import symmetric_kld
from uncurl.vis import visualize_dim_red

# note: this whole script should finish in under a few minutes.

if __name__ == '__main__':

    # 1. load data - 753 cells, 19971 genes
    dat = loadmat('data/GSE60361_dat.mat')
    data = dat['Dat']
    true_labels = dat['ActLabs'].flatten()
    data_csc = sparse.csc_matrix(data)

    # 2. gene selection
    genes = uncurl.max_variance_genes(data_csc, nbins=5, frac=0.2)
    data_subset = data_csc[genes, :]

    # 3. state estimation
    k = 7  # number of clusters to use
    M, W, ll = uncurl.poisson_estimate_state(data_subset, k)
    argmax_labels = W.argmax(0)

    # 4. visualization

    # mds visualization
    mds_proj = uncurl.mds(M, W, 2)
    visualize_dim_red(mds_proj,
                      true_labels,
                      'GSE60361_mds_true_labels.png',
                      title='MDS',
Example #18
def generate_uncurl_analysis(data,
                             output_dir,
                             data_type='dense',
                             gene_names=None,
                             gene_sub=True,
                             **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the given
    directory.

    Outputs:
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a list of (gene_id : c_score) sorted by c_score)
        output_dir/mds_means.txt (mds of the means)
        output_dir/mds_data.txt (mds projection of data)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)

    Args:
        data (array or str): either a data array, or a string containing
            the path to a data array.
        output_dir (str): directory to write output to.
        data_type (str): if data is a path, this indicates whether the data is a dense or sparse array.
        gene_names (list or array): list of all gene names
        gene_sub (bool): whether or not to use gene subset selection (max_variance_genes)
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation; must include clusters=k.
    """
    try:
        os.makedirs(output_dir)
    except OSError:
        print('could not make output dir: {0}'.format(output_dir))
    if isinstance(data, str):
        if data_type == 'dense':
            data = np.loadtxt(data)
        elif data_type == 'sparse':
            data = scipy.io.mmread(data)
            data = sparse.csc_matrix(data)
    if isinstance(gene_names, str):
        gene_names = np.loadtxt(gene_names, dtype=str)
    # run uncurl
    if gene_sub:
        genes_subset = np.array(uncurl.max_variance_genes(data))
        np.savetxt(os.path.join(output_dir, 'gene_subset.txt'),
                   genes_subset,
                   fmt='%d')
        data = data[genes_subset, :]
        if gene_names is not None:
            gene_names = gene_names[genes_subset]
    print(uncurl_kwargs)
    m, w, ll = uncurl.run_state_estimation(data, **uncurl_kwargs)
    np.savetxt(os.path.join(output_dir, 'm.txt'), m)
    np.savetxt(os.path.join(output_dir, 'w.txt'), w)
    labels = w.argmax(0)
    np.savetxt(os.path.join(output_dir, 'labels.txt'), labels, fmt='%d')
    # find overexpressed genes for clusters
    top_genes = uncurl_analysis.find_overexpressed_genes(data, w.argmax(0))
    with open(os.path.join(output_dir, 'top_genes.txt'), 'w') as f:
        json.dump(top_genes, f)
    # run mds
    mds_output = uncurl.dim_reduce(m, w, 2)
    print(mds_output.shape)
    np.savetxt(os.path.join(output_dir, 'mds_means.txt'), mds_output.T)
    mds_data = uncurl.mds(m, w, 2)
    np.savetxt(os.path.join(output_dir, 'mds_data.txt'), mds_data)
    if gene_names is not None:
        np.savetxt(os.path.join(output_dir, 'gene_names.txt'),
                   gene_names,
                   fmt='%s')
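A hypothetical invocation of the function above ('data/my_counts.mtx' and the output directory are placeholder names; clusters is forwarded through **uncurl_kwargs to run_state_estimation):

generate_uncurl_analysis('data/my_counts.mtx',
                         'uncurl_output',
                         data_type='sparse',
                         gene_names=None,
                         gene_sub=True,
                         clusters=8)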
Example #19
    import os
    import pandas as pd
    import scipy.io
    from purity_analysis import plot_df, build_simple_table

    data_counts = pd.read_csv(
        '../uncurl_test_datasets/tasic_allen_brain_map/genes_counts.csv')
    X1 = data_counts.iloc[:, 1:].to_numpy()  # as_matrix() was removed in newer pandas
    X1 = sparse.csc_matrix(X1)
    cell_classification = pd.read_csv(
        '../uncurl_test_datasets/tasic_allen_brain_map/cell_classification.csv'
    )
    actual_labels = cell_classification.primary

    k = 49
    genes = uncurl.max_variance_genes(X1, 5, 0.2)
    data_subset = X1[genes, :]

    log = uncurl.experiment_runner.Log()
    log_norm = uncurl.experiment_runner.LogNorm()
    uncurl_net_runner = UncurlNetRunner(k=k, loss='mse')
    uncurl_runner = experiment_runner.PoissonSE(clusters=k)
    uncurl_net_runner_2_hidden_layers = UncurlNetRunner(
        k=k, hidden_layers=2, loss='mse', output_names=['UncurlNetW_2_400'])
    uncurl_net_runner_2_hidden_layers_2 = UncurlNetRunner(
        k=k,
        hidden_layers=2,
        loss='mse',
        n_model_epochs=100,
        output_names=['UncurlNetW_2_400_100iters'])
    uncurl_net_runner_100_units = UncurlNetRunner(
        k=k, hidden_units=100, hidden_layers=2, loss='mse',
        output_names=['UncurlNetW_2_100'])


def bnpy_select_clusters(data):
    # training call elided in the source; the surviving kwargs
    # moves='birth,merge,shuffle', m_startLap=5, b_startLap=2, b_Kfresh=4
    # configure bnpy's birth/merge/shuffle moves for the training run
    ...
    selected_k = info_dict['K_history'][-1]
    results = trained_model.calc_local_params(data_dense_bnpy)
    cluster_labels = results['resp'].argmax(1)
    return selected_k, cluster_labels


if __name__ == '__main__':
    import time
    # load/subset data
    data_mat = scipy.io.loadmat('../data/10x_pooled_400.mat')
    data = data_mat['data']
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]

    # run bnpy clustering?
    true_labels = data_mat['labels'].flatten()
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data_subset)
    print(selected_k)
    print('nmi: ' + str(nmi(true_labels, labels)))
    print('time: ' + str(time.time() - t0))

    data_mat_2 = scipy.io.loadmat('../../uncurl_python/data/SCDE_k2_sup.mat')
    data = data_mat_2['Dat']
    t0 = time.time()
    selected_k, labels = bnpy_select_clusters(data)
    true_labels = data_mat_2['Lab'].flatten()
Example #21
    try:
        os.makedirs(dire_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise e


args = parse_args()
print("run with these parametres: %s" % str(args))

data = pd.read_csv(args.input, index_col=0)

if args.gene_subset == 'non_zero':
    genes_subset = np.sum(data.values, axis=1) != 0  # select nonzero genes
elif args.gene_subset == 'max_variance':
    genes_subset = max_variance_genes(data.values, nbins=5, frac=0.2) # select genes with max variance
else:
    raise NotImplementedError("optin `%s` for `gene_subset` not defined." % args.gene_subset)

data_subset = data.iloc[genes_subset, :]
M, W, ll = run_state_estimation(data_subset.values, clusters=args.clusters,
                                dist=args.dist, disp=True,
                                max_iters=args.max_iters,
                                inner_max_iters=args.inner_max_iters,
                                initialization=args.initialization,
                                threads=args.threads)

print("ll: %f" % ll)

data.iloc[genes_subset, :] = np.matmul(M, W) # imputation
Example #22
            m = m.numpy()
            mw = m.dot(w)
            return [w, mw], 0
        else:
            return [w], 0

if __name__ == '__main__':
    import os
    import pandas as pd
    import scipy.io
    from purity_analysis import plot_df, build_simple_table
    data = scipy.io.mmread('../uncurl_test_datasets/10x_pure_pooled/data_8000_cells.mtx.gz')
    data = sparse.csc_matrix(data)
    actual_labels = np.loadtxt('../uncurl_test_datasets/10x_pure_pooled/labels_8000_cells.txt').astype(int).flatten()
    k = len(set(actual_labels))
    genes = uncurl.max_variance_genes(data, 5, 0.2)
    data_subset = data[genes, :]

    # TODO: add experiment for NMF
    log = uncurl.experiment_runner.Log()
    log_norm = uncurl.experiment_runner.LogNorm()
    uncurl_net_runner = UncurlNetRunner(k=k, loss='mse')
    uncurl_runner = experiment_runner.PoissonSE(clusters=k)
    uncurl_net_runner_2_hidden_layers = UncurlNetRunner(k=k, hidden_layers=2, loss='mse', output_names=['UncurlNetW_2_400'])
    uncurl_net_runner_2_hidden_layers_2 = UncurlNetRunner(k=k, hidden_layers=2, loss='mse', n_model_epochs=100, output_names=['UncurlNetW_2_400_100iters'])
    uncurl_net_runner_100_units = UncurlNetRunner(k=k, hidden_units=100, hidden_layers=2, loss='mse', output_names=['UncurlNetW_2_100'])

    vis_dir = '10x_8k_vis'
    try:
        os.makedirs(vis_dir)
    except:
Example #23
def gene_subset_creator(data_normalized, cell_subset, **kwargs):
    data = data_normalized[:, cell_subset]
    gene_subset = uncurl.max_variance_genes(data, nbins=5, frac=kwargs['frac'])
    gene_subset = np.array(gene_subset)
    return gene_subset
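A usage sketch with hypothetical data (gene_subset_creator expects numpy and uncurl at module scope, and frac is required in kwargs):

import numpy as np
import uncurl

rng = np.random.default_rng(0)
data_normalized = rng.poisson(2.0, size=(500, 100)).astype(float)
cell_subset = np.arange(100) < 50   # keep the first half of the cells
genes = gene_subset_creator(data_normalized, cell_subset, frac=0.2)
print(len(genes))  # 5 bins x int(100 * 0.2) genes per bin = 100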