Example #1
import numpy as np
import scanpy as sc
from geosketch import gs
# Note: get_subset() (used below) is a project-specific helper assumed to be in scope.


def get_sketch(adata,
               key,
               folds=10,
               how='pd',
               min_num_per_key=500,
               start='filter'):
    '''Geometric sketching based on the diffusion map and PCA embeddings.

    folds: subsampling factor (sketch roughly 1/folds of the cells per key)
    min_num_per_key: minimum number of cells to sample per key'''
    sketch_index = []
    for smp in set(adata.obs[key]):
        print(smp)
        c = adata.obs[key] == smp

        if start == 'filter':
            sdata = get_subset(adata, c)
        else:
            sdata = adata[c].copy()  # copy the view before modifying it in place
            sc.pp.filter_genes_dispersion(sdata)
            sc.pp.pca(sdata)
        sc.pp.neighbors(sdata)
        sc.tl.diffmap(sdata)

        N = np.max(
            [int(np.sum(c) / folds),
             np.min([min_num_per_key, np.sum(c)])])
        print(N)
        if how == 'pd':
            set1 = set(sdata.obs_names[gs(sdata.obsm['X_diffmap'],
                                          N,
                                          replace=True)])
            set2 = set(sdata.obs_names[gs(sdata.obsm['X_pca'][:, :50],
                                          N,
                                          replace=True)])
            sketch_index.extend(list(set1.union(set2)))
        elif how == 'p':
            set2 = set(sdata.obs_names[gs(sdata.obsm['X_pca'][:, :50],
                                          N,
                                          replace=True)])
            sketch_index.extend(list(set2))
        elif how == 'd':
            set1 = set(sdata.obs_names[gs(sdata.obsm['X_diffmap'][:, :20],
                                          N,
                                          replace=True)])
            sketch_index.extend(list(set1))
        else:
            raise ValueError("how must be one of 'pd', 'p' or 'd'")
    return sketch_index
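
A minimal usage sketch for the function above, hedged: the dataset and the 'sample' column are placeholders, and start='pca' keeps the call off the code path that needs the project-specific get_subset() helper.

adata = sc.datasets.pbmc3k()                 # any AnnData with an .obs column works
adata.obs['sample'] = 'pbmc'                 # single batch, for illustration only
idx = get_sketch(adata, key='sample', folds=10, how='p', start='pca')
sketch = adata[idx]                          # subset to the sketched cell names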
Example #2
def runGeoSketch(adata, N=10000, use_rep="X_pca"):
    from geosketch import gs
    sc.tl.pca(adata)
    sketch_index = gs(adata.obsm[use_rep], N, replace=False)
    adata.uns['geosketch'] = adata.obs.index[sketch_index]
    subdata = adata[adata.obs.index[sketch_index]]
    return subdata
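
Usage note (a minimal sketch, assuming adata is any AnnData): the helper stores the sketched cell names in adata.uns['geosketch'], so they can be reused later.

sub = runGeoSketch(adata, N=5000)    # sketched AnnData of 5000 cells
cells = adata.uns['geosketch']       # the same cells, by name, for reuse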
Example #3
    def subsample(self, counts: pd.DataFrame) -> pd.DataFrame:
        # counts is genes x cells; shape[1] is the number of input cells
        input_cells = counts.shape[1]

        if self.num_cells is None:
            self.num_cells = int(input_cells / 3)

        core_logger.info('Subsampling {} cells to {}'.format(input_cells, self.num_cells))

        counts_t = counts.T

        if self.log:
            pca_input = np.log1p(counts_t)
        else:
            pca_input = counts_t

        try:
            u, s, vt = pca(pca_input.values, k=self.num_pc)
            x_dimred = u[:, :self.num_pc] * s[:self.num_pc]
            sketch_index = gs(x_dimred, self.num_cells, replace=False)
            x_matrix = counts_t.iloc[sketch_index]
        except Exception as e:
            core_logger.warning('Subsampling failed: ignored.')
            if self.verbose:
                core_logger.warning(str(e))
            return counts

        core_logger.info('Done subsampling {} cells to {}'.format(input_cells, self.num_cells))

        return x_matrix.T
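
The same pattern can be run standalone; a minimal sketch assuming fbpca and geosketch are installed, with random counts as a stand-in: log-transform, reduce with PCA, then geometrically sketch the cells.

import numpy as np
import pandas as pd
from fbpca import pca
from geosketch import gs

counts = pd.DataFrame(np.random.poisson(1.0, size=(1000, 300)))  # genes x cells
counts_t = counts.T                                  # cells x genes
u, s, vt = pca(np.log1p(counts_t.values), k=30)      # 30 PCs over cells
x_dimred = u[:, :30] * s[:30]
idx = gs(x_dimred, 100, replace=False)               # pick 100 cells geometrically
subsampled = counts_t.iloc[idx].T                    # genes x (sketched cells)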
Example #4
    def fit(self, X, y):
        if X.shape[0] > self.n_inducing_:
            if self.method_ == 'uniform':
                uni_idx = np.random.choice(X.shape[0], self.n_inducing_,
                                           replace=False)
                X_sketch = X[uni_idx]
                y_sketch = y[uni_idx]

            elif self.method_ == 'geosketch':
                from fbpca import pca
                from geosketch import gs

                U, s, _ = pca(X, k=100)
                X_dimred = U[:, :100] * s[:100]
                gs_idx = gs(X_dimred, self.n_inducing_, replace=False)
                X_sketch = X[gs_idx]
                y_sketch = y[gs_idx]

            else:
                raise ValueError(
                    'unknown sketch method: {}'.format(self.method_))

        else:
            X_sketch, y_sketch = X, y

        self.gpr_ = GPRegressor(
            n_restarts=self.n_restarts_,
            kernel=self.kernel_,
            backend=self.backend_,
            batch_size=self.batch_size_,
            n_jobs=self.n_jobs_,
            verbose=self.verbose_,
        ).fit(X_sketch, y_sketch)
Example #5
def train_clf_helper(X_tr, y_tr, X_te, y_te):
    # train_size is taken from the enclosing scope in the original script
    if train_size < X_tr.shape[0]:
        sketch_idx = gs(X_tr, train_size, replace=False)
        X_tr = X_tr[sketch_idx]
        y_tr = y_tr[sketch_idx]
    # clf = KNeighborsClassifier()
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    print(classification_report(y_te, y_pred))
    print()
    return classification_report(y_te, y_pred, output_dict=True)
Example #6
def get_expression(adata, use_raw=True, use_geosketch=False, geosketch_N=10000, geosketch_transform=None):
    """Grab expression and put into pandas dataframe."""
    if use_raw:
        ad = adata.raw
    else:
        ad = adata

    if isinstance(ad.X, csr_matrix):
        df = pd.DataFrame(ad.X.toarray(), index=ad.obs_names, columns=ad.var_names)
    else:
        df = pd.DataFrame(ad.X, index=ad.obs_names, columns=ad.var_names)

    if use_geosketch:
        if geosketch_transform is None:
            sc.tl.umap(adata, n_components=2)
            geosketch_transform = "umap"
        sketch_index = gs(adata.obsm["X_" + geosketch_transform], geosketch_N, replace=False)
        df = df.iloc[sketch_index]  # gs returns positional indices, not obs names

    return df.transpose()
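
An illustrative call, assuming adata already carries a PCA embedding in adata.obsm['X_pca']; passing geosketch_transform="pca" skips the UMAP recomputation.

expr = get_expression(adata, use_raw=True, use_geosketch=True,
                      geosketch_N=5000, geosketch_transform="pca")
print(expr.shape)   # genes x (sketched cells)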
Example #7
    def sketch(self, X):
        """
        Actually sketches the dataset and saves nearest neighbor mappings
        from sketch elements to sample observations in full dataset in
        the `self.sketch_neighbors` variable.

        Parameters
        ----------
        X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
            Dataset to be sketched.

        Returns
        -------
        X_sketch
            Sketched version of dataset `X`.
        """
        n_samples = X.shape[0]

        if self.verbose:
            tprint('Sketching...')

        if self.sketch_method == 'geometric':
            from geosketch import gs
            sketch_idx = gs(X, self.sketch_size, replace=False)
        elif self.sketch_method == 'uniform':
            sketch_idx = sorted(
                np.random.choice(n_samples,
                                 size=self.sketch_size,
                                 replace=False))
        else:
            return X

        X_sketch = X[sketch_idx]

        self.sketch_neighbors = nearest_approx(X, X_sketch)

        return X_sketch
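
The two sampling strategies the method switches between can be reproduced standalone (a minimal sketch, assuming geosketch is installed; nearest_approx is project-specific and omitted here):

import numpy as np
from geosketch import gs

X = np.random.rand(10000, 50)                  # placeholder embedding
geo_idx = gs(X, 500, replace=False)            # geometric sketch: even coverage
uni_idx = sorted(np.random.choice(10000, size=500, replace=False))  # uniform baseline
X_geo, X_uni = X[geo_idx], X[uni_idx]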
Example #8
from collections import Counter
from math import ceil

import pandas as pd
from geosketch import gs


def multi_sketch(dimRed, fractions, clusters):
    """
    Compute geometric sketches of the data in dimRed, one per fraction.
    """
    total_cells = dimRed.shape[0]
    percentages = ["pct" + str(int(i*100)) for i in fractions]
    all_counts = Counter(clusters)
    cluster_names = [str(i) for i in sorted([int(float(i)) for i in list(set(clusters))])]
    sketch_index_by_percentage = {}
    sketch_N = []

    sketch_df = pd.DataFrame(columns=percentages + ['full'], index=cluster_names)
    for key, value in all_counts.items():
        sketch_df.loc[key, 'full'] = value
    for i, fraction in enumerate(fractions):
        N = ceil(total_cells * fraction)
        print("total number of cells: ", total_cells, "; fraction: ", fraction, "; fraction # cells: ", N)
        sketch_N = sketch_N + [N]
        this_sketch_index = gs(dimRed, N, replace=False)
        sketch_index_by_percentage[percentages[i]] = this_sketch_index
        subset_counts = Counter(clusters.iloc[this_sketch_index])
        for key, value in subset_counts.items():
            sketch_df.loc[key, percentages[i]] = value
    return sketch_df, sketch_index_by_percentage, sketch_N
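
A hypothetical call with placeholder inputs; the returned frame compares per-cluster cell counts at each sketch size against the full data.

import numpy as np
import pandas as pd

dimRed = np.random.rand(2000, 50)                          # placeholder embedding
clusters = pd.Series(np.random.choice(['0', '1', '2'], size=2000))
sketch_df, idx_by_pct, sketch_N = multi_sketch(dimRed, [0.01, 0.05, 0.10], clusters)
print(sketch_df)   # rows: clusters; columns: pct1, pct5, pct10, full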
Example #9
    datasets, genes = merge_datasets(datasets, genes_list)
    X = vstack(datasets)

    if not os.path.isfile('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE)):
        log('Dimension reduction with {}...'.format(METHOD))
        X_dimred = reduce_dimensionality(normalize(X),
                                         method=METHOD,
                                         dimred=DIMRED)
        log('Dimensionality = {}'.format(X_dimred.shape[1]))
        np.savetxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE), X_dimred)
    else:
        X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(
            METHOD, NAMESPACE))

    from geosketch import gs, uniform
    samp_idx = gs(X_dimred, 20000, replace=False)
    #samp_idx = uniform(X_dimred, 20000, replace=False)

    #from anndata import AnnData
    #import scanpy.api as sc
    #adata = AnnData(X=X_dimred[samp_idx, :])
    #sc.pp.neighbors(adata, use_rep='X')
    #sc.tl.louvain(adata, resolution=1.5, key_added='louvain')
    #
    #louv_labels = np.array(adata.obs['louvain'].tolist())
    #le = LabelEncoder().fit(louv_labels)
    #cell_labels = le.transform(louv_labels)
    #
    #np.savetxt('data/cell_labels/mouse_brain_louvain.txt', cell_labels)

    cell_labels = (open('data/cell_labels/mouse_brain_louvain.txt').read()
                   .rstrip().split())  # truncated in the source; completion assumes one label per line
Example #10
def geosketch_sample_dimred(X, n):
    from fbpca import pca
    from geosketch import gs

    U, s, Vt = pca(X, k=100)  # E.g., 100 PCs.
    X_dimred = U[:, :100] * s[:100]
    sketch_index = gs(X_dimred, n, replace=False)
    return X_dimred[sketch_index]
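
A minimal end-to-end run of the canonical pattern above, with random data as a stand-in:

import numpy as np

X = np.random.rand(5000, 2000)           # placeholder cells-by-genes matrix
X_small = geosketch_sample_dimred(X, n=500)
print(X_small.shape)                     # (500, 100): sketched cells in PC space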
Example #11
def correlate_tf_motifs(
    adata: AnnData,
    n_sketch: Optional[int] = 2500,
    n_permutations: Optional[int] = 100000,
    indirect: Optional[bool] = True,
) -> None:
    """Correlate inferred motif activity with TF expression.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    n_sketch : `int`, optional (default: 2500)
        If the number of cells is higher than `n_sketch`, use geometric
        sketching (Hie et al. 2019) to select a subset of `n_sketch`
        cells. This subset will be used to calculate the correlation between
        motif activity and transcription factor expression.
    n_permutations : `int`, optional (default: 100000)
        Number of permutations used to calculate the p-value. Can be
        decreased for a quicker run-time, but should probably not be below 10000.
    indirect : `bool`, optional (default: True)
        Include indirect TF to motif assignments.
    """
    logger.info("correlating motif activity with factors")
    if indirect:
        logger.info("including indirect and/or predicted factors")
    # Get all TFs from motif database
    m2f = motif_mapping(indirect=True)
    batch_size = m2f.shape[0]
    f2m2 = pd.DataFrame(m2f["factors"].str.split(",").tolist(),
                        index=m2f.index).stack()
    f2m2 = f2m2.to_frame().reset_index().iloc[:, [0, 2]]
    f2m2.columns = ["motif", "factor"]
    unique_factors = f2m2["factor"].unique()

    if n_sketch is None or n_sketch > adata.shape[0]:
        logger.info(f"using all cells")
        my_adata = adata
    else:
        logger.info(f"creating sketch of {n_sketch} cells")
        idx = geosketch.gs(adata.obsm["X_pca"], n_sketch)
        my_adata = adata.copy()
        my_adata = my_adata[idx]

    detected = (my_adata.raw.var_names.str.upper().isin(unique_factors)) & (
        (my_adata.raw.X > 0).sum(0) > 3)
    detected = np.squeeze(np.asarray(detected))
    unique_factors = my_adata.raw.var_names[detected].str.upper()

    # Get the expression for all TFs
    expression = (np.squeeze(np.asarray(my_adata.raw.X.todense()))
                  if issparse(my_adata.raw.X) else my_adata.raw.X)
    expression = expression.T[detected]

    logger.info(
        f"calculating correlation of motif activity with {len(unique_factors)} factors"
    )
    real = fast_corr(
        expression,
        (my_adata.obsm["X_cell_types"]
         @ my_adata.uns["scepia"]["motif_activity"].T).T.values,
    )
    real = pd.DataFrame(
        real,
        index=unique_factors,
        columns=my_adata.uns["scepia"]["motif_activity"].index,
    )

    tmp = (real.reset_index().melt(
        id_vars="index", var_name="motif",
        value_name="correlation").rename(columns={
            "index": "factor"
        }).set_index(["motif", "factor"]))
    f2m2 = f2m2.set_index(["motif", "factor"]).join(tmp).dropna()
    f2m2["abs_correlation"] = f2m2["correlation"].abs()

    logger.info(f"calculating {n_permutations} permutations")
    permute_result = pd.DataFrame(index=unique_factors)
    shape = my_adata.uns["scepia"]["motif_activity"].shape
    for i in tqdm(range(0, n_permutations, batch_size)):
        random_activities = None
        while random_activities is None or random_activities.shape[
                0] < batch_size:
            x = my_adata.uns["scepia"]["motif_activity"].values.flatten()
            motif_activity = shuffle(x).reshape(shape[1], shape[0])
            cell_motif_activity = (
                my_adata.obsm["X_cell_types"] @ motif_activity).T
            if random_activities is None:
                random_activities = cell_motif_activity
            else:
                random_activities = np.vstack(
                    (random_activities, cell_motif_activity))

        random_activities = random_activities[:batch_size]
        batch_result = fast_corr(expression, random_activities)
        batch_result = pd.DataFrame(batch_result,
                                    index=unique_factors,
                                    columns=range(i, i + batch_size))
        permute_result = permute_result.join(batch_result)

    logger.info("calculating permutation-based p-values (all)")

    # Calculate p-value of correlation relative to all permuted correlations
    permuted_corrs = permute_result.values.flatten()
    pvals = [(100 - percentileofscore(permuted_corrs, corr)) / 100
             for corr in f2m2["correlation"]]
    f2m2["pval"] = pvals
    f2m2.loc[f2m2["correlation"] < 0,
             "pval"] = (1 - f2m2.loc[f2m2["correlation"] < 0, "pval"])
    logger.info("calculating permutation-based p-values (factor-specific)")

    # Calculate p-value of correlation relative to permuted values of this factor
    for motif, factor in tqdm(f2m2.index):
        pval = (100 - percentileofscore(permute_result.loc[factor],
                                        real.loc[factor, motif])) / 100
        pval = 1 - pval if real.loc[factor, motif] < 0 else pval
        pval = 1 / permute_result.shape[1] if pval == 0 else pval
        f2m2.loc[(motif, factor), "permutation_pval"] = pval
        f2m2.loc[(motif, factor), "combined"] = combine_pvalues(
            f2m2.loc[(motif, factor), ["pval", "permutation_pval"]])[1]

    f2m2["p_adj"] = multipletests(f2m2["combined"], method="fdr_bh")[1]
    f2m2["-log10(p-value)"] = -np.log10(f2m2["p_adj"])

    cluster_cell_types = adata.obs["cluster_annotation"].unique()
    f2m2 = f2m2.join(
        (adata.uns["scepia"]["motif_activity"][cluster_cell_types].max(1) -
         adata.uns["scepia"]["motif_activity"][cluster_cell_types].min(1)
         ).to_frame("motif_stddev").rename_axis("motif"))

    f2m2 = f2m2.reset_index().set_index("factor")
    adata.uns["scepia"]["correlation"] = f2m2
Example #12
    def fit(self, X, y):
        if self.sketch_size_ is not None:
            from fbpca import pca
            from geosketch import gs

            if X.shape[1] > 100:
                U, s, _ = pca(X, k=100)
                X_dimred = U * s
            else:
                X_dimred = X
            gs_idx = gs(X_dimred, self.sketch_size_, replace=False)
            X = X[gs_idx]
            y = y[gs_idx]

        n_samples, n_features = X.shape

        if self.verbose_:
            tprint(
                'Fitting Bayesian NN on {} data points with dimension {}...'.
                format(*X.shape))

        X = X.astype(np.float32)  # Edward uses float32.

        # Bayesian weights.

        W_0_shape = [n_features, self.n_hidden1_]
        W_0 = Normal(loc=tf.zeros(W_0_shape), scale=tf.ones(W_0_shape))

        W_1_shape = [self.n_hidden1_, self.n_hidden2_]
        W_1 = Normal(loc=tf.zeros(W_1_shape), scale=tf.ones(W_1_shape))

        W_2_shape = [self.n_hidden2_, y.shape[1]]
        W_2 = Normal(loc=tf.zeros(W_2_shape), scale=tf.ones(W_2_shape))

        # Bayesian biases.

        b_0 = Normal(loc=tf.zeros(self.n_hidden1_),
                     scale=tf.ones(self.n_hidden1_))
        b_1 = Normal(loc=tf.zeros(self.n_hidden2_),
                     scale=tf.ones(self.n_hidden2_))
        b_2 = Normal(loc=tf.zeros(y.shape[1]), scale=tf.ones(y.shape[1]))

        # Approximating distributions for KL divergence
        # variational inference.

        qW_0 = Normal(loc=tf.get_variable("qW_0/loc", W_0_shape),
                      scale=tf.nn.softplus(
                          tf.get_variable("qW_0/scale", W_0_shape)))
        qW_1 = Normal(loc=tf.get_variable("qW_1/loc", W_1_shape),
                      scale=tf.nn.softplus(
                          tf.get_variable("qW_1/scale", W_1_shape)))
        qW_2 = Normal(loc=tf.get_variable("qW_2/loc", W_2_shape),
                      scale=tf.nn.softplus(
                          tf.get_variable("qW_2/scale", W_2_shape)))
        qb_0 = Normal(loc=tf.get_variable("qb_0/loc", [self.n_hidden1_]),
                      scale=tf.nn.softplus(
                          tf.get_variable("qb_0/scale", [self.n_hidden1_])))
        qb_1 = Normal(loc=tf.get_variable("qb_1/loc", [self.n_hidden2_]),
                      scale=tf.nn.softplus(
                          tf.get_variable("qb_1/scale", [self.n_hidden2_])))
        qb_2 = Normal(loc=tf.get_variable("qb_2/loc", [y.shape[1]]),
                      scale=tf.nn.softplus(
                          tf.get_variable("qb_2/scale", [y.shape[1]])))

        # Fit model.

        X_variational = tf.placeholder(tf.float32, [n_samples, n_features],
                                       name='X')

        y_variational = Normal(loc=neural_network(X, W_0, W_1, W_2, b_0, b_1,
                                                  b_2),
                               scale=tf.ones((n_samples, y.shape[1])))

        inference = ed.KLqp(
            {
                W_0: qW_0,
                b_0: qb_0,
                W_1: qW_1,
                b_1: qb_1,
                W_2: qW_2,
                b_2: qb_2,
            },
            data={
                X_variational: X,
                y_variational: y
            })

        self.sess_ = ed.get_session()
        tf.global_variables_initializer().run()

        inference.run(n_iter=self.n_iter_, n_samples=10)

        self.model_ = {
            'qW_0': qW_0,
            'qb_0': qb_0,
            'qW_1': qW_1,
            'qb_1': qb_1,
            'qW_2': qW_2,
            'qb_2': qb_2,
        }

        if self.verbose_:
            tprint('Done fitting Bayesian NN model.')

        return self
Example #13
        NAMESPACE,
        n_seeds=4,
        #cell_labels=cell_labels,
        #cell_exp_ratio=True,
        #louvain_ami=True,
        #rare=True,
        #rare_label=le.transform(['Macrophage'])[0],
        max_min_dist=True,
    )
    exit()

    report_cluster_counts(labels)

    from differential_entropies import differential_entropies
    differential_entropies(X_dimred, labels)

    plot_rare(X_dimred,
              cell_labels,
              le.transform(['Macrophage'])[0],
              NAMESPACE,
              n_seeds=4)

    from geosketch import gs
    samp_idx = gs(X_dimred, 1000, replace=False)
    save_sketch(X, samp_idx, genes, NAMESPACE + '1000')

    for scale in [10, 25, 100]:
        N = int(X.shape[0] / scale)
        samp_idx = gs(X_dimred, N, replace=False)
        save_sketch(X, samp_idx, genes, NAMESPACE + str(N))
Example #14
            cell_names += ['mcsf_day6'] * a.shape[0]
        else:
            assert False
    le = LabelEncoder().fit(cell_names)
    cell_labels = le.transform(cell_names)

    write_table(X.toarray(), genes, 'data/pseudotime/' + NAMESPACE)

    with open('data/pseudotime/mono_macro_meta.txt', 'w') as of:
        of.write('Label\n')
        for idx in range(X.shape[0]):
            of.write('mono_macro{}\t{}\n'.format(idx, cell_names[idx]))

    from geosketch import gs, gs_gap, uniform

    gs_idx = gs(X_dimred, 110, replace=False)
    write_table(X[gs_idx, :].toarray(), genes,
                'data/pseudotime/' + NAMESPACE + '_gs')
    report_cluster_counts(cell_labels[gs_idx])

    with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of:
        of.write('Label\n')
        i = 0
        for idx in range(X.shape[0]):
            if idx not in gs_idx:
                continue
            of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx]))
            i += 1

    uni_idx = uniform(X_dimred, 110, replace=False)
    write_table(X[uni_idx, :].toarray(), genes,
                'data/pseudotime/' + NAMESPACE + '_uni')  # truncated in the source; '_uni' path inferred by symmetry with the '_gs' output above
Example #15
    # remove unnecessary fields
    matt = mat.copy()
    matt.drop("batch", axis=1, inplace=True)
    matt.drop("type", axis=1, inplace=True)
    matt.drop("cluster", axis=1, inplace=True)
    matt.drop("tissue.cancer", axis=1, inplace=True)
    matt.set_index('dataset', inplace=True)
    mattm = matt.values

    # compute the PCs - necessary input for the sketching
    U, s, Vt = pca(mattm, k=100)
    X_dimred = U[:, :100] * s[:100]

    # sketch
    N = int(N_samples * sk_sz)  # Number of samples to obtain from the dataset
    sketch_index = gs(X_dimred, N, replace=False)
    X_sketch = X_dimred[sketch_index]

    # get the samples selected in the sketch and output
    reduced = pd.DataFrame(X_sketch)
    pca_out = pd.DataFrame(X_dimred)
    pca_out["dataset"] = list(matt.index)
    red_with_labs = pd.merge(pca_out,
                             reduced,
                             how="inner",
                             on=list(reduced.columns.values))
    selected = list(red_with_labs["dataset"])

    out = open(
        currdir + "../results/sketches/" + can.lower().replace(" ", "-") +
        "-sketch.txt", "w")