Example #1
import numpy as np
import matplotlib.pyplot as plt
from pomegranate import (GeneralMixtureModel, NormalDistribution,
                         HalfNormalDistribution)  # HalfNormalDistribution requires a recent pomegranate release


def plot_HalfNorm2():
    #X = np.random.normal(-2, 1, size=(1000, 1))  ## scipy runtime warning (possibly due to running an outdated version)
    X = np.random.sample(size=(1000, 1))  # uniform draws in [0, 1)
    X[::2] += 4  # shift every other sample to create a second mode
    modeln = GeneralMixtureModel.from_samples(NormalDistribution, 2, X)
    modelh = GeneralMixtureModel.from_samples(HalfNormalDistribution, 2, X)
    x = np.arange(-15, 15, 0.1)
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(x, modeln.probability(x), label='Normal Mixture')
    ax.plot(x, set_y(x, modelh), label='Half Norm Mixture')  # set_y: helper defined elsewhere in the source
    ax.set_ylabel('Probability', fontsize=10)
    ax.legend(fontsize=10)
    plt.savefig('/scratch/chd5n/test.png', bbox_inches='tight')
    print('plot written to', '/scratch/chd5n/test.png')
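Since `set_y` is defined elsewhere in the original source, a self-contained variant can evaluate the mixture density directly; a minimal sketch using only the Normal mixture:

import numpy as np
from pomegranate import GeneralMixtureModel, NormalDistribution

X = np.random.sample(size=(1000, 1))
X[::2] += 4
model = GeneralMixtureModel.from_samples(NormalDistribution, 2, X)
x = np.arange(-15, 15, 0.1)
p = model.probability(x)  # the same quantity plotted for the Normal mixture above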
Example #2
    def addData(self, data, score):
        score = score.clip(min=1e-5)  # floor scores to keep all weights positive
        self.data = data
        self.score = score

        # L1-normalize the scores so they act as per-sample weights
        score_normed = self.score / np.linalg.norm(self.score, ord=1)
        try:
            model = GeneralMixtureModel.from_samples(
                MultivariateGaussianDistribution,
                n_components=self.n_comp,
                X=self.data,
                weights=score_normed)
            self.model = model
        except Exception:
            logging.exception("caught an exception while fitting the mixture model")
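A self-contained sketch of the weighted fit that `addData` performs, on synthetic data (the sizes and component count here are made up):

import numpy as np
from pomegranate import GeneralMixtureModel, MultivariateGaussianDistribution

data = np.random.randn(500, 2)                       # synthetic 2-D samples
score = np.abs(np.random.randn(500)).clip(min=1e-5)  # synthetic per-sample scores
weights = score / np.linalg.norm(score, ord=1)       # L1-normalize, as addData does
gmm = GeneralMixtureModel.from_samples(
    MultivariateGaussianDistribution, n_components=3, X=data, weights=weights)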
Example #3
    def fit_mixture(self,
                    pos_left,
                    pos_right,
                    weights,
                    n_components=2,
                    tol=1e-4,
                    maxiter=4000,
                    verbose=False):
        left, right = np.asarray(pos_left), np.asarray(pos_right)
        weights = np.asarray(weights)
        debugs = list() if verbose else None
        centers = (left + right) / 2.0  # interval midpoints serve as point samples
        # fit a GMM on the midpoints to get a starting point for the optimizer
        init_gmm = GeneralMixtureModel.from_samples(
            MultivariateGaussianDistribution,
            n_components=n_components,
            X=centers,
            weights=weights,
            stop_threshold=0.01,
            n_jobs=2)

        init_mus, init_covs = list(), list()
        init_comp_ws = np.array(init_gmm.weights)
        init_comp_ws /= np.sum(init_comp_ws)
        for i in range(n_components):
            paras = init_gmm.distributions[i].parameters
            init_mus.append(np.array(paras[0]))
            init_covs.append(np.array(paras[1]))

        init_paras = self._paras_compose_(init_mus, init_covs,
                                          list(init_comp_ws))

        # refine with a derivative-free simplex method on the interval likelihood
        method = 'Nelder-Mead'
        res = opt.minimize(self._mixture_optpara,
                           init_paras,
                           args=(left, right, weights, n_components, debugs),
                           method=method,
                           tol=tol,
                           options={
                               'maxiter': maxiter,
                               'disp': verbose
                           })
        if verbose:
            print("Method:{}; Initial parameter: {};".format(
                method, init_paras))
            print("Converged Parameter: {}".format(res.x))

        mus, covs, comp_ws = self._paras_decompose_(res.x, n_components)
        return mus, covs, comp_ws, res.fun
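The private `_paras_compose_` / `_paras_decompose_` helpers are not shown in this snippet; a minimal sketch of the round-trip they imply, assuming full covariance matrices and a fixed dimensionality (names and parameter layout are assumptions):

import numpy as np

def paras_compose(mus, covs, comp_ws):
    # concatenate means, flattened covariances, and weights into one flat vector
    parts = [np.ravel(m) for m in mus] + [np.ravel(c) for c in covs] + [np.asarray(comp_ws)]
    return np.concatenate(parts)

def paras_decompose(paras, n_components, dim):
    # inverse of paras_compose for a fixed dimensionality
    mus = [paras[i * dim:(i + 1) * dim] for i in range(n_components)]
    off = n_components * dim
    covs = [paras[off + i * dim * dim:off + (i + 1) * dim * dim].reshape(dim, dim)
            for i in range(n_components)]
    ws = paras[off + n_components * dim * dim:]
    return mus, covs, ws / ws.sum()  # re-normalize component weights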
Example #4
import numpy as np
from pomegranate import (GeneralMixtureModel, NormalDistribution,
                         PoissonDistribution)


def fit_mixture_model(counts):
    ''' Code adapted from https://github.com/josephreplogle/guide_calling '''

    data = np.log2(counts + 1)

    reshaped_data = data.reshape(-1, 1)

    xs = np.linspace(-2, max(data) + 2, 1000)

    # Re-fit the model until it has converged with both components given non-zero weight
    # and the Poisson component in the first position with lower mean.

    while True:
        model = GeneralMixtureModel.from_samples(
            [PoissonDistribution, NormalDistribution], 2, reshaped_data)

        if 0 in model.weights:
            # model.weights holds log-weights, so a 0 entry means one
            # component absorbed all the mass and the other was eliminated
            continue
        elif np.isnan(model.probability(xs)).any():
            continue
        elif (model.distributions[0].parameters[0] >
              model.distributions[1].parameters[0]):
            continue
        elif model.distributions[0].name != 'PoissonDistribution':
            continue
        else:
            break

    labels = model.predict(reshaped_data)

    xs = np.linspace(0, max(data) + 2, 1000)
    p_second_component = model.predict_proba(xs.reshape(-1, 1))[:, 1]
    threshold = 2**xs[np.argmax(p_second_component >= 0.5)]

    return labels, threshold
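A hedged usage sketch on synthetic data (rates and sizes are made up; a low-rate Poisson background plus a high-count population stand in for real guide counts):

counts = np.concatenate([np.random.poisson(0.5, size=500),
                         np.random.poisson(300, size=500)])
labels, threshold = fit_mixture_model(counts)
print(threshold)  # counts at or above this are assigned to the second (Normal) component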
Example #5
time_list = [100, 500, 900, 1500]
for time in time_list:
    samples = hiker_paths.get_all_at_time(time)

    weights = p_filter.weighting_func(log_weights)  # convert particle log-weights to normalized weights
    print(weights)
    print(samples)

    samples = [[float(item[0]), float(item[1])] for item in samples]

    # sanity check: draw points from a known 2-D Gaussian for comparison
    test = np.random.multivariate_normal([50, 50], [[1, 0], [0, 1]], 10)

    print(test)

    gmm = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                           n_components=4,
                                           X=samples,
                                           weights=weights)

    clf = pomegranate_to_scikitlearn(gmm)

    graph_shape = depth_dict["data"].shape
    print(graph_shape)

    # display predicted scores by the model as a contour plot

    ax = plt.subplot(111)

    x = np.linspace(0.0, graph_shape[0])
    y = np.linspace(0.0, graph_shape[1])
    X, Y = np.meshgrid(x, y)
    XX = np.array([X.ravel(), Y.ravel()]).T
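The snippet is cut off after building the evaluation grid; a hedged completion of the contour plot, continuing the loop body and reusing `gmm`, `ax`, `X`, `Y`, `XX`, and `samples` from above:

    Z = gmm.probability(XX).reshape(X.shape)  # mixture density on the grid
    ax.contour(X, Y, Z)
    ax.scatter([s[0] for s in samples], [s[1] for s in samples], s=4)
    plt.show()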
Example #6
# print("Naive Bayes - Semisupervised Learning Accuracy: {}".format((model_b.predict(x_val) == y_val).mean()))

model_c = BayesClassifier.from_samples(MultivariateGaussianDistribution,
                                       x_train,
                                       y_train,
                                       inertia=0.0,
                                       pseudocount=0.0,
                                       stop_threshold=0.1,
                                       max_iterations=100,
                                       verbose=True,
                                       n_jobs=1)
print("Bayes Classifier - Semisupervised Learning Accuracy: {}".format(
    (model_c.predict(x_val) == y_val).mean()))

# general mixture model
d0 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 0])
d1 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 1])
d2 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 2])
d3 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 3])
d4 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 4])
d5 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 5])
d6 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 6])
d7 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 7])
d8 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
                                      x_train[y_train == 8])
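The snippet is truncated here; presumably the per-class mixtures feed a Bayes classifier, as `model_c` does above. A hedged sketch of that step (class count and variable names assumed):

from pomegranate import BayesClassifier

model_d = BayesClassifier([d0, d1, d2, d3, d4, d5, d6, d7, d8])
print("GMM Bayes Classifier Accuracy: {}".format(
    (model_d.predict(x_val) == y_val).mean()))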
Example #7
    def _initDists(self, X, distribution=MultivariateGaussianDistribution):
        technique = "R_MV-GMM"  # mixture of multivariate Gaussian distributions
        if (technique == "GMM"):
            # gaussian mixture model
            #// uvgd = NormalDistribution.from_samples(X)
            #// gmm = GeneralMixtureModel([uvgd.copy() for _ in range(self.nmix)])
            gmm = GeneralMixtureModel.from_samples(
                distributions=[NormalDistribution for _ in range(self.nmix)],
                X=X)
            dists = [gmm.copy() for _ in range(self.statesNumber)]
        elif (technique == "MV-GMM"):
            # multivariate gaussian mixture model
            #// mvgd = MultivariateGaussianDistribution.from_samples(X)
            #// gmm = GeneralMixtureModel([mvgd.copy() for _ in range(self.nmix)])
            gmm = GeneralMixtureModel.from_samples(distributions=[
                MultivariateGaussianDistribution for _ in range(self.nmix)
            ],
                                                   X=X,
                                                   n_components=3)
            dists = [gmm.copy() for _ in range(self.statesNumber)]
        elif (technique == "MVG"):
            y = self._initkmeans(X=X, numClasses=self.statesNumber)
            dists = [
                MultivariateGaussianDistribution.from_samples(X=X[y == i])
                for i in range(self.statesNumber)
            ]
        elif (technique == "R_GMM"):
            # random gaussian mixture model
            randNormal = lambda: NormalDistribution(np.random.randint(1, 10), 1)
            randGMM = lambda: GeneralMixtureModel(
                [randNormal() for _ in range(self.nmix)])
            dists = [randGMM() for _ in range(self.statesNumber)]
        elif (technique == "R_MV-GMM"):
            # random multivariate gaussian mixture model
            # randMVG: helper returning a random MultivariateGaussianDistribution,
            # defined elsewhere in the source
            randGMM = lambda: GeneralMixtureModel(
                [randMVG() for _ in range(self.nmix)])
            dists = [randGMM() for _ in range(self.statesNumber)]
        return dists

        #* not completed (unreachable after the return above; kept for reference):
        #! GMM-HMM-k
        y = self._initkmeans(X, self.statesNumber)
        # list(map(print, y))
        return [
            GeneralMixtureModel.from_samples(distribution,
                                             X=X[y == i],
                                             n_components=self.nmix)
            for i in range(self.statesNumber)
        ]

        #! Kmeans init
        if not isinstance(X, BaseGenerator):
            data_generator = SequenceGenerator(X, None, None)
        else:
            data_generator = X

        initialization_batch_size = len(data_generator)

        X_ = []
        data = data_generator.batches()
        for i in range(initialization_batch_size):
            batch = next(data)
            X_.extend(batch[0])

        X_concat = np.concatenate(X_)
        if X_concat.ndim == 1:
            X_concat = X_concat.reshape(X_concat.shape[0], 1)
        n, d = X_concat.shape
        clf = Kmeans(self.statesNumber, init="kmeans++",
                     n_init=1)  # init is one of 'first-k', 'random', 'kmeans++', 'kmeans||'
        clf.fit(X_concat, max_iterations=None, batches_per_epoch=None)
        y = clf.predict(X_concat)
        if callable(distribution):
            if d == 1:
                dists = [
                    distribution.from_samples(X_concat[y == i][:, 0])
                    for i in range(self.statesNumber)
                ]
            elif distribution.blank().d > 1:
                dists = [
                    distribution.from_samples(X_concat[y == i])
                    for i in range(self.statesNumber)
                ]
            else:
                print("error")
        return dists
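`_initDists` returns one emission distribution per state; a hedged sketch of how such a list is typically consumed (a hypothetical 3-state pomegranate HMM with uniform transitions, reusing `dists` from above):

import numpy as np
from pomegranate import HiddenMarkovModel

n = 3  # hypothetical number of states
trans = np.full((n, n), 1.0 / n)  # uniform transition matrix
starts = np.full(n, 1.0 / n)      # uniform start probabilities
# hmm = HiddenMarkovModel.from_matrix(trans, dists[:n], starts)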
Example #8
from pomegranate import (
    NaiveBayes,
    NormalDistribution,
    UniformDistribution,
    ExponentialDistribution,
    GeneralMixtureModel,
    MultivariateGaussianDistribution,
    BernoulliDistribution,
)
import pandas as pd
import numpy as np

X = pd.DataFrame({"A": [1, 0, 1, 0, 1], "B": [1, 1, 1, 1, 0]})

x = BernoulliDistribution(0.4)

vals = [x.sample() for _ in range(1000)]  # draw 1000 Bernoulli samples

model = NaiveBayes([
    NormalDistribution(5, 2),
    UniformDistribution(0, 10),
    ExponentialDistribution(1.0)
])
model.predict(np.array([[10]]))

model = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                         n_components=3,
                                         X=X)
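A hedged follow-up to the smoke test above (with only five 2-D rows and three components the EM fit may be degenerate, so this is illustrative only):

print(np.mean(vals))                 # should be near 0.4 for 1000 Bernoulli draws
print(model.predict(np.asarray(X)))  # hard component assignments per row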
Example #9
def generate_guide_rna_prediction(
        loom,
        guide_rnas,
        nguide_ca='nGuide',
        nguide_reads_ca='nGuideReads',
        cell_prediction_summary_ca='CellGuidePrediction',
        overwrite=False,
        only_generate_log2=False,
        ncell_threshold_for_guide=10,
        nguide_threshold_for_cell=10):
    """
    This approach is inspired by Replogle et al. 2018 (https://doi.org/10.1038/s41587-020-0470-y). However, instead of a Gaussian/Poisson mixture, this routine uses a Poisson/Poisson mixture.
    This routine uses the pomegranate package (https://github.com/jmschrei/pomegranate).

    Parameters
    ----------
    loom : LoomConnection
        A LoomConnection object upon which guide RNA predictions will be made
    guide_rnas : iterable of strings
        a list or other iterable of strings, each corresponding to a column attribute of `loom` indicating the raw counts of a given guide RNA over cells
    nguide_ca : str
        QC metric, indicating the name of the column attribute to use to indicate the number of predicted guide RNAs for a cell (Default value = 'nGuide')
    nguide_reads_ca : str
        QC metric, indicating the name of the column attribute to use to indicate the total number of guide RNA reads for a cell (Default value = 'nGuideReads')
    cell_prediction_summary_ca : str
        Indicates the name of the column attribute to use to indicate a summary of positively-predicted guide RNAs for a cell (Default value = 'CellGuidePrediction')
    overwrite : bool
        If False, will raise an exception if requested column attributes have already been written. If True, will overwrite existing column attributes. (Default value = False)
    only_generate_log2 : bool
        If True, will generate log2 guide RNA counts, but will not apply any mixture model prediction. (Default value = False)
    ncell_threshold_for_guide : int
        Threshold for the number of cells wherein a guide should have nonzero counts for the mixture model to attempt prediction. (Default value = 10)
    nguide_threshold_for_cell : int
        Threshold for the number of guides to be detected in a given cell to attempt to make a prediction for that particular cell. (Default value = 10)

    Returns
    -------

    """
    from panopticon.utilities import import_check
    exit_code = import_check("pomegranate",
                             'conda install -c anaconda pomegranate')
    if exit_code != 0:
        return
    import pandas as pd

    if nguide_reads_ca in loom.ca.keys() and overwrite == False:
        raise Exception(
            "{} already in loom.ca.keys(); if intended, set overwrite argument to True"
            .format(nguide_reads_ca))

    guide_rna_dfs = []
    for guide_rna in guide_rnas:
        guide_rna_dfs.append(
            pd.DataFrame(loom.ca[guide_rna], columns=[guide_rna], copy=True))
    guide_rna_dfs = pd.concat(guide_rna_dfs, axis=1)
    loom.ca[nguide_reads_ca] = guide_rna_dfs.sum(axis=1).values
    threshold_for_cell_mask = loom.ca[
        nguide_reads_ca] >= nguide_threshold_for_cell
    prediction_ca_names = []
    for guide_rna in guide_rnas:
        if guide_rna not in loom.ca.keys():
            raise Exception(
                "raw_antibody_count_df must be prepared such that columns match column attributes in loom corresponding to raw antibody conjugate counts"
            )

        new_ca_name = guide_rna + '_log2'
        if new_ca_name in loom.ca.keys() and overwrite == False:
            raise Exception(
                "{} already in loom.ca.keys(); rename guide column attribute and re-run, or set overwrite argument to True"
                .format(new_ca_name))

        loom.ca[new_ca_name] = np.log2(loom.ca[guide_rna])
        if not only_generate_log2:
            from pomegranate import GeneralMixtureModel, PoissonDistribution

            prediction_ca_name = guide_rna + '_prediction'
            prediction_ca_names.append(prediction_ca_name)
            if prediction_ca_name in loom.ca.keys() and overwrite == False:
                raise Exception(
                    "{} already in loom.ca.keys(); rename guide rna column attribute and re-run, or set overwrite argument to True"
                    .format(prediction_ca_name))
            if (~np.isfinite(loom.ca[new_ca_name])).sum() > 0:
                cellmask = np.isfinite(loom.ca[new_ca_name])
                if cellmask.sum() >= ncell_threshold_for_guide:  # have minimum cells for guide
                    model = GeneralMixtureModel.from_samples(
                        [PoissonDistribution, PoissonDistribution],
                        n_components=2,
                        X=loom.ca[new_ca_name][cellmask.nonzero()[0]].reshape(
                            -1, 1))
                    predictions = []
                    for val in loom.ca[new_ca_name]:
                        if not np.isfinite(val):
                            predictions.append(np.nan)
                        else:
                            predictions.append(
                                model.predict(np.array(val).reshape(-1, 1))[0])
                else:
                    predictions = [0] * loom.shape[1]
                predictions = np.array(predictions)

            else:
                #print(guide_rna, loom.ca[guide_rna].sum())
                # print('Warning:  pomegrante Poisson/Normal mixture model has predicted a Poisson component with greater log(UMI+1) counts than normal component.  This is unusual behavior!')
                model = GeneralMixtureModel.from_samples(
                    [PoissonDistribution, PoissonDistribution],
                    n_components=2,
                    X=loom.ca[new_ca_name].reshape(-1, 1))
                #model.fit(loom.ca[new_ca_name].reshape(-1, 1))
                predictions = model.predict(loom.ca[new_ca_name].reshape(
                    -1, 1))

            if (loom.ca[new_ca_name][np.array(predictions) == 0].mean() >
                    loom.ca[new_ca_name][np.array(predictions) == 1].mean()):
                predictions = 1 - predictions
            predictions = np.array(predictions)
            predictions = np.nan_to_num(predictions, nan=0.0)
            predictions *= threshold_for_cell_mask
            loom.ca[prediction_ca_name] = predictions

    guide_prediction_dfs = []
    for prediction_ca_name in prediction_ca_names:
        guide_prediction_dfs.append(
            pd.DataFrame(loom.ca[prediction_ca_name],
                         columns=[prediction_ca_name],
                         copy=True))
    guide_prediction_dfs = pd.concat(guide_prediction_dfs, axis=1)
    loom.ca[nguide_ca] = guide_prediction_dfs.sum(axis=1).values

    loom.ca[cell_prediction_summary_ca] = guide_prediction_dfs.apply(
        lambda x: '+'.join(guide_prediction_dfs.columns[np.where(x == 1)[0]]),
        axis=1).values
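A hedged usage sketch (the loom file name and guide attribute names are hypothetical; the named column attributes must hold raw per-cell guide counts):

import loompy

with loompy.connect('cells.loom') as loom:
    generate_guide_rna_prediction(loom, ['sgRNA_A', 'sgRNA_B'], overwrite=True)
    print(loom.ca['CellGuidePrediction'][:10])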