def plot_HalfNorm2():
    # X = np.random.normal(-2, 1, size=(1000, 1))  ## scipy runtime warning (possibly due to running outdated version)
    X = np.random.sample(size=(1000, 1))
    X[::2] += 4
    modeln = GeneralMixtureModel.from_samples(NormalDistribution, 2, X)
    modelh = GeneralMixtureModel.from_samples(HalfNormalDistribution, 2, X)
    x = np.arange(-15, 15, 0.1)
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(x, modeln.probability(x), label='Normal Mixture')
    ax.plot(x, set_y(x, modelh), label='Half Norm Mixture')
    ax.set_ylabel('Probability', fontsize=10)
    ax.legend(fontsize=10)
    plt.savefig('/scratch/chd5n/test.png', bbox_inches='tight')
    print('plot written to', '/scratch/chd5n/test.png')
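# `set_y` is not defined in this snippet. A minimal sketch of what it might
# do, assuming it simply masks the half-normal mixture to its support
# (x >= 0); the helper name and behavior here are assumptions, not the
# original code:
def set_y(x, model):
    y = np.zeros_like(x)
    mask = x >= 0                          # HalfNormalDistribution support
    y[mask] = model.probability(x[mask])
    return y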
def oriHMMParams(self):
    """
    Set initial parameters for the Hidden Markov Model (HMM).

    Attributes
    ----------
    HMMParams : dict
        Has 3 keys: "A", the state transition matrix; "B", the emission
        probabilities, specifying the parameters (means, variances, weights)
        of the mixture Gaussian distributions for each hidden state; and
        "pi", the hidden state weights. This dict will be updated after the
        learning procedure.
    """
    hmm = HiddenMarkovModel()
    # GMM emissions
    # 5 hidden states:
    # 0--start, 1--downstream, 2--no bias, 3--upstream, 4--end
    numdists = 3  # three-component Gaussian mixtures
    var = 7.5 / (numdists - 1)
    means = [[], [], [], [], []]
    for i in range(numdists):
        means[4].append(i * 7.5 / (numdists - 1) + 2.5)
        means[3].append(i * 7.5 / (numdists - 1))
        means[2].append((i - (numdists - 1) / 2) * 7.5 / (numdists - 1))
        means[1].append(-i * 7.5 / (numdists - 1))
        means[0].append(-i * 7.5 / (numdists - 1) - 2.5)
    states = []
    for i, m in enumerate(means):
        tmp = []
        for j in m:
            tmp.append(NormalDistribution(j, var))
        mixture = GeneralMixtureModel(tmp)
        states.append(State(mixture, name=str(i)))
    hmm.add_states(*tuple(states))
    # Transition matrix:
    # A = [[0., 1., 0., 0., 0.],
    #      [0., 0.4, 0.3, 0.3, 0.],
    #      [0.05, 0., 0.5, 0.45, 0.],
    #      [0., 0., 0., 0.5, 0.5],
    #      [0.99, 0., 0.01, 0., 0.]]
    hmm.add_transition(states[0], states[1], 1)
    hmm.add_transition(states[1], states[1], 0.4)
    hmm.add_transition(states[1], states[2], 0.3)
    hmm.add_transition(states[1], states[3], 0.3)
    hmm.add_transition(states[2], states[0], 0.05)
    hmm.add_transition(states[2], states[2], 0.5)
    hmm.add_transition(states[2], states[3], 0.45)
    hmm.add_transition(states[3], states[3], 0.5)
    hmm.add_transition(states[3], states[4], 0.5)
    hmm.add_transition(states[4], states[0], 0.99)
    hmm.add_transition(states[4], states[2], 0.01)
    pi = [0.05, 0.3, 0.3, 0.3, 0.05]
    for i in range(len(states)):
        hmm.add_transition(hmm.start, states[i], pi[i])
    hmm.bake()
    return hmm
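# Illustrative decoding with the model built above (assumes `caller` is an
# instance of the class that defines oriHMMParams; the observations are
# made up for illustration):
hmm = caller.oriHMMParams()
logp, path = hmm.viterbi([-5.0, -4.8, -0.2, 0.1, 4.9, 5.1])
print(logp, [state.name for _, state in path[1:]])  # path[0] is the start state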
def load_segmentation_model(modeldata):
    model = HiddenMarkovModel('model')
    states = {}
    for s in modeldata:
        if len(s['emission']) == 1:
            emission = NormalDistribution(*s['emission'][0][:2])
        else:
            weights = np.array([w for _, _, w in s['emission']])
            dists = [NormalDistribution(mu, sigma)
                     for mu, sigma, _ in s['emission']]
            emission = GeneralMixtureModel(dists, weights=weights)
        state = State(emission, name=s['name'])
        states[s['name']] = state
        model.add_state(state)
        if 'start_prob' in s:
            model.add_transition(model.start, state, s['start_prob'])
    for s in modeldata:
        current = states[s['name']]
        for nextstate, prob in s['transition']:
            model.add_transition(current, states[nextstate], prob)
    model.bake()
    return model
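# Hypothetical `modeldata` for the loader above, read off the loop: each state
# carries (mu, sigma, weight) emission triples; a single triple becomes a plain
# NormalDistribution, several become a GeneralMixtureModel:
modeldata = [
    {'name': 'background',
     'start_prob': 0.5,
     'emission': [(0.0, 1.0, 1.0)],
     'transition': [('background', 0.9), ('signal', 0.1)]},
    {'name': 'signal',
     'start_prob': 0.5,
     'emission': [(3.0, 1.0, 0.6), (6.0, 2.0, 0.4)],
     'transition': [('signal', 0.8), ('background', 0.2)]},
]
model = load_segmentation_model(modeldata)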
def addData(self, data, score):
    score = score.clip(min=1e-5)
    self.data = data
    self.score = score
    score_normed = self.score / np.linalg.norm(self.score, ord=1)
    try:
        model = GeneralMixtureModel.from_samples(
            MultivariateGaussianDistribution,
            n_components=self.n_comp,
            X=self.data,
            weights=score_normed)
        self.model = model
    except Exception:
        logging.info("caught an exception while fitting the mixture model")
def fit_mixture(self, pos_left, pos_right, weights, n_components=2,
                tol=1e-4, maxiter=4000, verbose=False):
    left, right = np.asarray(pos_left), np.asarray(pos_right)
    weights = np.asarray(weights)
    debugs = list() if verbose else None
    centers = (left + right) / 2.0
    init_gmm = GeneralMixtureModel.from_samples(
        MultivariateGaussianDistribution, n_components=n_components,
        X=centers, weights=weights, stop_threshold=0.01, n_jobs=2)
    init_mus, init_covs = list(), list()
    init_comp_ws = np.array(init_gmm.weights)
    init_comp_ws /= np.sum(init_comp_ws)
    for i in range(n_components):
        paras = init_gmm.distributions[i].parameters
        init_mus.append(np.array(paras[0]))
        init_covs.append(np.array(paras[1]))
    init_paras = self._paras_compose_(init_mus, init_covs, list(init_comp_ws))
    method = 'Nelder-Mead'
    res = opt.minimize(self._mixture_optpara, init_paras,
                       args=(left, right, weights, n_components, debugs),
                       method=method, tol=tol,
                       options={'maxiter': maxiter, 'disp': verbose})
    if verbose:
        print("Method:{}; Initial parameter: {};".format(method, init_paras))
        print("Converged Parameter: {}".format(res.x))
    mus, covs, comp_ws = self._paras_decompose_(res.x, n_components)
    return mus, covs, comp_ws, res.fun
def ghmm_model(states_labels: tuple, transitions: tuple, init_prob: tuple,
               end_prob: tuple, means: list, vars: list) -> HiddenMarkovModel:
    """
    Build a Gaussian-mixture HMM.

    :param states_labels: labels of the hidden states
    :param transitions: state transition probability matrix
    :param init_prob: initial (start) probability for each state
    :param end_prob: end probability for each state
    :param means: per-state list of mixture-component means
    :param vars: per-state list of mixture-component variances
    :return: a baked HiddenMarkovModel
    """
    hmm_model = HiddenMarkovModel()
    mix_num = len(vars[0])
    states = []
    for state_i, state in enumerate(states_labels):
        mixture = []
        for mix_i in range(mix_num):
            init_mean = means[state_i][mix_i]
            init_var = vars[state_i][mix_i]
            mixture.append(NormalDistribution(init_mean, init_var))
        states.append(State(GeneralMixtureModel(mixture), name=str(state_i)))
    hmm_model.add_states(*tuple(states))
    for row in range(len(states_labels)):
        for col in range(len(states_labels)):
            prob = transitions[row][col]
            if prob != 0.:
                hmm_model.add_transition(states[row], states[col], prob)
    for state_i, prob in enumerate(init_prob):
        if prob != 0.:
            hmm_model.add_transition(hmm_model.start, states[state_i], prob)
    for state_i, prob in enumerate(end_prob):
        if prob != 0.:
            hmm_model.add_transition(states[state_i], hmm_model.end, prob)
    hmm_model.bake()
    return hmm_model
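# Example call (values are illustrative only): two states, each emitting from
# a two-component Gaussian mixture, with no explicit end probabilities:
model = ghmm_model(states_labels=('s0', 's1'),
                   transitions=((0.7, 0.3), (0.4, 0.6)),
                   init_prob=(0.5, 0.5),
                   end_prob=(0.0, 0.0),
                   means=[[0.0, 1.0], [5.0, 6.0]],
                   vars=[[1.0, 1.0], [1.0, 1.0]])
print(model.log_probability([0.2, 0.8, 5.5]))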
def oriHMMParams(self, numdists=3):
    """
    Set initial parameters for the Hidden Markov Model (HMM).
    """
    # GMM emissions
    # 3 hidden states:
    # 0--downstream, 1--no bias, 2--upstream
    if numdists == 1:
        dists = [NormalDistribution(-2.5, 7.5),
                 NormalDistribution(0, 7.5),
                 NormalDistribution(2.5, 7.5)]
    else:
        var = 7.5 / (numdists - 1)
        means = [[], [], []]
        for i in range(numdists):
            means[0].append(i * 7.5 / (numdists - 1) + 2.5)
            means[1].append(i * 7.5 * (-1) ** i / (numdists - 1))
            means[2].append(-i * 7.5 / (numdists - 1) - 2.5)
        dists = []
        for i, m in enumerate(means):
            tmp = []
            for j in m:
                tmp.append(NormalDistribution(j, var))
            mixture = GeneralMixtureModel(tmp)
            dists.append(mixture)
    # transition matrix
    A = [[0.34, 0.33, 0.33],
         [0.33, 0.34, 0.33],
         [0.33, 0.33, 0.34]]
    starts = np.ones(3) / 3
    hmm = HiddenMarkovModel.from_matrix(A, dists, starts,
                                        state_names=['0', '1', '2'],
                                        name='mixture{0}'.format(numdists))
    return hmm
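# Sketch of fitting the initialized model (assumes `caller` is an instance of
# the class defining this oriHMMParams; the training sequences are synthetic):
hmm = caller.oriHMMParams(numdists=3)
seqs = [np.random.randn(200) * 2 for _ in range(5)]
hmm.fit(seqs, algorithm='baum-welch', max_iterations=500)
print(hmm.predict(seqs[0])[:10])  # Viterbi state indices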
def fit_mixture_model(counts):
    '''
    Code adapted from https://github.com/josephreplogle/guide_calling
    '''
    data = np.log2(counts + 1)
    reshaped_data = data.reshape(-1, 1)
    xs = np.linspace(-2, max(data) + 2, 1000)
    # Re-fit the model until it has converged with both components given
    # non-zero weight and the Poisson component in the first position with
    # the lower mean.
    while True:
        model = GeneralMixtureModel.from_samples(
            [PoissonDistribution, NormalDistribution], 2, reshaped_data)
        if 0 in model.weights:
            # One component was eliminated.
            continue
        elif np.isnan(model.probability(xs)).any():
            continue
        elif model.distributions[0].parameters[0] > model.distributions[1].parameters[0]:
            continue
        elif model.distributions[0].name != 'PoissonDistribution':
            continue
        else:
            break
    labels = model.predict(reshaped_data)
    xs = np.linspace(0, max(data) + 2, 1000)
    p_second_component = model.predict_proba(xs.reshape(-1, 1))[:, 1]
    threshold = 2 ** xs[np.argmax(p_second_component >= 0.5)]
    return labels, threshold
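# Illustrative use with simulated counts: a low-count background population
# plus a high-count positive population:
rng = np.random.RandomState(0)
counts = np.concatenate([rng.poisson(1, 500), rng.poisson(200, 500)])
labels, threshold = fit_mixture_model(counts)
print(threshold, np.bincount(labels))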
def getMixtureModelCutOff(samples, alpha, mu, sigma):
    mixture_m = GeneralMixtureModel([ExponentialDistribution(alpha),
                                     NormalDistribution(mu, sigma)])
    model = mixture_m.fit(samples.reshape(-1, 1))
    pred_alpha = model.distributions[0].parameters[0]
    return expon.ppf(0.95, 0, 1 / pred_alpha)
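# Illustrative use: exponential background plus a Gaussian signal; alpha, mu
# and sigma are initial guesses that the fit then refines:
rng = np.random.RandomState(0)
samples = np.concatenate([rng.exponential(2.0, 800), rng.normal(10.0, 1.0, 200)])
print(getMixtureModelCutOff(samples, alpha=0.5, mu=10.0, sigma=1.0))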
def _segment(self, arr, components=2):
    nonzero = arr[arr > 0]
    idx = self.hampel_filter(np.log2(nonzero))
    filtered = nonzero[idx]
    log_gmm = self.get_states(np.log2(filtered))
    log_means, log_probs = log_gmm.means_.ravel(), log_gmm.weights_
    ln_gmm = self.get_states(filtered)  # to improve the sensitivity
    ln_means, ln_probs = ln_gmm.means_.ravel(), ln_gmm.weights_
    if len(log_means) == 1:
        means, probs = ln_means, ln_probs
        scale = 'linear'
    else:
        means, probs = log_means, log_probs
        scale = 'log'
    logger.info('Estimated HMM state number: {0} ({1} scale)'.format(
        len(means), scale))
    model = HiddenMarkovModel()
    # GMM emissions
    dists = []
    for m in means:
        tmp = []
        for i in range(components):
            e = m + (-1) ** i * ((i + 1) // 2) * 0.5
            s = 0.5
            tmp.append(NormalDistribution(e, s))
        mixture = State(GeneralMixtureModel(tmp), name=str(m))
        dists.append(mixture)
    model.add_states(*tuple(dists))
    # transition matrix
    for i in range(len(means)):
        for j in range(len(means)):
            if i == j:
                model.add_transition(dists[i], dists[j], 0.8)
            else:
                model.add_transition(dists[i], dists[j], 0.2 / (len(means) - 1))
    # starts and ends
    for i in range(len(means)):
        model.add_transition(model.start, dists[i], probs[i])
    model.bake()
    # training sequences
    tmp = np.zeros(nonzero.size)
    tmp[idx] = filtered
    newarr = np.zeros(arr.size)
    newarr[arr > 0] = tmp
    if len(means) > 1:
        model.fit(self.pieces(newarr, scale=scale),
                  algorithm='baum-welch',
                  n_jobs=self.n_jobs,
                  max_iterations=5000,
                  stop_threshold=2e-4)
        queue = newarr[newarr > 0]
        if scale == 'log':
            seq = np.r_[[s.name for i, s in model.viterbi(np.log2(queue))[1][1:]]]
        else:
            seq = np.r_[[s.name for i, s in model.viterbi(queue)[1][1:]]]
        seg = self.assign_cnv(queue, seq)
        predicted = np.zeros(newarr.size)
        predicted[newarr > 0] = seg
        seg = self.call_intervals(predicted)
    else:
        seg = [(0, newarr.size)]
    return newarr, seg, scale
def mixture(self, args):
    weights = args[0]
    distributions = args[1]
    return GeneralMixtureModel(distributions, weights=weights)
def __init__(self, dim, seed=None):
    K = 9
    theta0 = [.5, .5]
    beta = np.ones(K)
    Psi = .1 * np.diag(np.ones(dim))
    # mu0 = np.zeros(dim)
    # lambd = .1
    nu = dim + 2.
    rstate = np.random.get_state()
    np.random.seed(seed)
    unif_dist = UniformDistribution(0., 1.)
    self.theta0 = theta0
    beta_dist = DirichletDistribution(beta)
    self.dim = Psi.shape[0]
    self.dists = []
    # same weights for both
    weights = beta_dist.sample()
    mus = []
    for i, _ in enumerate(theta0):
        # weights = beta_dist.sample()
        # print(weights)
        mix = []
        for j, _ in enumerate(weights):
            if j % 3 == 0:
                Sigma = invwishart.rvs(df=nu, scale=Psi)
            elif j % 3 == 1:
                Sigma = invwishart.rvs(df=nu, scale=.01 * Psi)
            else:
                Sigma = invwishart.rvs(df=nu, scale=.0001 * Psi)
            if i == 0:
                mu = unif_dist.sample(self.dim)
                # mu = MultivariateGaussianDistribution(mu0, Sigma / lambd).sample()
                mus.append(mu)
            else:
                mu = mus[j]
            mix.append(MultivariateGaussianDistribution(mu, Sigma))
        model = GeneralMixtureModel(mix, weights=weights)
        self.dists.append(model)
    for d in self.dists:
        print(d)
    self.rstate = np.random.get_state()
    np.random.set_state(rstate)
from pomegranate import (
    NaiveBayes,
    NormalDistribution,
    UniformDistribution,
    ExponentialDistribution,
    GeneralMixtureModel,
    MultivariateGaussianDistribution,
    BernoulliDistribution,
)
import pandas as pd
import numpy as np

X = pd.DataFrame({"A": [1, 0, 1, 0, 1], "B": [1, 1, 1, 1, 0]})
x = BernoulliDistribution(0.4)
vals = [x.sample() for i in range(1000)]
model = NaiveBayes([
    NormalDistribution(5, 2),
    UniformDistribution(0, 10),
    ExponentialDistribution(1.0)
])
model.predict(np.array([[10]]))
model = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                         n_components=3, X=X)
# for now, no ratio in data (no rates A, B or C in this dataset)
only_flux = True
scale_flux = False

hdulist = pyfits.open('../iirc_data/all_data_for_ml.fits')
data = hdulist[1].data
X_flux, X, data_thr, data_fr_en, data_fr_err = get_iirc_data(
    data, only_flux=only_flux, scale_flux=scale_flux, thresholded=True)

# GMM with 3 components:
np.random.seed(0)
gmm = GeneralMixtureModel(MultivariateGaussianDistribution, n_components=3)
gmm.fit(X)
preds = gmm.predict(X)
probs = gmm.predict_proba(X)
data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "yellow", "blue", "grey", "black",
             "purple", "pink", "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds)) + 1]

covs = np.array([np.array(gmm.distributions[m].parameters[1])
                 for m in range(len(gmm.distributions))])
means = np.array([np.array(gmm.distributions[m].parameters[0])
                  for m in range(len(gmm.distributions))])
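# The constructor-then-fit pattern above relies on the older pomegranate API;
# on recent releases the equivalent one-step call is from_samples:
gmm2 = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                        n_components=3, X=X)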
class discrete(object):

    def __init__(self, bin_ends, weights):
        """
        Binned function class for any discrete function

        Parameters
        ----------
        bin_ends: numpy.ndarray, float
            endpoints of bins
        weights: numpy.ndarray, float
            relative weights associated with each bin

        Notes
        -----
        TO DO: Rename to piecewise constant or somesuch
        """
        self.bin_ends = bin_ends
        self.dbins = self.bin_ends[1:] - self.bin_ends[:-1]
        self.n_bins = len(self.bin_ends) - 1
        self.bin_range = range(self.n_bins)
        self.weights = weights
        self.normweights = np.cumsum(self.weights) / np.sum(self.weights)
        self.distweights = np.cumsum(self.weights) / np.dot(self.weights, self.dbins)
        self.funcs = [UD(self.bin_ends[i], self.bin_ends[i + 1])
                      for i in self.bin_range]
        if self.n_bins > 1:
            self.dist = GMM(self.funcs, weights=self.weights)
        else:
            self.dist = self.funcs[0]

    def pdf(self, xs):
        return self.evaluate(xs)

    def evaluate_one(self, x):
        """
        Function to evaluate the discrete probability distribution at one point

        Parameters
        ----------
        x: float
            value at which to evaluate discrete probability distribution

        Returns
        -------
        p: float
            value of discrete probability distribution at x
        """
        p = self.dist.probability(x)
        return p

    def evaluate(self, xs):
        """
        Function to evaluate the discrete probability distribution at many points

        Parameters
        ----------
        xs: ndarray, float
            values at which to evaluate discrete probability distribution

        Returns
        -------
        ps: ndarray, float
            values of discrete probability distribution at xs
        """
        ps = self.dist.probability(xs)
        return ps

    def sample_one(self):
        """
        Function to sample a single value from discrete probability distribution

        Returns
        -------
        x: float
            a single point sampled from the discrete probability distribution
        """
        x = self.dist.sample(1)
        return x

    def sample(self, n_samps):
        """
        Function to take samples from discrete probability distribution

        Parameters
        ----------
        n_samps: int
            number of samples to take

        Returns
        -------
        xs: ndarray, float
            array of points sampled from the discrete probability distribution
        """
        xs = np.array(self.dist.sample(n_samps))
        return xs
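# Illustrative use of the class above (UD and GMM are assumed to be the
# pomegranate UniformDistribution and GeneralMixtureModel aliases this class
# expects): three bins on [0, 3) with unequal weights.
bins = np.array([0., 1., 2., 3.])
pc = discrete(bins, np.array([0.2, 0.5, 0.3]))
print(pc.pdf(np.array([0.5, 1.5, 2.5])))
print(pc.sample(4))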
def _initDists(self, X, distribution=MultivariateGaussianDistribution):
    technique = "R_MV-GMM"  # mixture of multivariate Gaussian distributions
    if technique == "GMM":  # Gaussian mixture model
        # uvgd = NormalDistribution.from_samples(X)
        # gmm = GeneralMixtureModel([uvgd.copy() for _ in range(self.nmix)])
        gmm = GeneralMixtureModel.from_samples(
            distributions=[NormalDistribution for _ in range(self.nmix)], X=X)
        dists = [gmm.copy() for _ in range(self.statesNumber)]
    elif technique == "MV-GMM":  # multivariate Gaussian mixture model
        # mvgd = MultivariateGaussianDistribution.from_samples(X)
        # gmm = GeneralMixtureModel([mvgd.copy() for _ in range(self.nmix)])
        gmm = GeneralMixtureModel.from_samples(
            distributions=[MultivariateGaussianDistribution
                           for _ in range(self.nmix)],
            X=X, n_components=3)
        dists = [gmm.copy() for _ in range(self.statesNumber)]
    elif technique == "MVG":
        y = self._initkmeans(X=X, numClasses=self.statesNumber)
        dists = [MultivariateGaussianDistribution.from_samples(X=X[y == i])
                 for i in range(self.statesNumber)]
    elif technique == "R_GMM":  # random Gaussian mixture model
        randNormal = lambda: NormalDistribution(np.random.randint(1, 10), 1)
        randGMM = lambda: GeneralMixtureModel(
            [randNormal() for _ in range(self.nmix)])
        dists = [randGMM() for _ in range(self.statesNumber)]
    elif technique == "R_MV-GMM":  # random multivariate gaussian mixture model
        randGMM = lambda: GeneralMixtureModel(
            [randMVG() for _ in range(self.nmix)])
        dists = [randGMM() for _ in range(self.statesNumber)]
    return dists

    # * not completed:
    # ! GMM-HMM-k
    y = self._initkmeans(X, self.statesNumber)
    # list(map(print, y))
    return [
        GeneralMixtureModel.from_samples(distribution, X=X[y == i],
                                         n_components=self.nmix)
        for i in range(self.statesNumber)
    ]

    # ! Kmeans init
    if not isinstance(X, BaseGenerator):
        data_generator = SequenceGenerator(X, None, None)
    else:
        data_generator = X
    initialization_batch_size = len(data_generator)
    X_ = []
    data = data_generator.batches()
    for i in range(initialization_batch_size):
        batch = next(data)
        X_.extend(batch[0])
    X_concat = np.concatenate(X_)
    if X_concat.ndim == 1:
        X_concat = X_concat.reshape(X_concat.shape[0], 1)
    n, d = X_concat.shape
    clf = Kmeans(self.statesNumber, init="kmeans++", n_init=1)  # init should be one of
    clf.fit(X_concat, max_iterations=None, batches_per_epoch=None)
    y = clf.predict(X_concat)
    if callable(distribution):
        if d == 1:
            dists = [distribution.from_samples(X_concat[y == i][:, 0])
                     for i in range(self.statesNumber)]
        elif distribution.blank().d > 1:
            dists = [distribution.from_samples(X_concat[y == i])
                     for i in range(self.statesNumber)]
        else:
            print("error")
    return dists
class gmix(object):

    def __init__(self, amps, funcs, limits=(d.min_x, d.max_x)):
        """
        Object to define a mixture probability distribution

        Parameters
        ----------
        amps: ndarray, float
            array with one relative amplitude per component
        funcs: list, chippr.gauss or chippr.discrete objects
            list of components
        limits: tuple or list or numpy.ndarray, float, optional
            minimum and maximum sample values to return
        """
        self.amps = amps / np.sum(amps)
        self.cumamps = np.cumsum(self.amps)
        self.n_comps = len(self.amps)
        self.funcs = [func.dist for func in funcs]
        self.dims = np.shape(np.array(limits).T)[0]
        self.min_x = limits[0]
        self.max_x = limits[1]
        self.dist = GMM(self.funcs, weights=self.amps)

    def pdf(self, xs):
        return self.evaluate(xs)

    def evaluate_one(self, x):
        """
        Function to evaluate Gaussian mixture once

        Parameters
        ----------
        x: float
            value at which to evaluate Gaussian mixture

        Returns
        -------
        p: float
            probability associated with x
        """
        p = self.dist.probability(x)
        return p

    def evaluate(self, xs):
        """
        Function to evaluate the Gaussian mixture probability distribution at many points

        Parameters
        ----------
        xs: ndarray, float
            values at which to evaluate Gaussian mixture probability distribution

        Returns
        -------
        ps: ndarray, float
            values of Gaussian mixture probability distribution at xs
        """
        ps = self.dist.probability(xs)
        return ps

    def sample_one(self):
        """
        Function to sample a single value from Gaussian mixture probability distribution

        Returns
        -------
        x: float
            a single point sampled from the Gaussian mixture probability distribution
        """
        x = self.dist.sample(1)
        return x

    def sample(self, n_samps):
        """
        Function to take samples from Gaussian mixture probability distribution

        Parameters
        ----------
        n_samps: int
            number of samples to take

        Returns
        -------
        xs: ndarray, float
            array of points sampled from the Gaussian mixture probability distribution
        """
        xs = np.array(self.dist.sample(n_samps))
        return xs
def randMVGMM(n=40, nmix=5, dist=MultivariateGaussianDistribution):
    '''generate a random Gaussian mixture model of multivariate distributions'''
    dists = [randMVG(n=n) for _ in range(nmix)]
    return GeneralMixtureModel(dists)
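# randMVG is not shown in this snippet; a minimal stand-in consistent with its
# use above (hypothetical -- the real helper may differ), sampling a random
# d-dimensional Gaussian from n random points:
def randMVG(n=40, d=3):
    pts = np.random.randn(n, d) + 5 * np.random.randn(d)
    return MultivariateGaussianDistribution.from_samples(pts)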
time_list = [100, 500, 900, 1500]
for time in time_list:
    samples = hiker_paths.get_all_at_time(time)
    weights = p_filter.weighting_func(log_weights)
    print(weights)
    print(samples)
    # NormalDistribution
    samples = [[float(item[0]), float(item[1])] for item in samples]
    test = np.random.multivariate_normal([50, 50], [[1, 0], [0, 1]], 10)
    print(test)
    gmm = GeneralMixtureModel.from_samples(MultivariateGaussianDistribution,
                                           n_components=4, X=samples,
                                           weights=weights)
    clf = pomegranate_to_scikitlearn(gmm)
    graph_shape = depth_dict["data"].shape
    print(graph_shape)
    # display predicted scores by the model as a contour plot
    ax = plt.subplot(111)
    x = np.linspace(0.0, graph_shape[0])
    y = np.linspace(0.0, graph_shape[1])
    X, Y = np.meshgrid(x, y)
    XX = np.array([X.ravel(), Y.ravel()]).T
# print("Naive Bayes - Semisupervised Learning Accuracy: {}".format((model_b.predict(x_val) == y_val).mean())) model_c = BayesClassifier.from_samples(MultivariateGaussianDistribution, x_train, y_train, inertia=0.0, pseudocount=0.0, stop_threshold=0.1, max_iterations=100, verbose=True, n_jobs=1) print("Bayes Classifier - Semisupervised Learning Accuracy: {}".format( (model_c.predict(x_val) == y_val).mean())) # general mixture model d0 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 0]) d1 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 1]) d2 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 2]) d3 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 3]) d4 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 4]) d5 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 5]) d6 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 6]) d7 = GeneralMixtureModel.from_samples(NormalDistribution, 2, x_train[y_train == 7]) d8 = GeneralMixtureModel.from_samples(NormalDistribution, 2,
np.random.seed(0)
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]

Html_file = open("gmm_pomegranate_files/gmm3w_pomegranate.html", "w")

scaler = StandardScaler()
X = scaler.fit_transform(X)
# 1 corresponds to data_thr.rate and 4 = 5 - 1 to data_thr.rateC
w = w / np.sqrt(scaler.var_[1:])
# w = np.exp(-np.exp(3 * w.mean(axis=1)))
w = 1. / w.mean(axis=1) ** 2

gmm = GeneralMixtureModel(MultivariateGaussianDistribution, n_components=3)
gmm.fit(X, weights=w)
preds = gmm.predict(X)
probs = gmm.predict_proba(X)
data_thr['preds'] = pd.Series(preds).astype("category")

color_key = ["red", "blue", "yellow", "grey", "black",
             "purple", "pink", "brown", "green", "orange"]  # Spectral9
color_key = color_key[:len(set(preds)) + 1]

covs = np.array([np.array(gmm.distributions[m].parameters[1])
                 for m in range(len(gmm.distributions))])
means = np.array([np.array(gmm.distributions[m].parameters[0])
                  for m in range(len(gmm.distributions))])
def generate_guide_rna_prediction(loom,
                                  guide_rnas,
                                  nguide_ca='nGuide',
                                  nguide_reads_ca='nGuideReads',
                                  cell_prediction_summary_ca='CellGuidePrediction',
                                  overwrite=False,
                                  only_generate_log2=False,
                                  ncell_threshold_for_guide=10,
                                  nguide_threshold_for_cell=10):
    """
    This approach is inspired by Replogle et al. 2018
    (https://doi.org/10.1038/s41587-020-0470-y). However, instead of a
    Gaussian/Poisson mixture, this routine uses a Poisson/Poisson mixture.
    This routine uses the pomegranate package
    (https://github.com/jmschrei/pomegranate).

    Parameters
    ----------
    loom : LoomConnection
        A LoomConnection object upon which guide rna predictions will be made
    guide_rnas : iterable of strings
        A list or other iterable of strings, each corresponding to a column
        attribute of `loom` indicating the raw counts of a given guide RNA
        over cells
    nguide_ca : str
        QC metric, indicating the name of the column attribute to use to
        indicate the number of predicted guide RNAs for a cell
        (Default value = 'nGuide')
    nguide_reads_ca : str
        QC metric, indicating the name of the column attribute to use to
        indicate the total number of guide RNA reads for a cell
        (Default value = 'nGuideReads')
    cell_prediction_summary_ca : str
        Indicates the name of the column attribute to use to indicate a
        summary of positively-predicted guide RNAs for a cell
        (Default value = 'CellGuidePrediction')
    overwrite : bool
        If False, will raise an exception if requested column attributes have
        already been written. If True, will overwrite existing column
        attributes. (Default value = False)
    only_generate_log2 : bool
        If True, will generate log2 guide RNA counts, but will not apply any
        mixture model prediction. (Default value = False)
    ncell_threshold_for_guide : int
        Threshold for the number of cells wherein a guide should have nonzero
        counts for the mixture model to attempt prediction.
        (Default value = 10)
    nguide_threshold_for_cell : int
        Threshold for the number of guides to be detected in a given cell to
        attempt to make a prediction for that particular cell.
        (Default value = 10)

    Returns
    -------

    """
    from panopticon.utilities import import_check
    exit_code = import_check("pomegranate",
                             'conda install -c anaconda pomegranate')
    if exit_code != 0:
        return
    import pandas as pd

    if nguide_reads_ca in loom.ca.keys() and overwrite == False:
        raise Exception(
            "{} already in loom.ca.keys(); if intended, set overwrite argument to True"
            .format(nguide_reads_ca))
    guide_rna_dfs = []
    for guide_rna in guide_rnas:
        guide_rna_dfs.append(
            pd.DataFrame(loom.ca[guide_rna], columns=[guide_rna], copy=True))
    guide_rna_dfs = pd.concat(guide_rna_dfs, axis=1)
    loom.ca[nguide_reads_ca] = guide_rna_dfs.sum(axis=1).values
    threshold_for_cell_mask = loom.ca[nguide_reads_ca] >= nguide_threshold_for_cell
    prediction_ca_names = []
    for guide_rna in guide_rnas:
        if guide_rna not in loom.ca.keys():
            raise Exception(
                "raw_antibody_count_df must be prepared such that columns match "
                "column attributes in loom corresponding to raw antibody conjugate counts")
        new_ca_name = guide_rna + '_log2'
        if new_ca_name in loom.ca.keys() and overwrite == False:
            raise Exception(
                "{} already in loom.ca.keys(); rename guide column attribute and "
                "re-run, or set overwrite argument to True".format(new_ca_name))
        loom.ca[new_ca_name] = np.log2(loom.ca[guide_rna])
        if not only_generate_log2:
            from pomegranate import GeneralMixtureModel, PoissonDistribution
            prediction_ca_name = guide_rna + '_prediction'
            prediction_ca_names.append(prediction_ca_name)
            if prediction_ca_name in loom.ca.keys() and overwrite == False:
                raise Exception(
                    "{} already in loom.ca.keys(); rename guide rna column attribute and "
                    "re-run, or set overwrite argument to True".format(prediction_ca_name))
            if (~np.isfinite(loom.ca[new_ca_name])).sum() > 0:
                cellmask = np.isfinite(loom.ca[new_ca_name])
                if cellmask.sum() >= ncell_threshold_for_guide:  # have minimum cells for guide
                    model = GeneralMixtureModel.from_samples(
                        [PoissonDistribution, PoissonDistribution],
                        n_components=2,
                        X=loom.ca[new_ca_name][cellmask.nonzero()[0]].reshape(-1, 1))
                    predictions = []
                    for val in loom.ca[new_ca_name]:
                        if not np.isfinite(val):
                            predictions.append(np.nan)
                        else:
                            predictions.append(
                                model.predict(np.array(val).reshape(-1, 1))[0])
                else:
                    predictions = [0] * loom.shape[1]
                predictions = np.array(predictions)
            else:
                # print(guide_rna, loom.ca[guide_rna].sum())
                # print('Warning: pomegranate Poisson/Normal mixture model has predicted a
                # Poisson component with greater log(UMI+1) counts than normal component.
                # This is unusual behavior!')
                model = GeneralMixtureModel.from_samples(
                    [PoissonDistribution, PoissonDistribution],
                    n_components=2,
                    X=loom.ca[new_ca_name].reshape(-1, 1))
                # model.fit(loom.ca[new_ca_name].reshape(-1, 1))
                predictions = model.predict(loom.ca[new_ca_name].reshape(-1, 1))
                if loom.ca[new_ca_name][np.array(predictions) == 0].mean() > \
                        loom.ca[new_ca_name][np.array(predictions) == 1].mean():
                    predictions = 1 - predictions
                predictions = np.array(predictions)
            predictions = np.nan_to_num(predictions, nan=0.0)
            predictions *= threshold_for_cell_mask
            loom.ca[prediction_ca_name] = predictions
    guide_prediction_dfs = []
    for prediction_ca_name in prediction_ca_names:
        guide_prediction_dfs.append(
            pd.DataFrame(loom.ca[prediction_ca_name],
                         columns=[prediction_ca_name], copy=True))
    guide_prediction_dfs = pd.concat(guide_prediction_dfs, axis=1)
    loom.ca[nguide_ca] = guide_prediction_dfs.sum(axis=1).values
    loom.ca[cell_prediction_summary_ca] = guide_prediction_dfs.apply(
        lambda x: '+'.join(guide_prediction_dfs.columns[np.where(x == 1)[0]]),
        axis=1).values
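# Hypothetical invocation on a loom file with two guide-count column
# attributes (the file name and column names are placeholders):
import loompy
with loompy.connect('screen.loom') as loom:
    generate_guide_rna_prediction(loom, ['sgRNA_A', 'sgRNA_B'], overwrite=True)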