def loglikelihoods_from_seqs(self, X, bPosterior=False, bIdx=False, startIdx=1):
    '''
    X: sample x (length*dim)
    return: the likelihoods over time (in single data)
    '''
    ll_likelihoods = []
    ll_posteriors  = []

    for i in xrange(len(X)):
        l_likelihood = []
        l_posterior  = []

        for j in xrange(startIdx, len(X[i]) / self.nEmissionDim):
            if isinstance(X[i], np.ndarray) or isinstance(X[i], list):
                try:
                    final_ts_obj = ghmm.EmissionSequence(self.F, X[i, :j * self.nEmissionDim].tolist())
                except:
                    print "failed to make sequence"
                    continue
            else:
                final_ts_obj = ghmm.EmissionSequence(self.F, list(X[i])[:j * self.nEmissionDim])

            try:
                logp = self.ml.loglikelihood(final_ts_obj)
                if bPosterior:
                    post = np.array(self.ml.posterior(final_ts_obj))
            except:
                print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
                continue

            l_likelihood.append(logp)
            if bPosterior:
                l_posterior.append(post[j - 1])

        ll_likelihoods.append(l_likelihood)
        if bPosterior:
            ll_posteriors.append(l_posterior)

    if bIdx:
        ll_idx = []
        for ii in xrange(len(X)):
            l_idx = []
            for jj in xrange(startIdx, len(X[ii]) / self.nEmissionDim):
                l_idx.append(jj)
            ll_idx.append(l_idx)

        if bPosterior:
            return ll_likelihoods, ll_posteriors, ll_idx
        else:
            return ll_likelihoods, ll_idx
    else:
        if bPosterior:
            return ll_likelihoods, ll_posteriors
        else:
            return ll_likelihoods
def _includeOutliers(models, trainData, outliers):
    means = []
    stds = []
    for i in range(len(models)):
        mean = 0
        variance = 0

        # Calculate the model mean
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            mean += abs(models[i].loglikelihood(eSeq))
        mean /= (len(trainData[i]) * 1.0)
        means.append(mean)

        # Calculate the model variance
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            variance += (mean - v)**2
        variance /= (len(trainData[i]) * 1.0)
        stds.append(variance**0.5)

    # For each data element in outliers, find the model it fits best.
    # If the outlier fits that model within one standard deviation,
    # include it back into the data. Iterate over a copy of the list:
    # removing from a list while iterating over it skips elements.
    for tmp in list(outliers):
        eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
        best = -1
        bestModel = -1
        for j in range(len(models)):
            val = abs(models[j].loglikelihood(eSeq))
            if val < best or best == -1:
                best = val
                bestModel = j

        # Determine if the best fit is "good" enough.
        # If it is, add the outlier back into the model's training data.
        if (best - means[bestModel]) < 1 * stds[bestModel]:
            trainData[bestModel].append(tmp)
            outliers.remove(tmp)
    return trainData, outliers
def hidden_state_change_lik(self, ts):
    """
    Note: compares the distributions of different states. If two states
    are too similar, they are merged during the probability computation.
    """
    obs_seq = self.get_obs_seqs(ts.y)
    emission_seq = ghmm.EmissionSequence(self.emission_domain, obs_seq)

    # `scale` is the vector of per-step scaling coefficients from ghmm's
    # scaled forward pass; backward() needs it to stay numerically consistent.
    forward, scale = self.model.forward(emission_seq)
    backward = self.model.backward(emission_seq, scale)

    n = len(self.A)
    ts_hidden_state_change_lik = time_series.dist_ts(ts)
    for t in xrange(len(ts.y) - 1):
        lik = 0.0
        for i in xrange(n):
            for j in xrange(n):
                if self.states_are_diff(i, j):
                    lik += self.xi(n, forward, backward, ts.y[t + 1], i, j, t)

        ts_hidden_state_change_lik.x.append(ts.x[t])
        ts_hidden_state_change_lik.y.append(lik)
    return ts_hidden_state_change_lik
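# For reference, the quantity self.xi() is presumably computing above (the
# standard HMM xi term; the exact signature in the source may differ):
#   xi_t(i, j) = alpha_t(i) * A[i][j] * b_j(o_{t+1}) * beta_{t+1}(j) / P(O)
# i.e. the probability of being in state i at time t and state j at t+1.
# Summing it over pairs of distinct states gives the likelihood of a
# hidden-state change at time t, which is what the loop accumulates.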
def predict2(self, X, x1, x2, x3, x4):
    X = np.squeeze(X)
    X_test = X.tolist()
    n = len(X_test)

    mu_l  = np.zeros(self.nEmissionDim)
    cov_l = np.zeros(self.nEmissionDim**2)

    final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1, x2, x3, x4])

    try:
        (alpha, scale) = self.ml.forward(final_ts_obj)
    except:
        if self.verbose:
            print "No alpha is available !!"
        sys.exit()

    alpha = np.array(alpha)

    for j in xrange(self.nState):
        [[mu1, mu2, mu3, mu4],
         [cov11, cov12, cov13, cov14,
          cov21, cov22, cov23, cov24,
          cov31, cov32, cov33, cov34,
          cov41, cov42, cov43, cov44]] = self.B[j]

        mu_l[0]  = x1
        mu_l[1] += alpha[n/self.nEmissionDim, j] * (mu2 + cov21/cov11*(x1 - mu1))
        mu_l[2] += alpha[n/self.nEmissionDim, j] * (mu3 + cov31/cov21*(x2 - mu2))  # TODO Where does this come from?
        mu_l[3] += alpha[n/self.nEmissionDim, j] * (mu4 + cov41/cov31*(x3 - mu3))  # TODO Where does this come from?

        ## cov_l[0] += (cov11)*(total**2)
        ## cov_l[1] += (cov12)*(total**2)
        ## cov_l[2] += (cov21)*(total**2)
        ## cov_l[3] += (cov22)*(total**2)

    return mu_l, cov_l
def allLikelihoods(self, X1, X2, X3, X4):
    X_test = self.convert_sequence(X1, X2, X3, X4, emission=False)

    m = len(np.squeeze(X1))
    ll_likelihood     = np.zeros(m)
    ll_state_idx      = np.zeros(m)
    ll_likelihood_mu  = np.zeros(m)
    ll_likelihood_std = np.zeros(m)
    for i in xrange(1, m):
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0, :i*self.nEmissionDim].tolist())
        logp = self.ml.loglikelihood(final_ts_obj)
        post = np.array(self.ml.posterior(final_ts_obj))

        # Find the best posterior distribution
        min_index, min_dist = self.findBestPosteriorDistribution(post[i-1])

        ll_likelihood[i]     = logp
        ll_state_idx[i]      = min_index
        ll_likelihood_mu[i]  = self.ll_mu[min_index]
        ll_likelihood_std[i] = self.ll_std[min_index]

    return ll_likelihood, ll_state_idx, ll_likelihood_mu, ll_likelihood_std
def predict2(self, X, x1):
    X = np.squeeze(X)
    X_test = X.tolist()
    n = len(X_test)

    mu_l  = np.zeros(self.nEmissionDim)
    cov_l = np.zeros(self.nEmissionDim**2)

    final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1])

    try:
        (alpha, scale) = self.ml.forward(final_ts_obj)
    except:
        print "No alpha is available !!"
        sys.exit()

    alpha = np.array(alpha)

    for j in xrange(self.nState):
        [[mu1], [cov11]] = self.B[j]
        # With a single emission dimension there is nothing to condition on:
        # the only component is the given observation itself.
        mu_l[0] = x1

    return mu_l, cov_l
def test_backward_against_ghmm(self):
    from kerehmm.test.util import ghmm_from_discrete_hmm
    import ghmm
    hmm = self.new_hmm(random_emissions=True, random_transitions=True)
    hmm_reference = ghmm_from_discrete_hmm(hmm)
    observation_size = 10
    observed = np.random.choice(range(self.nSymbols), size=observation_size).tolist()
    seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, observed)

    _, scale = hmm.forward(observations=observed)

    # remember that we have to convert stuff from ghmm to log scale
    _, scale_reference = map(np.array, hmm_reference.forward(seq))
    print "Scale reference", scale_reference
    assert np.allclose(scale, scale_reference)

    # this is the reference backward array, untransformed (scaled)
    backward_reference = np.array(hmm_reference.backward(seq, scalingVector=scale_reference))
    print "Backward reference (scaled)", backward_reference

    backward = hmm.backward(observed, scale_coefficients=scale)
    print "Backward", backward

    # test shape
    assert backward.shape == backward_reference.shape

    # test values
    assert np.allclose(backward, backward_reference)
def forecast(self, data, future=1):
    """Forecast for a model the probability of each observation.

    The equation is:
        p(o_{t+1}) = sum_j p(o_{t+1} | s_{t+1} = j) * p(s_{t+1} = j)
    where p(s_{t+1} = j) is found through the forward algorithm.
    """
    state   = self.model.asMatrices()[0]
    observe = self.model.asMatrices()[1]
    ps1 = [0.0] * len(state[0])
    po1 = [0.0] * len(observe[0])
    tmp = ghmm.EmissionSequence(ghmm.Float(), data)
    # Filtered state distribution at the last observed time step
    ps = self.model.forward(tmp)[0][-1]

    # One-step state prediction: p(s_{t+1}=j) = sum_i A[i][j] * p(s_t=i)
    for j in range(len(ps1)):
        for i in range(len(ps)):
            ps1[j] += state[i][j] * ps[i]

    # Predicted observation distribution
    for k in range(len(po1)):
        for j in range(len(ps1)):
            po1[k] += observe[j][k] * ps1[j]
    return po1[0]
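# A hand-worked check of the forecast equation with made-up numbers
# (illustrative only, not from the source). For a 2-state model with
#   A = [[0.9, 0.1], [0.2, 0.8]]   and   B = [[0.7, 0.3], [0.1, 0.9]],
# if the filtered state distribution at time t is ps = [0.6, 0.4], then
#   p(s_{t+1}=0) = 0.6*0.9 + 0.4*0.2 = 0.62
#   p(s_{t+1}=1) = 0.6*0.1 + 0.4*0.8 = 0.38
#   p(o_{t+1}=0) = 0.62*0.7 + 0.38*0.1 = 0.472
# which is the value forecast() would return as po1[0].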
def anomaly_check(self, X1, ths_mult=None):
    if self.nEmissionDim == 1:
        X_test = np.array([X1])
    else:
        X_test = self.convert_sequence(X1, emission=False)

    try:
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
        logp = self.ml.loglikelihood(final_ts_obj)
    except:
        if self.verbose:
            print "Too different input profile that cannot be expressed by emission matrix"
        return -1, 0.0  # error

    try:
        post = np.array(self.ml.posterior(final_ts_obj))
    except:
        if self.verbose:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
        return True, 0.0  # anomaly

    n = len(np.squeeze(X1))

    # Find the best posterior distribution
    min_index, min_dist = self.findBestPosteriorDistribution(post[n - 1])

    if (type(ths_mult) == list or type(ths_mult) == np.ndarray or type(ths_mult) == tuple) and len(ths_mult) > 1:
        err = logp - (self.ll_mu[min_index] + ths_mult[min_index] * self.ll_std[min_index])
    else:
        err = logp - (self.ll_mu[min_index] + ths_mult * self.ll_std[min_index])

    if err < self.anomaly_offset:
        return True, err
    else:
        return False, err
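# Decision rule used above, in words: the sequence is flagged anomalous when
# its log-likelihood falls more than ths_mult standard deviations below the
# mean log-likelihood of the best-matching posterior cluster, i.e. when
#   logp < ll_mu[k] + ths_mult * ll_std[k] + anomaly_offset
# where k is the cluster index returned by findBestPosteriorDistribution().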
def learn_likelihoods_progress(i, n, m, A, B, pi, F, X_train, nEmissionDim, g_mu, g_sig, nState):
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    l_likelihood_mean  = 0.0
    l_likelihood_mean2 = 0.0
    l_statePosterior   = np.zeros(nState)

    for j in xrange(n):
        g_post   = np.zeros(nState)
        g_lhood  = 0.0
        g_lhood2 = 0.0
        prop_sum = 0.0

        for k in xrange(1, m):
            final_ts_obj = ghmm.EmissionSequence(F, X_train[j][:k*nEmissionDim])
            logp = ml.loglikelihoods(final_ts_obj)[0]
            post = np.array(ml.posterior(final_ts_obj))

            # Gaussian weight centered at g_mu with width g_sig
            k_prop    = norm(loc=g_mu, scale=g_sig).pdf(k)
            g_post   += post[k-1] * k_prop
            g_lhood  += logp * k_prop
            g_lhood2 += logp * logp * k_prop

            prop_sum += k_prop

        l_statePosterior   += g_post / prop_sum / float(n)
        l_likelihood_mean  += g_lhood / prop_sum / float(n)
        l_likelihood_mean2 += g_lhood2 / prop_sum / float(n)

    return i, l_statePosterior, l_likelihood_mean, np.sqrt(l_likelihood_mean2 - l_likelihood_mean**2)
def train(self):
    # This tells GHMM every possible value that it will be seeing
    alphabet = ghmm.Alphabet(list(set(self.events)))
    alphaLen = len(alphabet)

    # Initialize the probabilities of transitioning from each state to each
    # other state. There is probably a better way to do this, but this is
    # nice and simple.
    trans_prob = 1.0 / alphaLen
    trans = [[trans_prob for row in range(alphaLen)] for col in range(alphaLen)]

    # Initialize the probabilities of seeing each output from each state.
    # Again, there is probably a better way to do this, but this is simple.
    emiss_prob = 1.0 / alphaLen
    emiss = [[emiss_prob for row in range(alphaLen)] for col in range(alphaLen)]

    # Some grease to get GHMM to work
    pi = [1.0 / alphaLen] * alphaLen

    # The sequence of musical events gathered from the music
    train_seq = ghmm.EmissionSequence(alphabet, self.events)

    # Generate the model of the data
    m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet), trans, emiss, pi)

    # Train the model based on the training sequence
    m.baumWelch(train_seq)

    return (m, alphabet)
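# A minimal, self-contained sketch of the same ghmm pattern train() uses,
# with toy integer events (assumed data, not from the source): uniform
# initial matrices, then Baum-Welch re-estimation, then scoring a new
# sequence against the trained model.
import ghmm

events = [0, 1, 2, 1, 0, 2, 1, 1]
alphabet = ghmm.Alphabet(list(set(events)))
k = len(alphabet)
trans = [[1.0 / k] * k for _ in range(k)]
emiss = [[1.0 / k] * k for _ in range(k)]
pi = [1.0 / k] * k
m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                         trans, emiss, pi)
m.baumWelch(ghmm.EmissionSequence(alphabet, events))
print m.loglikelihood(ghmm.EmissionSequence(alphabet, [0, 1, 2]))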
def expLikelihoods(self, X, ths_mult=None):
    if self.nEmissionDim == 1:
        X_test = np.array([X[0]])
    else:
        X_test = self.convert_sequence(X, emission=False)

    try:
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
        logp = self.ml.loglikelihood(final_ts_obj)
    except:
        print "Too different input profile that cannot be expressed by emission matrix"
        return -1, 0.0  # error

    try:
        post = np.array(self.ml.posterior(final_ts_obj))
    except:
        print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
        return 1.0, 0.0  # anomaly

    n = len(np.squeeze(X[0]))

    # Find the best posterior distribution
    min_index, min_dist = self.findBestPosteriorDistribution(post[n-1])

    if (type(ths_mult) == list or type(ths_mult) == np.ndarray or type(ths_mult) == tuple) and len(ths_mult) > 1:
        return self.ll_mu[min_index] + ths_mult[min_index] * self.ll_std[min_index]
    else:
        return self.ll_mu[min_index] + ths_mult * self.ll_std[min_index]
def predict2(self, X, x1, x2):
    X = np.squeeze(X)
    X_test = X.tolist()
    n = len(X_test)

    mu_l  = np.zeros(2)
    cov_l = np.zeros(4)

    final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1, x2])

    try:
        (alpha, scale) = self.ml.forward(final_ts_obj)
    except:
        print "No alpha is available !!"
        sys.exit()

    alpha = np.array(alpha)

    for j in xrange(self.nState):
        [[mu1, mu2], [cov11, cov12, cov21, cov22]] = self.B[j]

        mu_l[0]  = x1
        mu_l[1] += alpha[n / self.nEmissionDim, j] * (mu2 + cov21 / cov11 * (x1 - mu1))

        ## cov_l[0] += (cov11)*(total**2)
        ## cov_l[1] += (cov12)*(total**2)
        ## cov_l[2] += (cov21)*(total**2)
        ## cov_l[3] += (cov22)*(total**2)

    return mu_l, cov_l
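# The mu_l[1] update above is the standard bivariate Gaussian conditioning
# formula, mixed over hidden states:
#   E[x2 | x1, state j] = mu2 + (cov21 / cov11) * (x1 - mu1)
# weighted by alpha[n/nEmissionDim, j]. The four-dimensional predict2
# earlier chains the same idea, but its cov31/cov21 and cov41/cov31 ratios
# are flagged with TODOs in the source and do not follow from this formula.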
def test(self, model, ts_obj):
    # Find the Viterbi path
    final_ts_obj = ghmm.EmissionSequence(self.F, ts_obj.tolist())
    path_obj = model.viterbi(final_ts_obj)
    return path_obj
def _predict_next(self):
    """@todo: Docstring for _predict_next.
    :returns: @todo
    """
    a_init  = normalize_stoch_map(np.random.rand(self._n_hid, self._n_hid))
    b_init  = normalize_stoch_map(np.random.rand(self._n_hid, self._n_sym**2))
    pi_init = normalize_stoch_map(np.random.rand(self._n_hid))
    hmm = gh.HMMFromMatrices(self._alphab, gh.DiscreteDistribution(self._alphab),
                             a_init, b_init, pi_init)
    obs = gh.EmissionSequence(self._alphab, self._memory)
    hmm.baumWelch(obs)

    alpha = hmm.forward(obs)[0][-1]
    trans = hmm.asMatrices()[0]
    alpha = np.dot(alpha, trans)
    next_moves_dist = np.zeros(self._n_sym**2)
    for i in range(self._n_hid):
        next_moves_dist += np.asarray(hmm.getEmission(i)) * alpha[i]
    next_moves_dist = next_moves_dist[self._conversion_array]
    next_move = np.argmax(np.sum(next_moves_dist, axis=0))

    return np.where(self._rules[next_move] == -1)[0][0]
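# Note: the prediction step above is the same one-step-ahead identity used
# in forecast() earlier,
#   p(o_{t+1}) = sum_i [alpha_t . A]_i * B_i(o_{t+1}),
# evaluated over all n_sym**2 joint symbols before _conversion_array
# regroups them and the argmax picks the most probable next move.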
def test_forward_against_ghmm(self):
    from .util import ghmm_from_discrete_hmm
    import ghmm
    hmm = self.new_hmm(random_transitions=True, random_emissions=True)
    hmm_reference = ghmm_from_discrete_hmm(hmm)
    observation_size = 10
    observed = np.random.choice(range(self.nSymbols), size=observation_size).tolist()
    seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, observed)
    forward, scale = hmm.forward(observed)

    # remember that we have to convert stuff from ghmm to log scale
    forward_reference, scale_reference = map(np.array, hmm_reference.forward(seq))

    print "Forward reference (scaled):\n", forward_reference
    print "Scale reference: {}".format(scale_reference)
    print "Forward:\n", forward
    print "Scale:\n", scale
    assert np.allclose(forward, forward_reference)
    assert np.allclose(scale, scale_reference)
def conditional_prob(self, x):
    '''
    Input
    @ x: dim x length
    Output
    @ A list of conditional probabilities P(x_t|x_s,lambda)

    Only single sample works
    '''
    from scipy.stats import norm, entropy

    # logp from all features
    X_test = util.convert_sequence2(x, emission=False)
    X_test = np.squeeze(X_test)
    final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
    logp_all = self.ml.loglikelihood(final_ts_obj)

    # feature-wise conditional probability
    cond_prob = []
    for i in xrange(self.nEmissionDim):  # per feature
        # Build a reduced emission model with feature i removed: drop the
        # i-th mean component and the i-th row/column of each state's
        # covariance matrix. Deep copy is required here; a shallow copy
        # would let the edits below corrupt self.B.
        B = copy.deepcopy(self.B)
        for j in xrange(self.nState):
            B[j][0] = [b for idx, b in enumerate(B[j][0]) if idx != i]

            B_arr = copy.copy(B[j][1])
            B_arr = np.array(B_arr).reshape((self.nEmissionDim, self.nEmissionDim))
            B_arr = np.delete(B_arr, (i), axis=0)
            B_arr = np.delete(B_arr, (i), axis=1)
            B[j][1] = B_arr.flatten().tolist()

        ml_src = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F),
                                      self.A, B, self.pi)

        # logp from the remaining features
        X_test = util.convert_sequence2([x[j] for j in xrange(len(x)) if j != i],
                                        emission=False)
        X_test = np.squeeze(X_test)
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
        logp_src = ml_src.loglikelihood(final_ts_obj)

        cond_prob.append(logp_all - logp_src)

        if np.isnan(cond_prob[-1]) or np.isinf(cond_prob[-1]):
            print "NaN in conditional probabilities: ", np.shape(x)
            return None

    return np.array(cond_prob)
def _sequence_from_matrix(m):
    # Conversion happens as follows: data is an n x m matrix, where n is the
    # number of samples and m is the number of features per sample.
    # Multivariate data in ghmm is represented as a single list, where the
    # samples are unrolled. Hence the resulting data has the following
    # structure: [x_11, x_12, x_13, x_21, x_22, x_23, ...] where m = 3.
    # Source: http://sourceforge.net/p/ghmm/mailman/message/20578788/
    unrolled = m.ravel().tolist()
    seq = impl.EmissionSequence(impl.Float(), unrolled)
    return seq
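# A small sketch of the unrolling convention described above, with toy data
# (assumed, not from the source; it presumes numpy and the ghmm bindings
# are importable under the names the snippet uses).
import numpy as np
import ghmm as impl

m = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])
seq = _sequence_from_matrix(m)
# seq now wraps [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]: two samples of three
# features each, unrolled one after another.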
def test_backward_against_ghmm(self):
    from kerehmm.test.util import ghmm_from_gaussian_hmm
    import ghmm
    hmm = self.new_hmm(nDimensions=1, random_emissions=True, random_transitions=True,
                       lower_bounds=[0], upper_bounds=[10])
    hmm_reference = ghmm_from_gaussian_hmm(hmm)
    observation_size = 5
    observed = [np.random.randint(0, 10) for _ in range(self.nDimensions)
                for i in range(observation_size)]
    seq = ghmm.EmissionSequence(hmm_reference.emissionDomain,
                                np.array(observed).flatten().tolist())

    # remember that we have to convert stuff from ghmm to log scale
    _, scale_reference = map(np.array, hmm_reference.forward(seq))
    print "Scale reference", scale_reference

    # this is the reference backward array, untransformed (scaled)
    backward_reference = np.array(hmm_reference.backward(seq, scalingVector=scale_reference))
    print "Backward reference (scaled)", backward_reference

    # unscale the reference array:
    # get the product of scale_{t+1}, scale_{t+2}, ..., scale_T for each t
    coefficients = np.array([np.multiply.reduce(scale_reference[t + 1:])
                             for t, _ in enumerate(scale_reference)])
    print "Reference coefficients:", coefficients

    # multiply each backward_reference[t] by coefficients[t]
    backward_reference[:] = np.expand_dims(coefficients, axis=1) * backward_reference
    print "Backward reference (unscaled)", backward_reference

    # this is our backward array, log transformed
    backward = hmm.backward(observed)
    print "Backward", np.exp(backward)

    # test shape
    assert backward.shape == backward_reference.shape

    # test values
    backward_unscaled = np.exp(backward)
    assert np.allclose(backward_unscaled, backward_reference)
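# Identity being exercised above: if betahat is the scaled backward array
# and c_1, ..., c_T are the forward scaling coefficients, then
#   beta_t(i) = betahat_t(i) * prod_{s=t+1}^{T} c_s
# which is exactly what the coefficients array reconstructs before the
# element-wise comparison.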
def decodeHMM(m, hmmStates):
    '''
    Decode HMM. Replaces each chromosome's observation list in hmmStates
    with its Viterbi state path.
    '''
    print >> sys.stderr, printTime(), "Decode HMM for each chromosome."

    sigma = ghmm.IntegerRange(0, 2)
    for chrom in hmmStates:
        print >> sys.stderr, printTime(), chrom
        state, score = m.viterbi(ghmm.EmissionSequence(sigma, hmmStates[chrom]))
        hmmStates[chrom] = state

    print >> sys.stderr, printTime(), "Decode HMM finished."
    print >> sys.stderr
def predict(self, X):
    '''
    Input
    @ X: dimension x sample x length  # samples x known steps
    Output
    @ observation distribution: mu, var  # samples x 1 [list]
    '''
    # sample x some length
    X_test = util.convert_sequence(X, emission=False)

    mu_l  = []
    cov_l = []
    for i in xrange(len(X_test)):
        # Past profile
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[i].tolist())

        try:
            # alpha: X_test length x #latent states at the moment t when state i is ended
            #        test_profile_length x number_of_hidden_state
            (alpha, scale) = self.ml.forward(final_ts_obj)
            alpha = np.array(alpha)
            scale = np.array(scale)
        except:
            print "No alpha is available !!"
            sys.exit()

        f = lambda x: round(x, 12)
        for j in range(len(alpha)):
            alpha[j] = map(f, alpha[j])
        alpha[-1] = map(f, alpha[-1])

        n = len(X_test[i])
        t_mu  = np.zeros(self.nEmissionDim)
        t_cov = np.zeros(self.nEmissionDim * self.nEmissionDim)
        t_sum = 0.0
        for j in xrange(self.nState):  # N+1
            total = np.sum(self.A[:, j] * alpha[n / self.nEmissionDim - 1, :])  # * scaling_factor
            [mu, cov] = self.B[j]

            t_mu  += np.array(mu) * total
            t_cov += np.array(cov) * (total**2)
            t_sum += total

        mu_l.append(t_mu.tolist())
        cov_l.append(t_cov.tolist())

    return mu_l, cov_l
def predict(self, X):
    X = np.squeeze(X)
    X_test = X.tolist()

    mu_l  = np.zeros(3)
    cov_l = np.zeros(9)

    print self.F
    final_ts_obj = ghmm.EmissionSequence(self.F, X_test)  # is it necessary?

    try:
        # alpha: X_test length x #latent states at the moment t when state i is ended
        #        test_profile_length x number_of_hidden_state
        (alpha, scale) = self.ml.forward(final_ts_obj)
        alpha = np.array(alpha)
    except:
        print "No alpha is available !!"
        sys.exit()  # alpha is used below, so bail out as the sibling predict2 methods do

    f = lambda x: round(x, 12)
    for i in range(len(alpha)):
        alpha[i] = map(f, alpha[i])
    alpha[-1] = map(f, alpha[-1])

    n = len(X_test)
    pred_numerator = 0.0

    for j in xrange(self.nState):  # N+1
        total = np.sum(self.A[:, j] * alpha[n / self.nEmissionDim - 1, :])  # * scaling_factor
        [[mu1, mu2, mu3],
         [cov11, cov12, cov13,
          cov21, cov22, cov23,
          cov31, cov32, cov33]] = self.B[j]

        pred_numerator += total

        mu_l[0] += mu1 * total
        mu_l[1] += mu2 * total
        mu_l[2] += mu3 * total
        cov_l[0] += cov11 * (total**2)
        cov_l[1] += cov12 * (total**2)
        cov_l[2] += cov13 * (total**2)
        cov_l[3] += cov21 * (total**2)
        cov_l[4] += cov22 * (total**2)
        cov_l[5] += cov23 * (total**2)
        cov_l[6] += cov31 * (total**2)
        cov_l[7] += cov32 * (total**2)
        cov_l[8] += cov33 * (total**2)

    return mu_l, cov_l
def loglikelihood(self, X):
    X = np.squeeze(X)
    X_test = X.tolist()

    final_ts_obj = ghmm.EmissionSequence(self.F, X_test)

    try:
        p = self.ml.loglikelihood(final_ts_obj)
    except:
        if self.verbose:
            print 'Likelihood error!!!!'
        sys.exit()

    return p
def _removeOutliers(models, trainData, outliers):
    needTrain = False
    for i in range(len(models)):
        mean = 0
        variance = 0

        # Calculate the model mean
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            mean += abs(models[i].loglikelihood(eSeq))
        try:
            mean /= (len(trainData[i]) * 1.0)
        except:
            continue

        # Calculate the model variance
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            variance += (mean - v)**2
        variance /= (len(trainData[i]) * 1.0)
        std = variance**0.5

        # Move anything more than two standard deviations from the mean into
        # outliers. Iterate over a copy: removing from a list while iterating
        # over it skips elements.
        for tmp in list(trainData[i]):
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            if (v - mean) > (2 * std):
                trainData[i].remove(tmp)
                outliers.append(tmp)
                needTrain = True

    if needTrain:
        models = _trainModels(trainData, models)
    # The original ended without a return; returning the updated state
    # mirrors _includeOutliers above.
    return models, trainData, outliers
def computeLikelihood(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=1,
                      bPosterior=False, converted_X=False, cov_type='full'):
    '''
    This function will be deprecated. Please use computeLikelihoods.
    '''
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        if cov_type == 'diag' or cov_type.find('diag') >= 0:
            ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    if converted_X is False:
        X_test = util.convert_sequence(X, emission=False)
        X_test = np.squeeze(X_test)
        X_test = X_test.tolist()
    else:
        X_test = X

    l_idx        = []
    l_likelihood = []
    l_posterior  = []

    for i in xrange(startIdx, len(X_test) / nEmissionDim):
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim])

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior:
                post = np.array(ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            # Keep the time index, but substitute a floor likelihood and
            # carry the previous posterior forward.
            l_idx.append(i)
            l_likelihood.append(-100000000)
            if bPosterior:
                if len(l_posterior) == 0:
                    l_posterior.append(list(pi))
                else:
                    l_posterior.append(l_posterior[-1])
            continue

        l_idx.append(i)
        l_likelihood.append(logp)
        if bPosterior:
            l_posterior.append(post[i - 1])

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
def anomaly_check(self, X1, X2=None, X3=None, ths_mult=None):
    if self.nEmissionDim == 1:
        X_test = np.array([X1])
    else:
        X_test = self.convert_sequence(X1, X2, X3, emission=False)

    try:
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
        logp = self.ml.loglikelihood(final_ts_obj)
    except:
        print "Too different input profile that cannot be expressed by emission matrix"
        return -1, 0.0  # error

    try:
        post = np.array(self.ml.posterior(final_ts_obj))
    except:
        print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
        return 1.0, 0.0  # anomaly

    n = len(np.squeeze(X1))

    # Find the best posterior distribution
    min_dist  = 100000000
    min_index = 0
    print 'Loglikelihood', logp
    print 'Last posterior', post[n - 1]
    print 'Computing entropies'
    for j in xrange(self.nGaussian):
        dist = entropy(post[n - 1], self.l_statePosterior[j])
        print 'Index:', j, 'Entropy:', dist
        if min_dist > dist:
            min_index = j
            min_dist  = dist

    print 'Computing anomaly'
    print logp
    print self.ll_mu[min_index]
    print self.ll_std[min_index]

    if (type(ths_mult) == list or type(ths_mult) == np.ndarray or type(ths_mult) == tuple) and len(ths_mult) > 1:
        err = logp - (self.ll_mu[min_index] + ths_mult[min_index] * self.ll_std[min_index])
    else:
        err = logp - (self.ll_mu[min_index] + ths_mult * self.ll_std[min_index])

    print 'Error', err

    if err < 0.0:
        return 1.0, 0.0  # anomaly
    else:
        return 0.0, err  # normal
def trainHMM(hmmState):
    '''
    Train HMM with the given chromosome.
    '''
    print >> sys.stderr, printTime(), "Train HMM with one chromosome."

    T = [[0.9, 0.1], [0.1, 0.9]]
    e1 = [0.1, 0.9]
    e0 = [0.9, 0.1]
    E = [e0, e1]
    pi = [0.9, 0.1]  # initially assume 10% of positions are peaks

    sigma = ghmm.IntegerRange(0, 2)  # observation symbols: 0, 1
    m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), T, E, pi)
    m.baumWelch(ghmm.EmissionSequence(sigma, hmmState))

    print >> sys.stderr, printTime(), "Train HMM finished."
    print >> sys.stderr
    return m
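# Usage sketch chaining trainHMM() and decodeHMM() above, assuming
# hmmStates maps chromosome names to 0/1 observation lists (toy data,
# not from the source).
hmmStates = {'chr1': [0, 0, 1, 1, 1, 0, 0, 1, 0, 0]}
m = trainHMM(hmmStates['chr1'])
decodeHMM(m, hmmStates)  # replaces each list with its Viterbi state path in place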
def state_clustering(self, X1, X2, X3, X4):
    n, m = np.shape(X1)
    print n, m

    x = np.arange(0., float(m)) * (1. / 43.)
    state_mat      = np.zeros((self.nState, m * n))
    likelihood_mat = np.zeros((1, m * n))

    count = 0
    for i in xrange(n):
        for j in xrange(1, m):
            x_test1 = X1[i:i+1, :j]
            x_test2 = X2[i:i+1, :j]
            x_test3 = X3[i:i+1, :j]
            x_test4 = X4[i:i+1, :j]
            X_test = self.convert_sequence(x_test1, x_test2, x_test3, x_test4, emission=False)

            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
            post = self.ml.posterior(final_ts_obj)
            logp = self.ml.loglikelihood(final_ts_obj)

            state_mat[:, count] = np.array(post[j-1])
            likelihood_mat[0, count] = logp
            count += 1

    # k-means
    init_center = np.eye(self.nState, self.nState)
    self.km = KMeans(self.nState, init=init_center)
    idx_list = self.km.fit_predict(state_mat.transpose())

    # mean and variance of likelihoods
    l = []
    for i in xrange(self.nState):
        l.append([])

    for i, idx in enumerate(idx_list):
        l[idx].append(likelihood_mat[0][i])

    l_mean = []
    l_std  = []
    for i in xrange(self.nState):
        l_mean.append(np.mean(l[i]))
        l_std.append(np.std(l[i]))

    return l_mean, l_std
def viterbi(self, hmm, obj):
    hmm_ = self._get_hmm(hmm)
    if isinstance(obj, SequenceSet):
        obj = [array_flatten(s[:]) for s in obj]
        obj = ghmm.SequenceSet(DOMAIN, obj)
        res = hmm_.viterbi(obj)
        # ghmm returns a scalar even though a sequence set was passed
        # if length == 1, but we want an array
        if len(obj) == 1:
            res = [[res[0]], [res[1]]]
    else:
        obj = ghmm.EmissionSequence(DOMAIN, array_flatten(obj[:]))
        res = hmm_.viterbi(obj)
    return res
def dist(self, data):
    """dist(pattern)

    Calculate the distance between a piece of data and the model. Note the
    inverted sense of this distance: the closest match has the greatest
    value, and the value may be positive or negative.
    """
    eSeq = ghmm.EmissionSequence(ghmm.Float(), data)
    tmp = self.model.loglikelihood(eSeq)
    try:
        tmp / 1  # raises if the likelihood came back as a non-number
    except Exception, e:
        print "In exception"
        tmp = -1000
    return tmp  # the original ended without a return statement