def newModel(states, randomize=True, startAtFirstState=False,
             feedForward=True):
    """Build a new single-Gaussian ghmm model with ``states`` hidden states.

    states            -- number of hidden states.
    randomize         -- when true, fill the matrices before the constant
                         0.01 offset is added: every emission entry gets a
                         uniform random draw and the transition matrix gets
                         either random draws or, with feedForward, a 0/1
                         chain pattern.
    startAtFirstState -- when true, the initial vector is 1 for state 0 and
                         0 elsewhere; otherwise it is uniform.
    feedForward       -- when true (and randomize is true), only the
                         i -> i + 1 transitions are set to 1 and every other
                         transition to 0.

    Returns the object produced by ghmm.HMMFromMatrices.
    """
    initial = [1.0 / states] * states
    if startAtFirstState:
        initial = [1] + [0] * (states - 1)

    transitions = numpy.zeros((states, states), float)
    emissions = numpy.zeros((states, 2), float)

    if randomize:
        for src in range(states):
            for dst in range(states):
                # Draw unconditionally so the random stream is consumed the
                # same way whether or not the value ends up being used.
                draw = random.random()
                if not feedForward:
                    transitions[src][dst] = draw
                elif dst == src + 1:
                    transitions[src][dst] = 1
                else:
                    transitions[src][dst] = 0
            for col in range(2):
                emissions[src][col] = random.random()

    # Every entry is shifted by the same small constant.
    transitions += 0.01
    emissions += 0.01

    return ghmm.HMMFromMatrices(ghmm.Float(),
                                ghmm.GaussianDistribution(ghmm.Float()),
                                transitions, emissions, initial)
def _includeOutliers(models, trainData, outliers):
    """Move outliers that fit a model well enough back into its training data.

    For every model the mean and standard deviation of
    abs(loglikelihood) over its own training sequences are computed.  Each
    outlier is then scored against every model; the model with the smallest
    abs(loglikelihood) is its best fit.  When that score exceeds the best
    model's mean by less than one standard deviation, the outlier is
    appended to that model's training list and dropped from ``outliers``.

    models    -- sequence of objects exposing loglikelihood(sequence).
    trainData -- list of lists; trainData[i] holds the sequences of models[i].
    outliers  -- list of sequences currently excluded from training.

    Both lists are modified in place and also returned as
    (trainData, outliers).
    """
    domain = ghmm.Float()

    def _score(model, seq):
        return abs(model.loglikelihood(ghmm.EmissionSequence(domain, seq)))

    means = []
    stds = []
    for i, model in enumerate(models):
        # Score every sequence once and reuse the values for both moments.
        scores = [_score(model, seq) for seq in trainData[i]]
        if not scores:
            # An empty cluster has no statistics; zero mean and spread make
            # the acceptance test below fail for it instead of dividing by 0.
            means.append(0.0)
            stds.append(0.0)
            continue
        count = float(len(scores))
        mean = sum(scores) / count
        variance = sum((mean - s) ** 2 for s in scores) / count
        means.append(mean)
        stds.append(variance ** 0.5)

    # Collect the survivors separately: removing from ``outliers`` while
    # iterating over it would skip the element after each removal.
    remaining = []
    for seq in outliers:
        best = None
        bestModel = -1
        for j, model in enumerate(models):
            val = _score(model, seq)
            if best is None or val < best:
                best = val
                bestModel = j

        # Accept the outlier when its best fit is within one standard
        # deviation of that model's mean score.
        if best is not None and (best - means[bestModel]) < stds[bestModel]:
            trainData[bestModel].append(seq)
        else:
            remaining.append(seq)

    # Keep the caller's list object, only replace its contents.
    outliers[:] = remaining
    return trainData, outliers
def forecast(self, data, future=1):
    """Return the first component of a one-step-ahead emission forecast.

    The last row of the first item returned by self.model.forward for
    ``data`` is pushed through the transition matrix once, and the
    resulting state weights are combined with the emission matrix:

        ps1[j] = sum_i A[i][j] * ps[i]
        po1[k] = sum_j B[j][k] * ps1[j]

    Only po1[0] is returned.  ``future`` is accepted but is not used by
    this implementation.
    """
    transitions = self.model.asMatrices()[0]
    emissions = self.model.asMatrices()[1]
    n_states = len(transitions[0])
    n_params = len(emissions[0])

    sequence = ghmm.EmissionSequence(ghmm.Float(), data)
    current = self.model.forward(sequence)[0][-1]

    # Propagate the state weights one step through the transition matrix.
    next_state = [
        sum((transitions[i][j] * current[i] for i in range(len(current))), 0.0)
        for j in range(n_states)
    ]
    # Weight every emission-parameter column by the propagated state weights.
    projected = [
        sum((emissions[j][k] * next_state[j] for j in range(n_states)), 0.0)
        for k in range(n_params)
    ]
    return projected[0]
def __init__(self, preprocess_args, metric, graph_structure_type, A, B, pi,
             win_len, thresh, min_peak_dist):
    """Store the configuration and create the Gaussian emission objects.

    Args:
        preprocess_args: stored unchanged.
        metric: stored unchanged.
        graph_structure_type: "predefined", "fully", "left_to_right"
        A: initial hidden states graph
        B: initial hidden states distribution
        pi: initial hidden states probabilities
        win_len: windows lengths of the sliding window offline
        thresh: in the peak detection, detect peaks that are greater
            than thresh
        min_peak_dist: in the peak detection, detect peaks that are at
            least separated by minimum peak distance
    """
    # Pre-processing and comparison settings.
    self.preprocess_args = preprocess_args
    self.metric = metric

    # Initial model description.
    self.graph_structure_type = graph_structure_type
    self.A, self.B, self.pi = A, B, pi

    # Sliding-window and peak-detection settings.
    self.win_len = win_len
    self.thresh = thresh
    self.min_peak_dist = min_peak_dist

    # Float emission domain and the Gaussian distribution defined over it.
    domain = ghmm.Float()
    self.emission_domain = domain
    self.emission_distr = ghmm.GaussianDistribution(domain)
def __init__(self, nState, nEmissionDim=1, check_method='progress',
             anomaly_offset=0.0, cluster_type='time', verbose=False):
    """Record the model settings and reset all learned quantities to None.

    nState         -- the number of hidden states (also stored as nGaussian).
    nEmissionDim   -- stored unchanged.
    check_method   -- 'global' or 'progress'.
    anomaly_offset -- stored unchanged.
    cluster_type   -- stored unchanged.
    verbose        -- stored unchanged.
    """
    self.ml = None

    # Tunable parameters
    self.nState = nState
    self.nGaussian = nState
    self.nEmissionDim = nEmissionDim
    self.verbose = verbose

    # Un-tunable parameters
    self.trans_type = 'left_right'  # 'left_right' or 'full'
    self.check_method = check_method  # 'global' or 'progress'
    self.cluster_type = cluster_type
    self.anomaly_offset = anomaly_offset

    # Transition matrix, emission matrix and initial state probabilities.
    self.A = self.B = self.pi = None

    # Everything below starts out unset.
    self.l_statePosterior = None
    self.ll_mu = None
    self.ll_std = None
    self.l_mean_delta = None
    self.l_std_delta = None
    self.l_mu = None
    self.l_std = None
    self.std_coff = None

    # Emission domain of this model.
    self.F = ghmm.Float()
def __init__(self, Fmat_train, Fmat_test, categories, train_per_category,
             test_per_category):
    """Store the train/test data, the categories and the per-category counts.

    Every argument is stored unchanged; ``train_per_category`` and
    ``test_per_category`` are kept as ``train_trials_per_category`` and
    ``test_trials_per_category``.
    """
    self.F = ghmm.Float()  # emission domain of the HMM model
    self.Fmat_train, self.Fmat_test = Fmat_train, Fmat_test
    self.categories = categories
    self.train_trials_per_category = train_per_category
    self.test_trials_per_category = test_per_category
def __init__(self, nState, nFutureStep=5, nCurrentStep=10, nEmissionDim=3,
             check_method='progress'):
    """Record the model settings, reset learned quantities and announce it.

    nState       -- the number of hidden states (also stored as nGaussian).
    nFutureStep  -- stored unchanged.
    nCurrentStep -- stored unchanged.
    nEmissionDim -- stored unchanged.
    check_method -- 'global' or 'progress'.

    Prints one line naming the selected check method.
    """
    self.ml = None

    # Tunable parameters
    self.nState = nState
    self.nGaussian = nState
    self.nFutureStep = nFutureStep
    self.nCurrentStep = nCurrentStep
    self.nEmissionDim = nEmissionDim

    # Un-tunable parameters
    self.trans_type = 'left_right'  # 'left_right' or 'full'
    self.check_method = check_method  # 'global' or 'progress'

    # Transition matrix, emission matrix and initial state probabilities.
    self.A = self.B = self.pi = None

    # Everything below starts out unset.
    self.l_statePosterior = None
    self.ll_mu = None
    self.ll_std = None
    self.l_mean_delta = None
    self.l_std_delta = None
    self.l_mu = None
    self.l_std = None
    self.std_coff = None

    # Emission domain of this model.
    self.F = ghmm.Float()

    print('HMM initialized for %s' % (self.check_method,))
def __init__(self, aXData, nState, nMaxStep, nFutureStep=5, nCurrentStep=10,
             step_size_list=None, trans_type="left_right"):
    """Initialise the base class, store the settings and patch learning_base.

    aXData         -- passed to learning_base.__init__; its minimum and
                      maximum are stored as ``obsrv_range``.
    nState         -- the number of hidden states.
    nMaxStep       -- the length of profile.
    nFutureStep    -- stored unchanged.
    nCurrentStep   -- stored unchanged.
    step_size_list -- stored unchanged.
    trans_type     -- passed to learning_base.__init__.
    """
    learning_base.__init__(self, aXData, trans_type)

    # Tunable parameters
    self.nState = nState
    self.nFutureStep = nFutureStep
    self.nCurrentStep = nCurrentStep
    self.step_size_list = step_size_list

    # Un-tunable parameters
    self.nMaxStep = nMaxStep
    self.obsrv_range = [np.min(aXData), np.max(aXData)]
    self.A = None  # transition matrix
    self.B = None  # emission matrix

    # Emission domain of this model.
    self.F = ghmm.Float()

    # NOTE(review): this stores bound methods of *this* instance directly
    # in the learning_base class dictionary, so the entries are shared by
    # the class rather than being per-instance -- confirm this is intended.
    for method_name in ('fit', 'predict', 'score'):
        learning_base.__dict__[method_name] = getattr(self, method_name)
def get_hidden_markov_model(mixture_model, guess_t_matrix):
    """Build an (unoptimized) ghmm model from a mixture model.

    mixture_model  -- object exposing ``n_components``, ``means_`` and
                      ``covars_``; component j contributes the emission
                      entry [means_[j], covars_[j].flatten()].
    guess_t_matrix -- initial guess at the transition matrix.  A scipy CSR
                      matrix is densified first; the value is then turned
                      into nested lists with ``tolist()``.  The guess is
                      typically created by summing over the outer product
                      of time-pairs of membership vectors.

    The initial occupancy is uniform over the components.  Returns the
    object produced by ghmm.HMMFromMatrices with a multivariate Gaussian
    distribution over the float domain.
    """
    n_components = mixture_model.n_components

    # ghmm expects one [mean, flattened covariance] pair per state.
    emissions = []
    for j in range(n_components):
        emissions.append([mixture_model.means_[j],
                          mixture_model.covars_[j].flatten()])

    # Initial transition matrix as plain nested lists.
    transitions = guess_t_matrix
    if isinstance(transitions, scipy.sparse.csr.csr_matrix):
        transitions = transitions.todense()
    transitions = transitions.tolist()

    # Todo: figure out if initial occupancy matters
    initial_occupancy = [1.0 / n_components] * n_components

    domain = ghmm.Float()
    distribution = ghmm.MultivariateGaussianDistribution(domain)
    return ghmm.HMMFromMatrices(domain, distribution, transitions,
                                emissions, initial_occupancy)
def _sequence_from_matrix(m):
    """Convert an n x m sample matrix into an emission sequence.

    ``m`` has one row per sample and one column per feature.  Multivariate
    data in ghmm is represented as a single flat list in which the samples
    are unrolled one after another, e.g. for three features:
    [x_11, x_12, x_13, x_21, x_22, x_23, ...].
    Source: http://sourceforge.net/p/ghmm/mailman/message/20578788/
    """
    flat_values = m.ravel().tolist()
    domain = impl.Float()
    return impl.EmissionSequence(domain, flat_values)
def test_ghmm(self):
    """Exercise a two-state, three-dimensional multivariate Gaussian HMM.

    Builds the model from explicit matrices, samples from it, and then
    runs loglikelihood, viterbi and baumWelch on those samples, printing
    the intermediate results.
    """
    # this is being extended to also support mixtures of multivariate gaussians
    # Interpretation of B matrix for the multivariate gaussian case
    # (Example with three states and two mixture components with two dimensions):
    #  B = [
    #       [["mu111","mu112"],["sig1111","sig1112","sig1121","sig1122"],
    #        ["mu121","mu122"],["sig1211","sig1212","sig1221","sig1222"],
    #        ["w11","w12"] ],
    #       [["mu211","mu212"],["sig2111","sig2112","sig2121","sig2122"],
    #        ["mu221","mu222"],["sig2211","sig2212","sig2221","sig2222"],
    #        ["w21","w22"] ],
    #       [["mu311","mu312"],["sig3111","sig3112","sig3121","sig3122"],
    #        ["mu321","mu322"],["sig3211","sig3212","sig3221","sig3222"],
    #        ["w31","w32"] ],
    #      ]
    #
    # ["mu311","mu312"] is the mean vector of the two dimensional
    # gaussian in state 3, mixture component 1
    # ["sig1211","sig1212","sig1221","sig1222"] is the covariance
    # matrix of the two dimensional gaussian in state 1, mixture component 2
    # ["w21","w22"] are the weights of the mixture components
    # in state 2
    # For states with only one mixture component, a implicit weight
    # of 1.0 is assumed
    import ghmm

    F = ghmm.Float()
    Abig = [[0.0, 1.0], [1.0, 0.0]]
    Bbig = [[[1.0, 1.0, 1.0],
             [0.9, 0.4, 0.2, 0.4, 2.2, 0.5, 0.2, 0.5, 1.0]],
            [[10.0, 10.0, 10.0],
             [1.0, 0.2, 0.8, 0.2, 2.0, 0.6, 0.8, 0.6, 0.9]]]
    piBig = [0.5, 0.5]
    modelBig = ghmm.HMMFromMatrices(
        F, ghmm.MultivariateGaussianDistribution(F), Abig, Bbig, piBig)

    # Keep the samples: the calls below previously referred to the names
    # ``model``, ``seq`` and ``seq_set``, none of which this function defines.
    seq_set = modelBig.sample(10, 100, seed=3586662)
    seq = modelBig.sampleSingle(1)
    print([x for x in seq])

    # get log P(seq | model)
    logp = modelBig.loglikelihood(seq)
    print(logp)

    # calculate viterbi path
    path = modelBig.viterbi(seq)
    print(path)

    # train model parameters
    modelBig.baumWelch(seq_set, 500, 0.0001)
def _removeOutliers(models, trainData, outliers):
    """Move poorly fitting training sequences into ``outliers`` and retrain.

    For every model the mean and standard deviation of abs(loglikelihood)
    over its training sequences are computed.  A sequence whose score
    exceeds the mean by more than two standard deviations is taken out of
    trainData[i] and appended to ``outliers``.  When at least one sequence
    was moved, _trainModels(trainData, models) is called.

    models    -- list of objects exposing loglikelihood(sequence).
    trainData -- list of lists; trainData[i] holds the sequences of models[i].
    outliers  -- list receiving the removed sequences.

    ``trainData`` and ``outliers`` are modified in place; nothing is
    returned.
    """
    domain = ghmm.Float()
    needTrain = False

    for i, model in enumerate(models):
        cluster = trainData[i]
        if not cluster:
            # No data, no statistics: skip this model explicitly instead of
            # relying on a caught division by zero.
            continue

        # Score each sequence once and reuse the value for the mean, the
        # variance and the outlier decision.
        scores = [abs(model.loglikelihood(ghmm.EmissionSequence(domain, seq)))
                  for seq in cluster]
        count = float(len(scores))
        mean = sum(scores) / count
        variance = sum((mean - s) ** 2 for s in scores) / count
        std = variance ** 0.5

        # Build the surviving list separately: removing from ``cluster``
        # while iterating over it skips the element after each removal, and
        # list.remove() matches by equality rather than by position.
        kept = []
        for seq, score in zip(cluster, scores):
            if (score - mean) > (2 * std):
                outliers.append(seq)
                needTrain = True
            else:
                kept.append(seq)
        cluster[:] = kept

    if needTrain:
        # The result is not needed here; the caller's ``models`` object is
        # the one passed to _trainModels.
        _trainModels(trainData, models)
def finalize(self):
    """Finalize the edited model and wrap it in the matching ghmm class.

    Returns a ghmm.ContinuousMixtureHMM when the continuous flag is set in
    self.modeltype.  For a discrete model that is neither a
    transition-class nor a pair HMM, returns a ghmm.StateLabelHMM when the
    labeled-states flag is set and a ghmm.DiscreteEmissionHMM otherwise.
    Any other flag combination returns None.
    """
    cmodel = self.HMM.finalize()
    flags = self.modeltype

    if flags & ghmmwrapper.kContinuousHMM:
        return ghmm.ContinuousMixtureHMM(
            ghmm.Float(),
            ghmm.ContinuousMixtureDistribution(ghmm.Float()),
            cmodel)

    plain_discrete = ((flags & ghmmwrapper.kDiscreteHMM)
                      and not (flags & ghmmwrapper.kTransitionClasses)
                      and not (flags & ghmmwrapper.kPairHMM))
    if not plain_discrete:
        return None

    emission_domain = ghmm.Alphabet([], cmodel.alphabet)
    if flags & ghmmwrapper.kLabeledStates:
        labelDomain = ghmm.LabelDomain([], cmodel.label_alphabet)
        return ghmm.StateLabelHMM(
            emission_domain,
            ghmm.DiscreteDistribution(emission_domain),
            labelDomain,
            cmodel)

    return ghmm.DiscreteEmissionHMM(
        emission_domain,
        ghmm.DiscreteDistribution(emission_domain),
        cmodel)
def _trainModels(tdata, models):
    """Rebuild and train one model per entry of ``models``.

    For index i the sequences in tdata[i] are merged into one ghmm
    sequence set.  A fresh model is created by hmmsup.obsToModel from the
    element-wise mean of those sequences and the largest element-wise
    standard deviation, normalized, trained with baumWelch on the sequence
    set and stored in models[i]; its transition and initial matrices are
    then normalized through hmmsup.

    Note: this function is independent from the type of data split used.

    Returns ``models`` (the same object, updated in place).
    """
    for idx in range(len(models)):
        # Sequence set used for training, built from the observations.
        training_set = ghmm.SequenceSet(ghmm.Float(), [])
        for observation in tdata[idx]:
            training_set.merge(
                ghmm.EmissionSequence(ghmm.Float(), observation))

        # Seed model from the average sequence of this cluster.
        stacked = numpy.array(tdata[idx])
        average = stacked.mean(axis=0)
        spread = max(stacked.std(axis=0))
        seeded = hmmsup.obsToModel(average, spread)
        seeded.normalize()
        seeded.baumWelch(training_set)

        models[idx] = seeded
        hmmsup.normalizeAMat(models[idx])
        hmmsup.normalizePiMat(models[idx])
    return models
def ghmm_from_gaussian_hmm(hmm):
    """Create a ghmm Gaussian model from ``hmm``.

    Works on a deep copy of ``hmm``.  The transition matrix and initial
    probabilities are converted with ``tolist()``, and each emission
    distribution contributes the pair [float(mean), float(variance)].

    Returns the object produced by ghmm.HMMFromMatrices.
    """
    source = deepcopy(hmm)
    domain = ghmm.Float()
    transitions = source.transitionMatrix.tolist()
    initial = source.initialProbabilities.tolist()
    emissions = [[float(d.mean), float(d.variance)]
                 for d in source.emissionDistributions]
    gaussian = ghmm.GaussianDistribution(domain)
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=gaussian,
                                A=transitions,
                                B=emissions,
                                pi=initial)
def ghmm_from_multivariate_continuous_hmm(hmm):
    """Create a ghmm multivariate Gaussian model from ``hmm``.

    Works on a deep copy of ``hmm``.  The transition matrix and initial
    probabilities are converted with ``tolist()``, and each emission
    distribution contributes [mean.tolist(), variance.flatten().tolist()].

    Returns the object produced by ghmm.HMMFromMatrices.
    """
    source = deepcopy(hmm)
    domain = ghmm.Float()
    transitions = source.transitionMatrix.tolist()
    initial = source.initialProbabilities.tolist()

    emissions = []
    for d in source.emissionDistributions:
        emissions.append([d.mean.tolist(), d.variance.flatten().tolist()])

    multivariate = ghmm.MultivariateGaussianDistribution(domain)
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=multivariate,
                                A=transitions,
                                B=emissions,
                                pi=initial)
def _new_model(n_features, n_states, means, covars, topology):
    """Create a multivariate Gaussian model for the given topology.

    State i gets the emission entry
    [means[i].tolist(), covars[i].ravel().tolist()].  The transition matrix
    and start probabilities come from transition_matrix() and
    start_probabilities() for ``n_states`` and ``topology``.

    ``n_features`` is accepted but not used here.
    """
    emissions = [[means[i].tolist(), covars[i].ravel().tolist()]
                 for i in range(n_states)]

    domain = impl.Float()
    transitions = transition_matrix(n_states, topology).tolist()
    pi = start_probabilities(n_states, topology)
    distribution = impl.MultivariateGaussianDistribution(domain)
    return impl.HMMFromMatrices(domain, distribution, transitions,
                                emissions, pi)
def obsToModel(observation, std=0.1):
    """Make a normalized model with one state per element of ``observation``.

    State i gets the emission pair (observation[i], std).  Transition
    entries are random values scaled by 0.3, except i -> i + 1, which is
    set to 0.9.  The initial vector is 1.0 for state 0 and 0.05 for every
    other state.  The model is normalized before it is returned.
    """
    n_states = len(observation)
    aMat = numpy.zeros((n_states, n_states), float)
    bMat = numpy.zeros((n_states, 2), float)

    pi = [0.05] * n_states
    pi[0] = 1.0

    for row in range(n_states):
        bMat[row][0] = observation[row]
        bMat[row][1] = std
        for col in range(n_states):
            # Draw for every entry so the random stream advances uniformly,
            # then override the forward transition.
            noise = random.random() * 0.3
            aMat[row][col] = 0.9 if col == row + 1 else noise

    model = ghmm.HMMFromMatrices(ghmm.Float(),
                                 ghmm.GaussianDistribution(ghmm.Float()),
                                 aMat, bMat, pi)
    model.normalize()
    return model
def dist(self, data):
    """dist(pattern)
    Calculate the distance between a piece of data and the model.

    The value is the model's log-likelihood of ``data``, so the closest
    match has the GREATEST value, and the value can be greater than or
    less than zero.

    Returns the log-likelihood, or -1000 when the value returned by the
    model cannot be used in arithmetic.
    """
    eSeq = ghmm.EmissionSequence(ghmm.Float(), data)
    score = self.model.loglikelihood(eSeq)
    try:
        # Probe that the result behaves like a number.
        score / 1
    except Exception:
        print("In exception")
        score = -1000
    # Hand the score back; without this the computed value is discarded.
    return score
def hmmDist(pattern, model):
    """hmmDist(pattern, model)
    Calculate the distance between a single pattern and the given model.

    The value is the model's log-likelihood of ``pattern``, so this is an
    odd distance metric: the greater the number, the closer the two
    elements are together.

    Returns the log-likelihood, or -1000 when the value returned by the
    model cannot be used in arithmetic.
    """
    eSeq = ghmm.EmissionSequence(ghmm.Float(), pattern)
    score = model.loglikelihood(eSeq)
    try:
        # Probe that the result behaves like a number.
        score / 1
    except Exception:
        print("In exception")
        score = -1000
    # Hand the score back; without this the computed value is discarded.
    return score
def _randomModels(k, states):
    """Make a list of k untrained models with random matrix entries.

    Each model has ``states`` hidden states; every transition entry and
    both emission parameters of every state are uniform random draws.  The
    initial vector holds 0.1 for every state.
    """
    domain = ghmm.Float()
    # NOTE(review): these entries only sum to 1 when states == 10 --
    # confirm whether that is relied upon.
    pi = [0.1] * states
    aMat = numpy.zeros((states, states), float)
    bMat = numpy.zeros((states, 2), float)
    #TODO Change above for multivariate Gaussians

    generated = []
    for _ in range(k):
        for row in range(states):
            # Same draw order as filling entry by entry: the whole
            # transition row first, then the two emission parameters.
            aMat[row] = [random.random() for _col in range(states)]
            bMat[row] = [random.random(), random.random()]
        generated.append(
            ghmm.HMMFromMatrices(domain, ghmm.GaussianDistribution(domain),
                                 aMat, bMat, pi))
    return generated
def __init__(self, nState=10, nEmissionDim=4, verbose=False):
    '''
    Set up the model settings; no model is built here.

    This class follows the policy of sklearn as much as possible.
    TODO: score function. NEED TO THINK WHAT WILL BE CRITERIA.

    nState       -- the number of hidden states.
    nEmissionDim -- stored unchanged.
    verbose      -- stored unchanged.
    '''
    # Parent class that provides sklearn related interfaces.
    learning_base.__init__(self)

    self.ml = None
    self.verbose = verbose

    # Tunable parameters
    self.nState = nState
    self.nEmissionDim = nEmissionDim

    # Un-tunable parameters
    self.trans_type = 'left_right'  # 'left_right' or 'full'
    # Transition matrix, emission matrix and initial state probabilities.
    self.A = self.B = self.pi = None

    # Emission domain of this model.
    self.F = ghmm.Float()
def __init__(self, Fmat_train, Fmat_test, categories):
    """Store the train/test data and derive the per-category trial counts.

    Each count is the size of the data along axis 0 divided by the number
    of elements in ``categories``.  ``categories`` itself is not stored.
    """
    self.F = ghmm.Float()  # emission domain of the HMM model
    self.Fmat_train, self.Fmat_test = Fmat_train, Fmat_test

    n_train = np.size(Fmat_train, 0)
    self.train_trials_per_category = n_train / np.size(categories)
    n_test = np.size(Fmat_test, 0)
    self.test_trials_per_category = n_test / np.size(categories)
def _kMeans(data, k, states, iterations=20, stopThreshold=0.01,
            rOutliers=True, printBest=True, verbose=True,
            iType="kmeans++"):
    """Cluster ``data`` into k groups, each described by a Gaussian HMM.

    data          -- the sequences to cluster.
    k             -- number of clusters / models.
    states        -- number of hidden states per model.
    iterations    -- number of train/assign rounds.
    stopThreshold -- minimum score improvement for a round to continue
                     from the current assignment; otherwise the clustering
                     is restarted from a fresh initialisation.
    rOutliers     -- when true, poorly fitting sequences are moved out of
                     the training data each round and re-tested against the
                     best models at the end.
    printBest     -- print the final scores.
    verbose       -- print per-round progress.
    iType         -- "random" or "kmeans++" initialisation.

    Returns (models, data, outliers) for the best-scoring round, with each
    model wrapped in pybb.model.hmm.Hmm.
    """
    bestScore = -100
    bestModels = None
    bestData = None
    # Bound up front so the name exists even when no round is run.
    bestOutliers = []
    oldScore = -100
    models = []

    tdata = _randomAssign(data, k)
    if iType == "random":
        models = _randomModels(k, states)
        models = _trainModels(tdata, models)
    if iType == "kmeans++":
        models = _initializeGoodModels(data, k, states)
        tdata = _optimalAssign(tdata, models)

    outliers = []
    for i in range(iterations):
        models = _trainModels(tdata, models)
        score = _fitness(tdata, models)
        if verbose:
            print(" " + str(i) + ": " + str(score))

        if (score > bestScore) or (bestScore == -100):
            # Snapshot the best round: rebuild every model from its
            # matrices and copy the lists so later rounds cannot alter it.
            bestScore = score
            bestModels = [ghmm.HMMFromMatrices(
                              ghmm.Float(),
                              ghmm.GaussianDistribution(ghmm.Float()),
                              m.asMatrices()[0],
                              m.asMatrices()[1],
                              m.asMatrices()[2])
                          for m in models]
            bestData = [list(v) for v in tdata]
            bestOutliers = list(outliers)

        if (oldScore == -100) or (score - oldScore) > stopThreshold:
            tdata = _optimalAssign(tdata, models)
            oldScore = score
            if rOutliers:
                # _removeOutliers takes the outlier list as its third
                # argument and appends the removed sequences to it.
                _removeOutliers(models, tdata, outliers)
        else:
            if verbose:
                print("Resetting all")
            tdata = _randomAssign(data, k)
            # The restart assigns the full data set again, so sequences
            # recorded as outliers are back in tdata; forget them to avoid
            # listing them in both places.
            del outliers[:]
            if iType == "random":
                models = _randomModels(k, states)
                models = _trainModels(tdata, models)
            if iType == "kmeans++":
                models = _initializeGoodModels(data, k, states)
                tdata = _optimalAssign(tdata, models)
            oldScore = -100

    if printBest or verbose:
        print("Average inter-cluster distance:" + str(bestScore))

    if rOutliers:
        if verbose:
            print("Number outliers found:" + str(len(bestOutliers)))

        #For the best set of models and train data try to include any outliers
        #again.  Then return the models, data and outliers.
        bestData, bestOutliers = _includeOutliers(bestModels, bestData,
                                                  bestOutliers)
        bestModels = _trainModels(bestData, bestModels)
        bestData = _optimalAssign(bestData, bestModels)
        score = _fitness(bestData, bestModels)
        if printBest or verbose:
            print("Score with additional outliers:" + str(score))
        if verbose:
            print("New number of outliers:" + str(len(bestOutliers)))

    import pybb.model.hmm
    bm = []
    for m in bestModels:
        bm.append(pybb.model.hmm.Hmm(m))
    return bm, bestData, bestOutliers
del c[3] # delete transition from state 1 to state 2 del c[(1, 2)] hmm = c.finalize() print c c = HMMEditingContext("test.xml") hmm = c.finalize() print "\n###############################\n" print c #print hmm.verbose_str() print "\n###############################\n" # CONTINOUS Model c = HMMEditingContext(ghmm.Float()) c = HMMEditingContext(GaussianDistribution) c.addState((0.0, 1.0)) c.addState((0.1, 0.3), initial=0.3) c.addState() # uniform c.addTransition('1', '2') c.addTransition('3', '2', p=0.1) print c c.finalize() print c c = HMMEditingContext(ghmm.Float()) c.addState(GaussianDistribution(0.0, 1.0))
def loglikelihood(self, data):
    """Return self.model's log-likelihood of ``data``.

    ``data`` is wrapped in a ghmm emission sequence over the float domain
    before it is passed to the model.
    """
    sequence = ghmm.EmissionSequence(ghmm.Float(), data)
    score = self.model.loglikelihood(sequence)
    return score
def F(self):
    """Return a new ghmm.Float() emission domain on every call."""
    domain = ghmm.Float()
    return domain
# normalize so that the sum of probabilities # always equals 1.0 for j in range(n_states): trans_mat[i][j] /= sum_ assert_almost_equals(sum(trans_mat[i]), 1.0) hmm.pi = pi hmm.A = trans_mat hmm.B = opdfs try: import ghmm DOMAIN = ghmm.Float() class _GhmmBase(object): def _get_hmm(self, hmm): return ghmm.HMMFromMatrices( DOMAIN, ghmm.MultivariateGaussianDistribution(DOMAIN), hmm.A, hmm.B, hmm.pi) class GhmmBaumWelchTrainer(_GhmmBase): def train(self, hmm, sset): sset = [array_flatten(s) for s in sset] hmm_ = self._get_hmm(hmm) hmm_.baumWelch(ghmm.SequenceSet(DOMAIN, sset)) hmm.A, hmm.B, hmm.pi = hmm_.asMatrices() class GhmmViterbiCalculator(_GhmmBase):
###------------------------------------------------ # output files output_all = open(prefix + 'all_tails.txt', 'w') # tail lengths of all tags output_states = open(prefix + 'hmm_states.txt', 'w') # HMM states of all tags output_median = open(prefix + 'median_tails_tags.txt', 'w') # median tail lengths, aggregated by genes output_mean = open(prefix + 'mean_tails_tags.txt', 'w') # mean tail lengths, aggregated by genes out_hmm = prefix + 'HMM_model.txt' # HMM model ###------------------------------------------------ # initializes a gaussian hidden markov model and defines # the tranisition, emission, and starting probabilities print('\nTraining data with hmm...' + timer()) F = ghmm.Float() pi = [1.0, 0.0, 0.0, 0.0, 0.0] # initial state if params['allow_back'] == True: # The following matrix allows T states going back to non=T states. Transitionmatrix = [[0.04, 0.93, 0.02, 0.01, 0.0], [0.0, 0.87, 0.1, 0.02, 0.01], [0.0, 0.05, 0.6, 0.3, 0.05], [0.0, 0.01, 0.3, 0.6, 0.09], [0.0, 0.01, 0.01, 0.1, 0.88]] else: # The following matrix does not allow states going backwards. Transitionmatrix = [[0.04, 0.93, 0.02, 0.01, 0.0], [0.0, 0.94, 0.03, 0.02, 0.01], [0.0, 0.0, 0.5, 0.4, 0.1], [0.0, 0.0, 0.0, 0.6, 0.4],
def _sequence_set_from_list(l):
    """Convert a list of sample matrices into a sequence set.

    Every matrix in ``l`` is unrolled into one flat list with
    ``ravel().tolist()``; the resulting list of flat lists is wrapped in a
    sequence set over the float domain.
    """
    flattened = []
    for matrix in l:
        flattened.append(matrix.ravel().tolist())
    domain = impl.Float()
    return impl.SequenceSet(domain, flattened)