def train(self, model, flag):
    # Fmat_train stacks the training sequences for all categories
    # contiguously and in equal numbers, so each class owns one slice.
    index = np.size(self.Fmat_train, 0) / np.size(categories)
    if flag == 'Missed':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[0:index])
    elif flag == 'Good':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[index:2 * index])
    elif flag == 'High':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[2 * index:3 * index])
    elif flag == 'Caught':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[3 * index:4 * index])
    model.baumWelch(final_ts)
    return model
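# The slice arithmetic above in isolation (a runnable Python 2 sketch with
# assumed sizes): 8 stacked sequences over 4 categories give index = 2, so
# 'Good' trains on rows 2:4.
categories = ['Missed', 'Good', 'High', 'Caught']
n_total = 8
index = n_total / len(categories)   # integer division in Python 2 -> 2
print range(index, 2 * index)       # rows used for 'Good' -> [2, 3]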
def train(self, training_file, epsilon=0.0001, max_iter=500):
    """
    Train the TFFM using the fasta sequences to learn emission and
    transition probabilities.

    :note: The training of the underlying HMM uses the Baum-Welch
        algorithm.

    :arg training_file: The fasta file of the sequences to train the TFFM
        on.
    :type training_file: str
    :arg epsilon: The least relative improvement cut-off in likelihood
        compared to the previous iteration of the Baum-Welch algorithm
        (default: 0.0001).
    :type epsilon: float
    :arg max_iter: The maximum number of iterations of the Baum-Welch
        algorithm to re-estimate the probabilities (default: 500).
    :type max_iter: int
    """
    assert os.path.isfile(training_file)
    # Only upper case is allowed in the ALPHABET, need to convert.
    sequences = []
    for record in SeqIO.parse(training_file, "fasta"):
        sequence = record.seq.upper()
        # Only consider sequences made of ACGTs.
        if not re.search("[^ACGT]", str(sequence)):
            sequences.append(sequence)
    training_sequences = ghmm.SequenceSet(ghmm.Alphabet(ALPHABET), sequences)
    # Give the same weight to all the sequences since it does not seem to
    # be done by default by ghmm.
    utils.set_sequences_weight(training_sequences, 1.0)
    self.baumWelch(training_sequences, max_iter, epsilon)
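# The same SequenceSet construction outside the TFFM class (a minimal
# sketch assuming a plain DNA alphabet; ALPHABET and
# utils.set_sequences_weight above are specific to that code base).
import ghmm

dna = ghmm.Alphabet(['A', 'C', 'G', 'T'])
sequences = ['ACGTACGT', 'CCGTTAGC']    # already filtered to ACGT-only
training_set = ghmm.SequenceSet(dna, [list(s) for s in sequences])
# model.baumWelch(training_set, max_iter, epsilon) would then re-estimate
# the transition and emission probabilities in place.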
def _get_posterior_proba(self, sequence_split):
    """
    Get the posterior probabilities at each nucleotide position given the
    TFFM.

    :arg sequence_split: The sequence split into subsequences so that
        non-ACGT nucleotides are not considered.
    :type sequence_split: list

    :returns: The posterior probabilities at each position of the
        sequence.
    :rtype: list of list

    :note: One example of a sequence_split is ["ACT", "N", "ATC"].
    """
    ghmm_extended_alphabet = ghmm.Alphabet(EXTENDED_ALPHABET)
    posterior_proba = []
    # Null probabilities for non-ACGT nucleotides.
    null_proba = [0.] * self.N
    for sequence in sequence_split:
        if re.match("[ACGT]", sequence):
            emission_sequence = ghmm.SequenceSet(ghmm_extended_alphabet,
                                                 [sequence])[0]
            posterior_proba.extend(self.posterior(emission_sequence))
        else:
            for __ in xrange(len(sequence)):
                posterior_proba.append(null_proba)
    return posterior_proba
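# How a sequence_split like ["ACT", "N", "ATC"] can be produced from a raw
# sequence: a sketch using re.findall to alternate ACGT runs and non-ACGT
# runs (an assumption; the actual splitting code is not shown here).
import re

raw = 'ACTNATC'
sequence_split = re.findall('[ACGT]+|[^ACGT]+', raw)
print sequence_split   # ['ACT', 'N', 'ATC']; non-ACGT runs get null_proba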
def __gestures_markov_sequences_from_demos(self, gesture):
    emission_sequences = []
    for demo_index in xrange(gesture.demonstration_count):
        training_data = gesture.get_training_data(demo_index=demo_index)
        seq = self.__get_sequence_values_from_training_data(training_data)
        emission_sequences.append(seq)
    return ghmm.SequenceSet(self.F, emission_sequences)
def _create_sequence_set(self, qsr_seq, symbols):
    """Create a sequence set for training.

    :param qsr_seq: the observation sequence of symbols according to the
        alphabet, as a list of lists
    :param symbols: the alphabet of possible symbols

    :return: the sequence set for the given observations
    """
    return gh.SequenceSet(symbols, qsr_seq)
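# The helper in isolation (a sketch with assumed inputs): symbols is a
# ghmm.Alphabet and qsr_seq is a list of observation lists over it.
import ghmm as gh

symbols = gh.Alphabet(['0', '1', '2'])
qsr_seq = [['0', '1', '1'], ['2', '0']]
print len(gh.SequenceSet(symbols, qsr_seq))   # 2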
def trainModel(self, trainingData):
    # Expecting training data in the form of a list of malware signatures.
    if len(trainingData) < 1:
        prettyPrint("Empty training set provided", "error")
        return False
    # Now use the Baum-Welch algorithm.
    self.ghmmModel.baumWelch(
        ghmm.SequenceSet(self.ghmmModel.emissionDomain, trainingData))
    return True
def fit(self, X_train, A=None, B=None, pi=None, B_dict=None, verbose=False):
    if A is None:
        if verbose: print "Generate new A matrix"
        # Transition probability matrix (initial transition probability)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if verbose: print "Generate new B matrix"
        # We should think about a multivariate Gaussian pdf.
        self.mu, self.sig = self.vectors_to_mean_sigma(X_train, self.nState)
        # Emission probability matrix; must be [i, :] = [mu, sig]
        B = np.hstack([self.mu, self.sig]).tolist()

    if pi is None:
        # pi - initial probabilities per state
        ## pi = [1.0/float(self.nState)] * self.nState
        pi = [0.] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F),
                                   A, B, pi)

    train_seq = X_train.tolist()
    final_seq = ghmm.SequenceSet(self.F, train_seq)
    self.ml.baumWelch(final_seq, 10000)

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    # State range
    self.state_range = np.arange(0, self.nState, 1)

    # Pre-computation of PHMM variables
    self.mu_z = np.zeros((self.nState))
    self.mu_z2 = np.zeros((self.nState))
    self.mu_z3 = np.zeros((self.nState))
    self.var_z = np.zeros((self.nState))
    self.sig_z3 = np.zeros((self.nState))
    for i in xrange(self.nState):
        zp = self.A[i, :] * self.state_range
        self.mu_z[i] = np.sum(zp)
        self.mu_z2[i] = self.mu_z[i]**2
        # self.mu_z3[i] = self.mu_z[i]**3
        self.var_z[i] = np.sum(zp * self.state_range) - self.mu_z[i]**2
def toSequenceSetBlocks(self, peptideList):
    '''
    Converts a list of peptides given as strings to GHMM format.

    As GHMM has a limitation of at most 1,500,000 sequences per sequence
    set, longer peptide lists are split into blocks. The block size is
    configurable via self.sequencesPerBlock to facilitate testing.

    Returns a list of sequence sets, preserving the original order of the
    peptides.
    '''
    # Split into blocks.
    lenPeptides = len(peptideList)
    numFullBlocks = lenPeptides // self.sequencesPerBlock
    lenLastBlock = lenPeptides % self.sequencesPerBlock
    sequenceSets = []
    # Full blocks (if any).
    for i in range(0, numFullBlocks):
        rangeStart = i * self.sequencesPerBlock
        rangeEnd = rangeStart + self.sequencesPerBlock
        sequenceBlock = ghmm.SequenceSet(
            self.alphabet,
            [list(p) for p in peptideList[rangeStart:rangeEnd]])
        sequenceSets.append(sequenceBlock)
    # The last partial block (if any).
    rangeStart = numFullBlocks * self.sequencesPerBlock
    rangeEnd = lenPeptides
    if lenLastBlock > 0:
        sequenceBlock = ghmm.SequenceSet(
            self.alphabet,
            [list(p) for p in peptideList[rangeStart:rangeEnd]])
        sequenceSets.append(sequenceBlock)
    return sequenceSets
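# The blocking arithmetic above, as a small runnable illustration (pure
# Python, no ghmm needed): 7 items with a block size of 3 give blocks of
# sizes 3, 3 and 1, preserving order.
items = list('ABCDEFG')
block_size = 3
blocks = [items[i:i + block_size] for i in xrange(0, len(items), block_size)]
assert blocks == [['A', 'B', 'C'], ['D', 'E', 'F'], ['G']]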
def viterbi(self, hmm, obj):
    hmm_ = self._get_hmm(hmm)
    if isinstance(obj, SequenceSet):
        obj = [array_flatten(s[:]) for s in obj]
        obj = ghmm.SequenceSet(DOMAIN, obj)
        res = hmm_.viterbi(obj)
        # ghmm returns a scalar even though a sequence set was passed
        # if length == 1, but we want an array.
        if len(obj) == 1:
            res = [[res[0]], [res[1]]]
    else:
        obj = ghmm.EmissionSequence(DOMAIN, array_flatten(obj[:]))
        res = hmm_.viterbi(obj)
    return res
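# A minimal discrete illustration of the quirk handled above (assuming the
# stock ghmm behaviour the snippet's own comment describes): viterbi() on
# a one-element SequenceSet returns a single (path, logprob) pair instead
# of parallel lists.
import ghmm

sigma = ghmm.IntegerRange(0, 2)
A = [[0.9, 0.1], [0.1, 0.9]]
B = [[0.8, 0.2], [0.2, 0.8]]
pi = [0.5, 0.5]
m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)
one = ghmm.SequenceSet(sigma, [[0, 0, 1]])
two = ghmm.SequenceSet(sigma, [[0, 0, 1], [1, 1, 0]])
print m.viterbi(one)   # (path, logp) -- the scalar pair being wrapped
print m.viterbi(two)   # ([path0, path1], [logp0, logp1])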
def train(self, model, flag, accum_idx):
    # accum_idx holds the cumulative end indices of the four experiment
    # classes in Fmat_train; pick the slice matching the requested flag.
    input_data = []
    start = 0
    end = 0
    if flag == exp_list[0]:
        start = 0
        end = accum_idx[0]
    elif flag == exp_list[1]:
        start = accum_idx[0]
        end = accum_idx[1]
    elif flag == exp_list[2]:
        start = accum_idx[1]
        end = accum_idx[2]
    elif flag == exp_list[3]:
        start = accum_idx[2]
        end = accum_idx[3]

    for i in range(start, end):
        input_data.append(self.Fmat_train[i].flatten().tolist())

    final_ts = ghmm.SequenceSet(self.F, input_data)
    # final_ts = model.sample(10, 50)
    print final_ts  # print the input data
    model.baumWelch(final_ts)
    if math.isnan(model.getInitial(0)):
        # The original crashed deliberately here with a bare `abc`
        # (NameError); raising an explicit error is equivalent but
        # idiomatic.
        raise ValueError('model has nan')
    return model
def train(self, X):
    """Uses GHMM's implementation of Baum-Welch to train an HMM."""
    try:
        if len(X) < 1:
            prettyPrint("Empty training set provided", "warning")
            return False
        # Now use the Baum-Welch algorithm.
        self.ghmmModel.baumWelch(
            ghmm.SequenceSet(self.ghmmModel.emissionDomain, X))
        self.isTrained = True
        if verboseON():
            print "Trained model: %s" % self.ghmmModel
    except Exception as e:
        prettyPrintError(e)
        return False
    return True
def test_viterbi_against_hmm(self):
    from kerehmm.test.util import ghmm_from_discrete_hmm
    import ghmm
    hmm = self.new_hmm()
    hmm.setup_strict_left_to_right(set_emissions=True)
    domain = ghmm.Alphabet(range(hmm.alphabetSize))
    hmm_reference = ghmm_from_discrete_hmm(hmm)
    seq = list(range(self.nSymbols))
    print "True path and emission: {}".format(seq)
    true_path = seq
    reference_path, reference_prob = hmm_reference.viterbi(
        ghmm.SequenceSet(domain, [seq]))
    path, prob = hmm.viterbi_path(seq)
    print "Reference path: {}".format(reference_path)
    print "Calculated path: {}".format(path)
    print "Reference prob: {}, Calculated prob: {}".format(reference_prob,
                                                           prob)
    assert np.all(np.equal(true_path, reference_path))
    assert np.all(np.equal(true_path, path))
    assert np.isclose(prob, reference_prob)
def perform_optimization(hidden_mm, trajs, lag_time, sliding_window=True):
    """Optimize a hidden Markov model given a list of trajectories.

    Use the Baum-Welch algorithm to learn the transition matrix, keeping
    the emission probabilities fixed.
    """
    # Domain for our multivariate Gaussians.
    domain = hidden_mm.emissionDomain

    # Do sliding window.
    if sliding_window:
        # A naive way of doing this is by making many trajectories.
        slides = xrange(lag_time)
        lagged_trajs = list()
        for i in xrange(len(trajs)):
            traj = trajs[i]
            for slide in slides:
                lagged_trajs.append(traj[slide::lag_time])
    else:
        lagged_trajs = [t[::lag_time] for t in trajs]

    # Prepare the trajectories by flattening them to 1D.
    prepared_trajs = [t.flatten().tolist() for t in lagged_trajs]

    # Build the C-style sequence object manually. The sequence count must
    # match prepared_trajs, which is larger than trajs when the sliding
    # window is used (the original passed len(trajs) here).
    (seq_c, lengths) = ghmmhelper.list2double_matrix(prepared_trajs)
    lengths_c = ghmmwrapper.list2int_array(lengths)
    cseq = ghmmwrapper.ghmm_cseq(seq_c, lengths_c, len(prepared_trajs))

    # Make a SequenceSet wrapper around the C-style object.
    train_seq = ghmm.SequenceSet(domain, cseq)

    # Perform the Baum-Welch optimization.
    likelihood = hidden_mm.baumWelch(train_seq, nrSteps=10000000,
                                     loglikelihoodCutoff=1.0e-5)
    print "Final Baum-Welch likelihood: {}".format(likelihood)
    return likelihood, hidden_mm
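# For data sets that fit comfortably in memory, the manual C-level
# construction above is not required; the plain constructor builds an
# equivalent SequenceSet (a sketch, assuming 1D float trajectories).
import ghmm

prepared = [[0.1, 0.2, 0.3], [0.4, 0.5]]
train_seq = ghmm.SequenceSet(ghmm.Float(), prepared)
print len(train_seq)   # 2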
def partial_fit(self, xData, learningRate=0.2, nrSteps=1, max_iter=100):
    '''Online update of the HMM using the online Baum-Welch algorithm.'''
    X = [np.array(data) for data in xData]
    nData = len(X[0])

    # Create training data.
    X_train = util.convert_sequence(X)  # training input
    X_train = X_train.tolist()
    if self.verbose:
        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)

    if learningRate < 1e-5:
        learningRate = 1e-5

    final_seq = ghmm.SequenceSet(self.F, X_train)
    for i in xrange(max_iter):
        ret = self.ml.baumWelch(final_seq, nrSteps=nrSteps,
                                learningRate=learningRate)
        if np.isnan(ret):
            print 'Baum Welch return:', ret
            return 'Failure'
        if i > 0:
            if abs(last_ret - ret) < 1.0:
                print "Partial fitting converged to ", ret, " from ", last_ret
                break
        last_ret = ret

    print 'Baum Welch return:', ret / float(nData)
    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)
    return ret
def _trainModels(tdata, models):
    """Train models using every data element designated by the _assign
    functions.

    Note: this function is independent of the type of data split used.
    """
    for i in range(len(models)):
        # Create a sequence set for training from the multiple observations.
        seqSet = ghmm.SequenceSet(ghmm.Float(), [])
        for tmpData in tdata[i]:
            seqSet.merge(ghmm.EmissionSequence(ghmm.Float(), tmpData))
        # Make an average sequence.
        s = numpy.array(tdata[i])
        nm = hmmsup.obsToModel(s.mean(axis=0), max(s.std(axis=0)))
        nm.normalize()
        nm.baumWelch(seqSet)
        models[i] = nm
        hmmsup.normalizeAMat(models[i])
        hmmsup.normalizePiMat(models[i])
    return models
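# The merge pattern above in isolation (a runnable sketch using the same
# ghmm calls): start from an empty float SequenceSet and merge
# EmissionSequence objects into it one by one.
import ghmm

seqSet = ghmm.SequenceSet(ghmm.Float(), [])
seqSet.merge(ghmm.EmissionSequence(ghmm.Float(), [0.1, 0.2]))
seqSet.merge(ghmm.EmissionSequence(ghmm.Float(), [0.3]))
print len(seqSet)   # 2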
def fit(self, xData1, A=None, B=None, pi=None, cov_mult=[1.0] * 1,
        verbose=False, ml_pkl='ml_temp_1d.pkl', use_pkl=False):
    ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
    X1 = np.array(xData1)

    if A is None:
        if verbose: print "Generating a new A matrix"
        # Transition probability matrix (initial transition probability)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if verbose: print "Generating a new B matrix"
        # We should think about a multivariate Gaussian pdf.
        mu, sig = self.vectors_to_mean_sigma(X1, self.nState)
        # Must be [i, :] = [mu, sig]
        B = np.vstack([mu, sig * cov_mult[0]]).T.tolist()

    if pi is None:
        # pi - initial probabilities per state
        pi = [0.0] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F),
                                   A, B, pi)

    X_train = X1.tolist()
    print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
    final_seq = ghmm.SequenceSet(self.F, X_train)
    ret = self.ml.baumWelch(final_seq, 10000)
    print 'Baum Welch return:', ret
    if np.isnan(ret):
        return 'Failure'

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    # --------------- learning for anomaly detection -----------------------
    [A, B, pi] = self.ml.asMatrices()
    n, m = np.shape(X1)
    self.nGaussian = self.nState

    # Get the average loglikelihood threshold wrt progress.
    self.std_coff = 1.0
    g_mu_list = np.linspace(0, m - 1, self.nGaussian)
    g_sig = float(m) / float(self.nGaussian) * self.std_coff

    if os.path.isfile(ml_pkl) and use_pkl:
        with open(ml_pkl, 'rb') as f:
            d = pickle.load(f)
        self.l_statePosterior = d['state_post']  # time x state division
        self.ll_mu = d['ll_mu']
        self.ll_std = d['ll_std']
    else:
        if self.cluster_type == 'time':
            print 'Beginning parallel job'
            r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)(
                i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                g_mu_list[i], g_sig, self.nState)
                for i in xrange(self.nGaussian))
            print 'Completed parallel job'
            l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)
        elif self.cluster_type == 'state':
            self.km = None
            self.ll_mu = None
            self.ll_std = None
            self.ll_mu, self.ll_std = self.state_clustering(X1)
            path_mat = np.zeros((self.nState, m * n))
            likelihood_mat = np.zeros((1, m * n))
            self.l_statePosterior = None

        d = dict()
        d['state_post'] = self.l_statePosterior
        d['ll_mu'] = self.ll_mu
        d['ll_std'] = self.ll_std
        ut.save_pickle(d, ml_pkl)
def fit(self, xData1, xData2, xData3, A=None, B=None, pi=None,
        cov_mult=[100.0] * 9, verbose=False, ml_pkl='ml_temp.pkl',
        use_pkl=False):
    ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
    X1 = np.array(xData1)
    X2 = np.array(xData2)
    X3 = np.array(xData3)

    if A is None:
        if verbose: print "Generating a new A matrix"
        # Transition probability matrix (initial transition probability)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if verbose: print "Generating a new B matrix"
        # We should think about a multivariate Gaussian pdf.
        mu1, mu2, mu3, cov = self.vectors_to_mean_cov(X1, X2, X3, self.nState)
        # Scale the covariances to avoid "No convergence" warnings.
        cov[:, 0, 0] *= cov_mult[0]
        cov[:, 1, 0] *= cov_mult[1]
        cov[:, 2, 0] *= cov_mult[2]
        cov[:, 0, 1] *= cov_mult[3]
        cov[:, 1, 1] *= cov_mult[4]
        cov[:, 2, 1] *= cov_mult[5]
        cov[:, 0, 2] *= cov_mult[6]
        cov[:, 1, 2] *= cov_mult[7]
        cov[:, 2, 2] *= cov_mult[8]

        print 'mu1:', mu1
        print 'mu2:', mu2
        print 'mu3:', mu3
        print 'cov', cov

        # Emission probability matrix
        B = [0.0] * self.nState
        for i in range(self.nState):
            B[i] = [[mu1[i], mu2[i], mu3[i]],
                    [cov[i, 0, 0], cov[i, 0, 1], cov[i, 0, 2],
                     cov[i, 1, 0], cov[i, 1, 1], cov[i, 1, 2],
                     cov[i, 2, 0], cov[i, 2, 1], cov[i, 2, 2]]]

    if pi is None:
        # pi - initial probabilities per state
        pi = [0.0] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(
        self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)

    X_train = self.convert_sequence(X1, X2, X3)  # training input
    X_train = X_train.tolist()

    print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
    final_seq = ghmm.SequenceSet(self.F, X_train)
    ret = self.ml.baumWelch(final_seq, 10000)
    print 'Baum Welch return:', ret

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    # --------------- learning for anomaly detection -----------------------
    [A, B, pi] = self.ml.asMatrices()
    n, m = np.shape(X1)
    self.nGaussian = self.nState

    # Get the average loglikelihood threshold wrt progress.
    self.std_coff = 1.0
    g_mu_list = np.linspace(0, m - 1, self.nGaussian)
    g_sig = float(m) / float(self.nGaussian) * self.std_coff
    print 'g_mu_list:', g_mu_list
    print 'g_sig:', g_sig

    if os.path.isfile(ml_pkl) and use_pkl:
        with open(ml_pkl, 'rb') as f:
            d = pickle.load(f)
        self.l_statePosterior = d['state_post']  # time x state division
        self.ll_mu = d['ll_mu']
        self.ll_std = d['ll_std']
    else:
        n_jobs = -1
        print 'Beginning parallel job'
        r = Parallel(n_jobs=n_jobs)(delayed(learn_likelihoods_progress)(
            i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
            g_mu_list[i], g_sig, self.nState)
            for i in xrange(self.nGaussian))
        print 'Completed parallel job'
        l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)

        d = dict()
        d['state_post'] = self.l_statePosterior
        d['ll_mu'] = self.ll_mu
        d['ll_std'] = self.ll_std
        with open(ml_pkl, 'wb') as f:
            pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
# (fragment: the tail of an Emissionmatrix literal for the mixture case)
                          [[1.5, -1.0], [1.5, 1.5], [0.75, 0.25]],
                          [[1.5, -1.0], [1.5, 1.5], [0.5, 0.5]],
                          [[1.5, -1.0], [1.5, 1.5], [0.25, 0.75]]]
        # Each row: [p1_mean, p2_mean], [p1_std, p2_std], [P(p1), P(p2)]
        model = ghmm.HMMFromMatrices(F, ghmm.GaussianMixtureDistribution(F),
                                     Transitionmatrix, Emissionmatrix, pi)
    else:
        Emissionmatrix = [[params['bound'] * 100.0, 1.0],
                          [2.0, 0.5],
                          [1.0, 0.5],
                          [-1.0, 0.5],
                          [-2.0, 0.5]]  # [mean, std]
        model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F),
                                     Transitionmatrix, Emissionmatrix, pi)

    print('Model before training:')
    print(model)
    mghmm_train = ghmm.SequenceSet(F, train_set)
    model.baumWelch(mghmm_train, 10000, 0.01)
    print('Model after training:')
    print(model)
    model.write(out_hmm)

###------------------------------------------------
###------------------------------------------------
# Calculate tail lengths using the mghmm model and write them to the
# output files. dict_tl structure: {gene_name: [list of tail lengths]}
pwrite(f_log, '\nCalculating tail-lengths and writing outputs...' + timer())
lst_tl = []  # for storing gene_name, tail-length pairs
counting = 0
rounds = 0
counting_sum = 0
def createSequenceSet(self, qtc, symbols):
    return gh.SequenceSet(symbols, qtc)
def train(self, hmm, sset):
    sset = [array_flatten(s) for s in sset]
    hmm_ = self._get_hmm(hmm)
    hmm_.baumWelch(ghmm.SequenceSet(DOMAIN, sset))
    hmm.A, hmm.B, hmm.pi = hmm_.asMatrices()
def _sequence_set_from_list(l):
    # Conversion is similar to _sequence_from_data, but here data is a list.
    unrolled = [matrix.ravel().tolist() for matrix in l]
    seq = impl.SequenceSet(impl.Float(), unrolled)
    return seq
# %%
import ghmm

# %%
sigma = ghmm.IntegerRange(1, 7)
train_seq = ghmm.SequenceSet(
    sigma,
    [[1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1]])
A = [[0.99, 0.01], [0.99, 0.01]]
B = [[1.0 / 6] * 6] * 2
pi = [0.5] * 2
m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)
m.baumWelch(train_seq, 100000000, 0.000000000000001)
print(m.asMatrices())

# %%
print(map(sigma.external, m.sampleSingle(20)))

# %%
# test_seq was never defined in the original; a plausible stand-in
# mirroring my_seq below so that the cell runs.
test_seq = ghmm.EmissionSequence(sigma, [1] * 20 + [3] * 10 + [1] * 40)
v = m.viterbi(test_seq)
print v

# %%
my_seq = ghmm.EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print m.viterbi(my_seq)
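# %%
# A possible follow-up cell (an assumption, not part of the original):
# score a sequence under the trained model with loglikelihood().
print m.loglikelihood(my_seq)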
def extractHMMFeatures(sourceFiles):
    """Extracts HMM-similarity features from all files in a given directory."""
    allTraces = []  # list to store all traces for the HMM-similarity extraction
    try:
        for targetFile in sourceFiles:
            if os.path.exists(targetFile.replace(".c", ".seq")):
                instructionAlphaSequence = open(
                    targetFile.replace(".c", ".seq")).read()
                # Append a tuple of (trace, filename, cluster) per sample.
                allTraces.append(
                    (instructionAlphaSequence,
                     targetFile.replace(".c", ".hmm"),
                     loadLabelFromFile(targetFile.replace(".c", ".metadata"))[0]))
        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature "
                        "extraction. Skipping", "warning")
        else:
            # Retrieve the list of clusters.
            allClusters = []
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if not trace[2] in allClusters:
                    allClusters.append(trace[2])
            # Gather the traces belonging to the different clusters.
            clusterTraces = []
            for cluster in allClusters:
                currentCluster = []
                for trace in allTraces:
                    if trace[2] == cluster:
                        currentCluster.append(trace[0])
                clusterTraces.append(currentCluster)
                prettyPrint("Retrieved %s instances for cluster %s" %
                            (len(currentCluster), cluster))
            # We wind up with a list of lists, each holding the traces of
            # one cluster.
            allHMMs = []
            for cluster in allClusters:
                # Build an HMM for each cluster and use it to calculate
                # likelihoods for all instances.
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)
                trainingSequences = clusterTraces[allClusters.index(cluster)]
                # Retrieve the set of observations.
                observations = []
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in observations:
                            observations.append(o)
                # Prepare matrices for the HMM.
                A = numpy.random.random((len(allClusters),
                                         len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters),
                                         len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)
                # Build the HMM and train it using the Baum-Welch algorithm.
                clusterHMM = ghmm.HMMFromMatrices(
                    sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(
                    ghmm.SequenceSet(clusterHMM.emissionDomain,
                                     trainingSequences))
                # Add that to the list of all HMMs.
                allHMMs.append((clusterHMM, observations))
            # Finally, calculate the feature vector for every trace.
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for hmm in allHMMs:
                    # Keep only observations supported by the current HMM.
                    sequence = []
                    for obs in trace[0]:
                        if obs in hmm[1]:
                            sequence.append(obs)
                    # Calculate the likelihood.
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(hmm[1]),
                                                     sequence)
                    featureVector.append(hmm[0].loglikelihood(sequence))
                featureFile = open(trace[1], "w")
                featureFile.write(str(featureVector))
                featureFile.close()
    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")
        return False
    return True
# Emission probabilities
B = calculate_emission_probabilities(train_data, n_components, vocab_len)
# Initial state distribution: equally distribute the starting probabilities.
pi = [1.0 / n_components] * n_components

m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)
m.baumWelch(ghmm.SequenceSet(sigma, train_data),
            nrSteps=1000, loglikelihoodCutoff=0.00005)
# Defaults: nrSteps=500, loglikelihoodCutoff=0.0001

total_checked = 0
total_correct = 0
threshold_checked = 0
threshold_correct = 0
def fit(self, xData, A=None, B=None, pi=None, cov_mult=None, ml_pkl=None,
        use_pkl=False, cov_type='full', fixed_trans=0, shuffle=False):
    '''
    Input:
    - xData: dimension x sample x length

    Issues:
    - If NaN is returned, the reason can be one of the following:
      -- too low a covariance
      -- too small a range of xData (you have to scale it up)
    '''
    # Daehyung: What is the shape and type of input data?
    if shuffle:
        X = xData
        X = np.swapaxes(X, 0, 1)
        id_list = range(len(X))
        random.shuffle(id_list)
        X = np.array(X)[id_list]
        X = np.swapaxes(X, 0, 1)
    else:
        X = [np.array(data) for data in xData]
    nData = len(xData[0])

    param_dict = {}

    # Load a pre-trained HMM without training.
    if use_pkl and ml_pkl is not None and os.path.isfile(ml_pkl):
        if self.verbose:
            print "Load HMM parameters without training the HMM"
        param_dict = ut.load_pickle(ml_pkl)
        self.A = param_dict['A']
        self.B = param_dict['B']
        self.pi = param_dict['pi']
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(
                self.F, ghmm.GaussianDistribution(self.F),
                self.A, self.B, self.pi)
        else:
            self.ml = ghmm.HMMFromMatrices(
                self.F, ghmm.MultivariateGaussianDistribution(self.F),
                self.A, self.B, self.pi)

        out_a_num = param_dict.get('out_a_num', None)
        vec_num = param_dict.get('vec_num', None)
        mat_num = param_dict.get('mat_num', None)
        u_denom = param_dict.get('u_denom', None)
        if out_a_num is not None:
            self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num, u_denom)
        return True
    else:
        if ml_pkl is None:
            ml_pkl = os.path.join(os.path.dirname(__file__), 'ml_temp_n.pkl')
        if cov_mult is None:
            cov_mult = [1.0] * (self.nEmissionDim**2)

        if A is None:
            if self.verbose: print "Generating a new A matrix"
            # Transition probability matrix (initial transition probability)
            A = util.init_trans_mat(self.nState).tolist()

        if B is None:
            if self.verbose: print "Generating a new B matrix"
            # We should think about a multivariate Gaussian pdf.
            mus, cov = util.vectors_to_mean_cov(X, self.nState,
                                                self.nEmissionDim,
                                                cov_type=cov_type)
            # cov: state x dim x dim
            for i in xrange(self.nEmissionDim):
                for j in xrange(self.nEmissionDim):
                    cov[:, i, j] *= cov_mult[self.nEmissionDim * i + j]

            if self.verbose:
                for i, mu in enumerate(mus):
                    print 'mu%i' % i, mu

            # Emission probability matrix
            B = [0] * self.nState
            for i in range(self.nState):
                if self.nEmissionDim > 1:
                    B[i] = [[mu[i] for mu in mus]]
                    B[i].append(cov[i].flatten().tolist())
                else:
                    B[i] = [np.squeeze(mus[0][i]), float(cov[i])]

        if pi is None:
            # pi - initial probabilities per state
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # HMM model object
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(
                self.F, ghmm.GaussianDistribution(self.F), A, B, pi)
        else:
            self.ml = ghmm.HMMFromMatrices(
                self.F, ghmm.MultivariateGaussianDistribution(self.F),
                A, B, pi)
        if cov_type == 'diag':
            self.ml.setDiagonalCovariance(1)

        # Create training data.
        X_train = util.convert_sequence(X)  # training input
        X_train = X_train.tolist()
        if self.verbose:
            print "training data size: ", np.shape(X_train)
            print 'Run Baum Welch method with (samples, length)', \
                np.shape(X_train)

        final_seq = ghmm.SequenceSet(self.F, X_train)
        ret = self.ml.baumWelch(final_seq, 10000)  #, fixedTrans=fixed_trans)
        if np.isnan(ret):
            print 'Baum Welch return:', ret
            return 'Failure'
        print 'Baum Welch return:', ret / float(nData)

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        param_dict['A'] = self.A
        param_dict['B'] = self.B
        param_dict['pi'] = self.pi

        try:
            [out_a_num, vec_num, mat_num, u_denom] = \
                self.ml.getBaumWelchParams()
            param_dict['out_a_num'] = out_a_num
            param_dict['vec_num'] = vec_num
            param_dict['mat_num'] = mat_num
            param_dict['u_denom'] = u_denom
        except:
            print "Install new ghmm!!"

        if ml_pkl is not None:
            ut.save_pickle(param_dict, ml_pkl)
        return ret / float(nData)
def fit(self, xData, A=None, B=None, pi=None, cov_mult=None, ml_pkl=None,
        use_pkl=False):
    if ml_pkl is None:
        ml_pkl = os.path.join(os.path.dirname(__file__), 'ml_temp_n.pkl')
    if cov_mult is None:
        cov_mult = [1.0] * (self.nEmissionDim**2)

    # Daehyung: What is the shape and type of input data?
    X = [np.array(data) for data in xData]

    if A is None:
        if self.verbose: print "Generating a new A matrix"
        # Transition probability matrix (initial transition probability)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if self.verbose: print "Generating a new B matrix"
        # We should think about a multivariate Gaussian pdf.
        mus, cov = self.vectors_to_mean_cov(X, self.nState)
        for i in xrange(self.nEmissionDim):
            for j in xrange(self.nEmissionDim):
                cov[:, j, i] *= cov_mult[self.nEmissionDim * i + j]

        if self.verbose:
            for i, mu in enumerate(mus):
                print 'mu%i' % i, mu
            print 'cov', cov

        # Emission probability matrix
        B = [0] * self.nState
        for i in range(self.nState):
            B[i] = [[mu[i] for mu in mus]]
            B[i].append(cov[i].flatten())

    if pi is None:
        # pi - initial probabilities per state
        pi = [0.0] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(
        self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)

    X_train = self.convert_sequence(X)  # training input
    X_train = X_train.tolist()
    if self.verbose:
        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
    final_seq = ghmm.SequenceSet(self.F, X_train)
    ret = self.ml.baumWelch(final_seq, 10000)
    print 'Baum Welch return:', ret
    if np.isnan(ret):
        return 'Failure'

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    # --------------- learning for anomaly detection -----------------------
    [A, B, pi] = self.ml.asMatrices()
    n, m = np.shape(X[0])
    self.nGaussian = self.nState

    if self.check_method == 'change' or self.check_method == 'globalChange':
        # Get the maximum change of loglikelihood over the whole time.
        ll_delta_logp = []
        for j in xrange(n):
            l_logp = []
            for k in xrange(1, m):
                final_ts_obj = ghmm.EmissionSequence(
                    self.F, X_train[j][:k * self.nEmissionDim])
                logp = self.ml.loglikelihoods(final_ts_obj)[0]
                l_logp.append(logp)
            l_delta_logp = np.array(l_logp[1:]) - np.array(l_logp[:-1])
            ll_delta_logp.append(l_delta_logp)
        self.l_mean_delta = np.mean(abs(np.array(ll_delta_logp).flatten()))
        self.l_std_delta = np.std(abs(np.array(ll_delta_logp).flatten()))
        if self.verbose:
            print "mean_delta: ", self.l_mean_delta, \
                " std_delta: ", self.l_std_delta

    if self.check_method == 'global' or self.check_method == 'globalChange':
        # Get the average loglikelihood threshold over the whole time.
        l_logp = []
        for j in xrange(n):
            for k in xrange(1, m):
                final_ts_obj = ghmm.EmissionSequence(
                    self.F, X_train[j][:k * self.nEmissionDim])
                logp = self.ml.loglikelihoods(final_ts_obj)[0]
                l_logp.append(logp)
        self.l_mu = np.mean(l_logp)
        self.l_std = np.std(l_logp)
    elif self.check_method == 'progress':
        # Get the average loglikelihood threshold wrt progress.
        if os.path.isfile(ml_pkl) and use_pkl:
            if self.verbose: print 'Load detector parameters'
            d = ut.load_pickle(ml_pkl)
            self.l_statePosterior = d['state_post']  # time x state division
            self.ll_mu = d['ll_mu']
            self.ll_std = d['ll_std']
        else:
            if self.cluster_type == 'time':
                if self.verbose: print 'Beginning parallel job'
                self.std_coff = 1.0
                g_mu_list = np.linspace(0, m - 1, self.nGaussian)
                g_sig = float(m) / float(self.nGaussian) * self.std_coff
                r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)(
                    i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                    g_mu_list[i], g_sig, self.nState)
                    for i in xrange(self.nGaussian))
                if self.verbose: print 'Completed parallel job'
                l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)
            elif self.cluster_type == 'state':
                self.km = None
                self.ll_mu = None
                self.ll_std = None
                self.ll_mu, self.ll_std = self.state_clustering(X)
                path_mat = np.zeros((self.nState, m * n))
                likelihood_mat = np.zeros((1, m * n))
                self.l_statePosterior = None

            d = dict()
            d['state_post'] = self.l_statePosterior
            d['ll_mu'] = self.ll_mu
            d['ll_std'] = self.ll_std
            ut.save_pickle(d, ml_pkl)
def fit(self, xData1, xData2=None, A=None, B=None, pi=None,
        cov_mult=(1.0, 1.0, 1.0, 1.0), verbose=False, ml_pkl='ml_temp.pkl',
        use_pkl=False):
    ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
    X1 = np.array(xData1)
    X2 = np.array(xData2)

    if A is None:
        if verbose: print "Generating a new A matrix"
        # Transition probability matrix (initial transition probability)
        A = self.init_trans_mat(self.nState).tolist()

    if B is None:
        if verbose: print "Generating a new B matrix"
        # We should think about a multivariate Gaussian pdf.
        mu1, mu2, cov = self.vectors_to_mean_cov(X1, X2, self.nState)
        # Scale the covariances to avoid "No convergence" warnings.
        cov[:, 0, 0] *= cov_mult[0]
        cov[:, 1, 0] *= cov_mult[1]
        cov[:, 0, 1] *= cov_mult[2]
        cov[:, 1, 1] *= cov_mult[3]

        # Emission probability matrix
        B = [0.0] * self.nState
        for i in range(self.nState):
            B[i] = [[mu1[i], mu2[i]],
                    [cov[i, 0, 0], cov[i, 0, 1], cov[i, 1, 0], cov[i, 1, 1]]]

    if pi is None:
        # pi - initial probabilities per state
        pi = [0.0] * self.nState
        pi[0] = 1.0

    # HMM model object
    self.ml = ghmm.HMMFromMatrices(
        self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)

    X_train = self.convert_sequence(X1, X2)  # training input
    X_train = X_train.tolist()

    print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
    final_seq = ghmm.SequenceSet(self.F, X_train)
    ret = self.ml.baumWelch(final_seq, 10000)
    print 'Baum Welch return:', ret

    [self.A, self.B, self.pi] = self.ml.asMatrices()
    self.A = np.array(self.A)
    self.B = np.array(self.B)

    # --------------- learning for anomaly detection -----------------------
    [A, B, pi] = self.ml.asMatrices()
    n, m = np.shape(X1)
    self.nGaussian = self.nState

    if self.check_method == 'change' or self.check_method == 'globalChange':
        # Get the maximum change of loglikelihood over the whole time.
        ll_delta_logp = []
        for j in xrange(n):
            l_logp = []
            for k in xrange(1, m):
                final_ts_obj = ghmm.EmissionSequence(
                    self.F, X_train[j][:k * self.nEmissionDim])
                logp = self.ml.loglikelihoods(final_ts_obj)[0]
                l_logp.append(logp)
            l_delta_logp = np.array(l_logp[1:]) - np.array(l_logp[:-1])
            ll_delta_logp.append(l_delta_logp)
        self.l_mean_delta = np.mean(abs(np.array(ll_delta_logp).flatten()))
        self.l_std_delta = np.std(abs(np.array(ll_delta_logp).flatten()))
        print "mean_delta: ", self.l_mean_delta, \
            " std_delta: ", self.l_std_delta

    if self.check_method == 'global' or self.check_method == 'globalChange':
        # Get the average loglikelihood threshold over the whole time.
        l_logp = []
        for j in xrange(n):
            for k in xrange(1, m):
                final_ts_obj = ghmm.EmissionSequence(
                    self.F, X_train[j][:k * self.nEmissionDim])
                logp = self.ml.loglikelihoods(final_ts_obj)[0]
                l_logp.append(logp)
        self.l_mu = np.mean(l_logp)
        self.l_std = np.std(l_logp)
    elif self.check_method == 'progress':
        # Get the average loglikelihood threshold wrt progress.
        self.std_coff = 1.0
        g_mu_list = np.linspace(0, m - 1, self.nGaussian)
        g_sig = float(m) / float(self.nGaussian) * self.std_coff

        if os.path.isfile(ml_pkl) and use_pkl:
            with open(ml_pkl, 'rb') as f:
                d = pickle.load(f)
            self.l_statePosterior = d['state_post']  # time x state division
            self.ll_mu = d['ll_mu']
            self.ll_std = d['ll_std']
        else:
            n_jobs = -1
            r = Parallel(n_jobs=n_jobs)(delayed(learn_likelihoods_progress)(
                i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                g_mu_list[i], g_sig, self.nState)
                for i in xrange(self.nGaussian))
            l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)

            d = dict()
            d['state_post'] = self.l_statePosterior
            d['ll_mu'] = self.ll_mu
            d['ll_std'] = self.ll_std
            with open(ml_pkl, 'wb') as f:
                pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
def get_seqs_set(self, obs_seqs):
    obs_seqs = self.get_obs_seqs(obs_seqs)
    obs_seqs_set = ghmm.SequenceSet(self.emission_domain, obs_seqs)
    return obs_seqs_set
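# A concrete instantiation with assumed values (the class's
# emission_domain and get_obs_seqs() are not shown here): with an integer
# domain, the returned SequenceSet indexes back into EmissionSequence
# objects over the same domain.
import ghmm

emission_domain = ghmm.IntegerRange(0, 10)
obs_seqs = [[1, 2, 3], [4, 5]]
seq_set = ghmm.SequenceSet(emission_domain, obs_seqs)
print seq_set[0]   # an EmissionSequence over the same domain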