def _train(self, seq, trans, emi, num_possible_states, pseudo_transitions=False, start_at_zero=False):
    """Build a multinomial HMM from the given matrices and fit it with Baum-Welch.

    Please override if special training is necessary for your QSR.

    :param seq: the sequences of observations represented by alphabet symbols
    :param trans: the initial transition matrix as a numpy array
    :param emi: the initial emission matrix as a numpy array
    :param num_possible_states: the total number of possible states
    :param pseudo_transitions: if True, after training a pseudo count of
        ``1 / (len(seq) + 1)`` is added to every transition that is greater
        than zero in ``trans``, and the rebuilt model is normalized
    :param start_at_zero: if True, the whole start probability is put on
        state 0; otherwise the start distribution is uniform
    :return: the hmm generated via Baum-Welch training
    """
    print('Generating HMM:')
    print(seq)
    print('\tCreating symbols...')
    symbols = self.generate_alphabet(num_possible_states)

    if not start_at_zero:
        # Uniform start distribution over all states.
        startprob = np.ones(num_possible_states)
        startprob = startprob / np.sum(startprob)
    else:
        # Every sequence is assumed to begin in state 0.
        startprob = np.zeros(num_possible_states)
        startprob[0] = 1
    print(startprob)
    print('\t\t' + str(symbols))

    print('\tCreating HMM...')
    distribution = gh.DiscreteDistribution(symbols)
    hmm = gh.HMMFromMatrices(
        symbols,
        distribution,
        trans.tolist(),
        emi.tolist(),
        startprob.tolist(),
    )

    print('\tTraining...')
    training_set = self._create_sequence_set(seq, symbols)
    hmm.baumWelch(training_set)

    if not pseudo_transitions:
        return hmm

    print('\tAdding pseudo transitions...')
    # Mark every transition allowed by the initial matrix, then scale the
    # marks down by the number of training sequences plus one.
    pseudo_counts = deepcopy(trans)
    pseudo_counts[pseudo_counts > 0.] = 1.
    pseudo_counts = pseudo_counts / (float(len(seq) + 1))

    trained_trans, trained_emi, trained_start = hmm.asMatrices()
    smoothed_trans = np.array(trained_trans) + pseudo_counts

    hmm = gh.HMMFromMatrices(
        symbols,
        gh.DiscreteDistribution(symbols),
        smoothed_trans.tolist(),
        trained_emi,
        trained_start,
    )
    # Rows no longer sum to one after adding the pseudo counts.
    hmm.normalize()
    return hmm
def __init__(self, preprocess_args, metric, graph_structure_type, A, B, pi, obs_bins, win_len, thresh, min_peak_dist):
    """Store the configuration and derive the ghmm emission domain.

    Args:
        preprocess_args: stored unchanged on the instance
        metric: stored unchanged on the instance
        graph_structure_type: "predefined", "fully", "left_to_right"
        A: initial hidden states graph
        B: initial emission matrix; the length of its first row is used
            as the number of symbols
        pi: initial hidden states probabilities
        obs_bins: bins used in the hidden states distribution
        win_len: windows lengths of the sliding window offline
        thresh: in the peak detection, detect peaks that are greater
            than thresh
        min_peak_dist: in the peak detection, detect peaks that are at
            least separated by minimum peak distance
    """
    # Pre-processing and segmentation settings.
    self.preprocess_args = preprocess_args
    self.metric = metric
    self.win_len = win_len
    self.thresh = thresh
    self.min_peak_dist = min_peak_dist
    self.obs_bins = obs_bins

    # Initial HMM parameters.
    self.graph_structure_type = graph_structure_type
    self.A = A
    self.B = B
    self.pi = pi

    # One emission symbol per column of B, numbered from 0.
    num_symbols = len(self.B[0])
    self.emission_domain = ghmm.IntegerRange(0, num_symbols)
    self.emission_distr = ghmm.DiscreteDistribution(self.emission_domain)
def _predict_next(self):
    """Fit a randomly initialised HMM to the stored memory and predict from it.

    A discrete HMM with ``self._n_hid`` hidden states over ``self._alphab``
    is trained with Baum-Welch on ``self._memory``. The last forward
    variable is pushed one step through the trained transition matrix and
    used to weight the emission distributions of the hidden states, which
    gives an estimate of the next symbol distribution.

    :returns: the first index at which ``self._rules[next_move]`` equals
        -1, where ``next_move`` is the argmax of the estimated distribution
        after re-indexing it with ``self._conversion_array`` and summing
        over axis 0
    """
    n_hidden = self._n_hid
    n_emissions = self._n_sym**2

    # Random initial parameters; the draw order (A, B, pi) is kept so that
    # a seeded generator yields the same model.
    a_init = normalize_stoch_map(np.random.rand(n_hidden, n_hidden))
    b_init = normalize_stoch_map(np.random.rand(n_hidden, n_emissions))
    pi_init = normalize_stoch_map(np.random.rand(n_hidden))

    distribution = gh.DiscreteDistribution(self._alphab)
    model = gh.HMMFromMatrices(self._alphab, distribution, a_init, b_init, pi_init)
    observed = gh.EmissionSequence(self._alphab, self._memory)
    model.baumWelch(observed)

    # Forward variable of the last observation, advanced by one transition.
    forward_result = model.forward(observed)
    last_alpha = forward_result[0][-1]
    trained_trans = model.asMatrices()[0]
    state_weights = np.dot(last_alpha, trained_trans)

    # Mixture of the per-state emission distributions.
    next_moves_dist = np.zeros(n_emissions)
    for state in range(n_hidden):
        emission = np.asarray(model.getEmission(state))
        next_moves_dist += emission * state_weights[state]

    regrouped = next_moves_dist[self._conversion_array]
    next_move = np.argmax(np.sum(regrouped, axis=0))
    matches = np.where(self._rules[next_move] == -1)
    return matches[0][0]
def _train(self, seq, trans, emi, num_possible_states):
    """Build a multinomial HMM from the given matrices and fit it with Baum-Welch.

    The start distribution puts all probability on state 0.
    Please override if special training is necessary for your QSR.

    :param seq: the sequences of observations represented by alphabet symbols
    :param trans: the initial transition matrix as a numpy array
    :param emi: the initial emission matrix as a numpy array
    :param num_possible_states: the total number of possible states
    :return: the hmm generated via Baum-Welch training
    """
    print('Generating HMM:')
    print('\tCreating symbols...')
    symbols = self._generate_alphabet(num_possible_states)

    # Every sequence starts in state 0.
    startprob = np.zeros(num_possible_states)
    startprob[0] = 1
    print('\t\t' + str(symbols))

    print('\tCreating HMM...')
    distribution = gh.DiscreteDistribution(symbols)
    hmm = gh.HMMFromMatrices(
        symbols,
        distribution,
        trans.tolist(),
        emi.tolist(),
        startprob.tolist(),
    )

    print('\tTraining...')
    training_set = self._create_sequence_set(seq, symbols)
    hmm.baumWelch(training_set)
    return hmm
def trainHMM(self, seq, trans, emi, qtc_type='qtcc'):
    """Train a multinomial HMM on the given sequences with Baum-Welch.

    The number of states depends on ``qtc_type``: 11 for 'qtcb', 83 for
    'qtcc' and 92 for 'qtcbc'. Any other value raises QtcException. The
    start distribution puts all probability on state 0.

    ``trans`` and ``emi`` must provide ``tolist()``; the result is the
    trained ghmm model.
    """
    for known_type, known_count in (('qtcb', 11), ('qtcc', 83), ('qtcbc', 92)):
        if qtc_type == known_type:
            state_num = known_count
            break
    else:
        raise (QtcException(
            "trainHMM: Unknow qtc type: {!r}".format(qtc_type)))

    print('Generating HMM:')
    print('\tCreating symbols...')
    symbols = self.generateAlphabet(state_num)

    startprob = np.zeros(state_num)
    startprob[0] = 1
    print('\t\t' + str(symbols))

    print('\tCreating HMM...')
    distribution = gh.DiscreteDistribution(symbols)
    qtc_hmm = gh.HMMFromMatrices(
        symbols,
        distribution,
        trans.tolist(),
        emi.tolist(),
        startprob.tolist(),
    )

    print('\tTraining...')
    training_set = self.createSequenceSet(seq, symbols)
    qtc_hmm.baumWelch(training_set)
    return qtc_hmm
def train(self):
    """Train a uniformly initialised HMM on ``self.events``.

    The alphabet is the set of distinct events, and the model gets one
    hidden state per alphabet symbol. Returns the tuple
    ``(model, alphabet)``.
    """
    # GHMM has to know every value it may observe.
    alphabet = ghmm.Alphabet(list(set(self.events)))
    size = len(alphabet)

    # Simple starting point: every transition, every emission and every
    # start state is equally likely.
    uniform = 1.0 / (size)
    trans = [[uniform] * size for _ in range(size)]
    emiss = [[uniform] * size for _ in range(size)]
    pi = [1.0 / size] * size

    # The sequence of musical events gathered from the music.
    train_seq = ghmm.EmissionSequence(alphabet, self.events)

    distribution = ghmm.DiscreteDistribution(alphabet)
    model = ghmm.HMMFromMatrices(alphabet, distribution, trans, emiss, pi)
    model.baumWelch(train_seq)
    return (model, alphabet)
def create_0order_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a 0-order HMM initialized from MEME result

    State 0 is a background state with uniform emissions; states
    1..len(motif) emit according to the motif counts with a pseudocount of
    1 and are chained one after another, the last one returning to the
    background state.

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the begining of
        sequences used by MEME (not used by this function)
    :type first_letters: dic of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the
        TFFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM, as built by ``ghmm.HMMFromMatrices``

    """
    motif_len = len(motif)

    # motif.counts is computed directly from the instances when there are
    # any; otherwise fall back to the number of sequences.
    nb_hits = len(motif.instances) if motif.instances else nb_seq

    # Emissions: random background first, then one row per motif position.
    emissions = [[0.25, 0.25, 0.25, 0.25]]
    emissions.extend(
        [(motif.counts[letter][position] + 1.) / (nb_hits + 4.)
         for letter in "ACGT"]
        for position in range(motif_len))

    # Background row: stay in the background or enter the motif.
    stay_in_background = 1. - float(nb_seq) / nb_residues
    enter_motif = 1. - stay_in_background
    transitions = [
        [stay_in_background, enter_motif] + [0.] * (motif_len - 1)]
    # Core rows: each motif state moves to the next one with certainty.
    transitions.extend(
        [0.] * (position + 1) + [1.] + [0.] * (motif_len - position - 1)
        for position in range(1, motif_len))
    # Final row: the last motif state goes back to the background.
    transitions.append([1.] + [0.] * motif_len)

    # Always start in the background state.
    initials = [1.] + [0.] * motif_len

    emission_domain = ghmm.Alphabet(ALPHABET)
    distribution = ghmm.DiscreteDistribution(ghmm.Alphabet(ALPHABET))
    return ghmm.HMMFromMatrices(emission_domain, distribution, transitions,
                                emissions, initials)
def setUp(self):
    '''Create a simple dice rolling HMM with a fair and a loaded state.'''
    # Six emission symbols, one per entry of the emission rows below.
    self.sigma = g.IntegerRange(1, 7)

    fair_die = [1.0 / 6] * 6
    loaded_die = [
        3.0 / 13, 3.0 / 13,
        2.0 / 13, 2.0 / 13, 2.0 / 13,
        1.0 / 13,
    ]

    self.A = [[0.9, 0.1], [0.3, 0.7]]
    self.B = [fair_die, loaded_die]
    self.pi = [0.5] * 2

    distribution = g.DiscreteDistribution(self.sigma)
    self.m = g.HMMFromMatrices(self.sigma, distribution, self.A, self.B,
                               self.pi)
def ghmm_from_discrete_hmm(hmm):
    """Build a ghmm model from a deep copy of a discrete HMM object.

    Reads ``alphabetSize``, ``transitionMatrix``, ``initialProbabilities``
    and the ``probabilities`` of every entry in ``emissionDistributions``
    from the copy, so the object passed in is left untouched.
    """
    snapshot = deepcopy(hmm)
    # Symbols are the integers 0 .. alphabetSize - 1.
    domain = ghmm.Alphabet(range(snapshot.alphabetSize))
    transition_matrix = snapshot.transitionMatrix
    start_probabilities = snapshot.initialProbabilities
    emission_rows = [
        state_distribution.probabilities
        for state_distribution in snapshot.emissionDistributions
    ]
    return ghmm.HMMFromMatrices(
        emissionDomain=domain,
        distribution=ghmm.DiscreteDistribution(domain),
        A=transition_matrix,
        B=emission_rows,
        pi=start_probabilities,
    )
def __init__(self, A, B, Pi, observations):
    """Wrap a ghmm model built from the given initial matrices.

    If ``A`` and ``Pi`` do not have the same length, an error is reported
    through prettyPrint and no attribute is set on the instance.
    """
    if len(A) != len(Pi):
        prettyPrint("Unable to initialize model. Unequal number of states", "error")
        return

    self.states = range(len(A))
    # The "alphabet" comprising action indices
    self.sigma = ghmm.Alphabet(observations)
    self.initA = A
    self.initB = B
    self.initPi = Pi
    distribution = ghmm.DiscreteDistribution(self.sigma)
    self.ghmmModel = ghmm.HMMFromMatrices(
        self.sigma, distribution, self.initA, self.initB, self.initPi)
    return
def finalize(self):
    """Finalize the wrapped model and wrap the result in a ghmm class.

    The class is chosen from the flags in ``self.modeltype``:

    * kContinuousHMM set: ``ghmm.ContinuousMixtureHMM``
    * kDiscreteHMM set, kTransitionClasses and kPairHMM both unset:
      ``ghmm.StateLabelHMM`` when kLabeledStates is set, otherwise
      ``ghmm.DiscreteEmissionHMM``

    Returns None for any other combination of flags.
    """
    cmodel = self.HMM.finalize()
    modeltype = self.modeltype

    if modeltype & ghmmwrapper.kContinuousHMM:
        return ghmm.ContinuousMixtureHMM(
            ghmm.Float(),
            ghmm.ContinuousMixtureDistribution(ghmm.Float()),
            cmodel)

    plain_discrete = ((modeltype & ghmmwrapper.kDiscreteHMM)
                      and not (modeltype & ghmmwrapper.kTransitionClasses)
                      and not (modeltype & ghmmwrapper.kPairHMM))
    if not plain_discrete:
        # Unsupported model type: nothing to wrap.
        return None

    emission_domain = ghmm.Alphabet([], cmodel.alphabet)
    if modeltype & ghmmwrapper.kLabeledStates:
        label_domain = ghmm.LabelDomain([], cmodel.label_alphabet)
        return ghmm.StateLabelHMM(
            emission_domain,
            ghmm.DiscreteDistribution(emission_domain),
            label_domain,
            cmodel)
    return ghmm.DiscreteEmissionHMM(
        emission_domain,
        ghmm.DiscreteDistribution(emission_domain),
        cmodel)
def trainHMM(hmmState):
    '''
    Train a two-state, two-symbol HMM on the states of one chromosome.

    hmmState is the observation sequence over the symbols 0 and 1.
    Progress messages go to stderr; the trained model is returned.
    '''
    print >> sys.stderr, printTime(), "Train HMM with one chromosome."

    # Both states strongly prefer to stay where they are.
    transition = [[0.9, 0.1], [0.1, 0.9]]
    # State 0 mostly emits 0, state 1 mostly emits 1.
    emission = [[0.9, 0.1], [0.1, 0.9]]
    # initial 10% are peak?
    start = [0.9, 0.1]

    symbols = ghmm.IntegerRange(0, 2)  # 0, 1
    distribution = ghmm.DiscreteDistribution(symbols)
    model = ghmm.HMMFromMatrices(symbols, distribution, transition,
                                 emission, start)

    observations = ghmm.EmissionSequence(symbols, hmmState)
    model.baumWelch(observations)

    print >> sys.stderr, printTime(), "Train HMM finished."
    print >> sys.stderr
    return model
def __create_hmm_from_dict(self, dictionary, qsr_type, num_symbols):
    """Creates a hmm from its dictionary representation.

    :param dictionary: mapping that holds the transition, emission and
        start matrices under the keys self.TRANS, self.EMI and self.START
    :param qsr_type: key into self.hmm_types_available; an instance of the
        selected class generates the alphabet
    :param num_symbols: the number of symbols passed to generate_alphabet

    :return: the ghmm hmm object
    """
    hmm_class = self.hmm_types_available[qsr_type]
    symbols = hmm_class().generate_alphabet(num_symbols)
    distribution = gh.DiscreteDistribution(symbols)
    return gh.HMMFromMatrices(
        symbols,
        distribution,
        dictionary[self.TRANS],
        dictionary[self.EMI],
        dictionary[self.START],
    )
def train_model(songs_data):
    """Train one note HMM per bar type.

    Input: list of data on several songs (could be a single song)
    Output: a tuple ``(note_models, note_alphabet)`` where ``note_models``
    maps each distinct bar to its model.

    Every model starts from the same uniform parameters and is trained on
    the complete note sequence, once for each occurrence of its bar in the
    bar list.
    """
    notes = get_notes(songs_data)

    # GHMM has to know every value it may observe.
    note_alphabet = ghmm.Alphabet(list(set(notes)))
    size = len(note_alphabet)

    # Simple starting point: every transition, every emission and every
    # start state is equally likely.
    uniform = 1.0 / (size)
    trans = [[uniform] * size for _ in range(size)]
    emiss = [[uniform] * size for _ in range(size)]
    pi = [1.0 / size] * size

    # The sequence of notes gathered from the music.
    note_train_seq = ghmm.EmissionSequence(note_alphabet, notes)

    bars = BarLearner.get_bars(songs_data)

    # One untrained model per distinct bar.
    note_models = {}
    for bar_type in set(bars):
        distribution = ghmm.DiscreteDistribution(note_alphabet)
        note_models[bar_type] = ghmm.HMMFromMatrices(
            note_alphabet, distribution, trans, emiss, pi)

    # Iterating the full bar list (not the set) repeats the training for
    # bars that occur more than once.
    for bar_type in bars:
        note_models[bar_type].baumWelch(note_train_seq)

    return (note_models, note_alphabet)
# %%
import ghmm

# %%
# Six emission symbols, matching the six emission probabilities per state.
sigma = ghmm.IntegerRange(1, 7)
train_seq = ghmm.SequenceSet(
    sigma,
    [[1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1]])

# Two hidden states, uniform emissions and a uniform start distribution.
A = [[0.99, 0.01], [0.99, 0.01]]
B = [[1.0 / 6] * 6] * 2
pi = [0.5] * 2
m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)

# Positional arguments: number of steps and log-likelihood cutoff.
m.baumWelch(train_seq, 100000000, 0.000000000000001)
print(m.asMatrices())

# %%
# Keep the sampled sequence: the next cell decodes it. It used to be
# discarded, which left `test_seq` undefined and raised a NameError below.
# NOTE(review): confirm that the sample is the sequence meant to be decoded.
test_seq = m.sampleSingle(20)
print(map(sigma.external, test_seq))

# %%
v = m.viterbi(test_seq)
print(v)

# %%
my_seq = ghmm.EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print(m.viterbi(my_seq))
def _swap_c_extension(path, new_ext):
    """Return ``path`` with its trailing ".c" replaced by ``new_ext``."""
    if path.endswith(".c"):
        # Only touch the extension: a plain str.replace would also rewrite
        # every other ".c" in the path (e.g. a ".cache" directory).
        return path[:-len(".c")] + new_ext
    # Not a ".c" path: keep the historical substring replacement so such
    # inputs behave as before.
    return path.replace(".c", new_ext)


def extractHMMFeatures(sourceFiles):
    """Extract HMM-similarity features for the given source files.

    For every path in ``sourceFiles`` whose sibling ".seq" file exists, the
    trace is read from it and its cluster label is the first element
    returned by ``loadLabelFromFile`` for the sibling ".metadata" file. One
    HMM is trained per cluster (Baum-Welch, random initial parameters, one
    hidden state per cluster). For each trace the log-likelihood under
    every cluster HMM is computed and the resulting list is written, as a
    string, to the sibling ".hmm" file.

    :param sourceFiles: iterable of paths to ".c" files
    :return: True on success (also when there was nothing to process),
        False if any exception occurred; the error is reported through
        prettyPrint
    """
    # One (trace, feature file path, cluster label) tuple per data sample.
    allTraces = []
    try:
        for targetFile in sourceFiles:
            sequencePath = _swap_c_extension(targetFile, ".seq")
            if not os.path.exists(sequencePath):
                continue
            # Close the file deterministically instead of leaking the handle.
            with open(sequencePath) as sequenceFile:
                instructionAlphaSequence = sequenceFile.read()
            allTraces.append(
                (instructionAlphaSequence,
                 _swap_c_extension(targetFile, ".hmm"),
                 loadLabelFromFile(
                     _swap_c_extension(targetFile, ".metadata"))[0]))

        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature extraction. Skipping", "warning")
        else:
            # Group the traces by cluster, keeping first-seen cluster order.
            # Lists (not a dict) are used so labels need not be hashable.
            allClusters = []
            clusterTraces = []
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if trace[2] not in allClusters:
                    allClusters.append(trace[2])
                    clusterTraces.append([])
                clusterTraces[allClusters.index(trace[2])].append(trace[0])
            for cluster, currentCluster in zip(allClusters, clusterTraces):
                prettyPrint("Retrieved %s instances for cluster %s" % (len(currentCluster), cluster))

            # Build one HMM per cluster.
            allHMMs = []
            for cluster, trainingSequences in zip(allClusters, clusterTraces):
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)
                # Distinct observations in first-seen order; the set makes
                # the membership test O(1) instead of scanning the list.
                observations = []
                seenObservations = set()
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in seenObservations:
                            seenObservations.add(o)
                            observations.append(o)
                # Random initial parameters, one hidden state per cluster.
                # NOTE(review): these matrices are not normalized here -
                # confirm that GHMM accepts them as they are.
                A = numpy.random.random((len(allClusters), len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters), len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)
                # Build HMM and train it using Baum-Welch algorithm
                clusterHMM = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(ghmm.SequenceSet(clusterHMM.emissionDomain, trainingSequences))
                allHMMs.append((clusterHMM, observations, seenObservations))

            # Finally, for every trace, calculate the feature vectors
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for clusterHMM, observations, seenObservations in allHMMs:
                    # Keep only the observations the current HMM supports.
                    supported = [obs for obs in trace[0] if obs in seenObservations]
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(observations), supported)
                    featureVector.append(clusterHMM.loglikelihood(sequence))
                with open(trace[1], "w") as featureFile:
                    featureFile.write(str(featureVector))

    except Exception as e:
        # Top-level boundary of the extraction: report and signal failure.
        prettyPrint("Error encoutered: %s" % e, "error")
        return False

    return True
sigma = ghmm.IntegerRange(0, vocab_len) # Emission range # Transition Matrix A = calculate_transition_probabilities(n_components) # Emission Probabilities B = calculate_emission_probabilities(train_data, n_components, vocab_len) # Initial State Distribution pi = [ 1.0 / n_components ] * n_components # Equally distribute the starting probabilities m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi) m.baumWelch( ghmm.SequenceSet(sigma, train_data), nrSteps=1000, loglikelihoodCutoff=0.00005 ) # Defaults: nrSteps=500, loglikelihoodCutoff=0.0001 # print('Training Done') # print(m.asMatrices()[0]) # print(m.asMatrices()[1]) # print(m.asMatrices()[2]) total_checked = 0 total_correct = 0
def create_detailed_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a detailed HMM initialized from MEME result

    The model has ``4 * (len(motif) + 1)`` states, each emitting exactly
    one of the four symbols with probability 1: states 0-3 are the
    background states, and states ``4 * (p + 1)`` to ``4 * (p + 1) + 3``
    belong to motif position ``p``. Transitions into a motif state use the
    motif counts with a pseudocount of 1, and the states of the last motif
    position return to the background states with probability 0.25 each.

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the begining of
        sequences used by MEME
    :type first_letters: dic of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the
        TFFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM, as built by ``ghmm.HMMFromMatrices``

    """
    letters = "ACGT"
    motif_len = len(motif)

    # Starting proba with the starting nucleotides of the sequences and a
    # '1' pseudocount added, then the first motif position, then zeros.
    initials = [(first_letters[letter] + 1.) / (nb_seq + 4.)
                for letter in letters]
    initials += [1. / nb_seq] * 4
    initials += [0.] * 4 * (motif_len - 1)

    # Emission proba: a 4x4 identity repeated for every group of states.
    identity = [[1. if row == column else 0. for column in range(4)]
                for row in range(4)]
    emissions = identity * (motif_len + 1)

    # motif.counts is computed directly from the instances when there are
    # any; otherwise fall back to the number of sequences.
    nb_hits = len(motif.instances) if motif.instances else nb_seq

    # Background transitions proba.
    stay_in_background = 1. - float(nb_seq) / nb_residues
    enter_motif = 1. - stay_in_background
    stay_per_letter = stay_in_background / 4.
    entry = [((motif.counts[letter][0] + 1.) / (nb_hits + 4.)) * enter_motif
             for letter in letters]
    transitions = [
        [stay_per_letter] * 4 + entry + [0.] * 4 * (motif_len - 1)
        for _ in range(4)]

    # Flattened, letter-major frequency matrix: the value for letter k at
    # position p sits at index k * motif_len + p.
    pfm = [(count + 1.) / (nb_hits + 4.)
           for letter in letters
           for count in motif.counts[letter]]

    # One all-zero row per motif state, filled in below.
    transitions.extend(
        [0.] * 4 * (motif_len + 1) for _ in range(motif_len * 4))

    # Every state of motif position (p - 1) moves to the four states of
    # motif position p according to the frequencies at p.
    for position in range(1, motif_len):
        first_target = 4 * (position + 1)
        for source in range(4 * position, 4 * position + 4):
            for letter_index in range(4):
                transitions[source][first_target + letter_index] = (
                    pfm[letter_index * motif_len + position])

    # The states of the last motif position return to the background.
    for offset in range(4):
        state = motif_len * 4 + offset
        for background_state in range(4):
            transitions[state][background_state] = 0.25

    emission_domain = ghmm.Alphabet(ALPHABET)
    distribution = ghmm.DiscreteDistribution(ghmm.Alphabet(ALPHABET))
    return ghmm.HMMFromMatrices(emission_domain, distribution, transitions,
                                emissions, initials)
def convert2ghmm(h):
    """Convert an HMM object to a GHMM object.

    The transition matrix ``h._t``, the transposed emission matrix
    ``h._e.T`` and the initial distribution ``h._i`` are passed to ghmm
    over the 20-letter alphabet "ACDEFGHIKLMNPQRSTVWY".
    """
    residue_alphabet = ghmm.Alphabet("ACDEFGHIKLMNPQRSTVWY")
    distribution = ghmm.DiscreteDistribution(residue_alphabet)
    return ghmm.HMMFromMatrices(
        residue_alphabet,
        distribution,
        h._t,
        h._e.T,
        h._i,
    )
start_trans = [0.5, 0.5] + [0] * 12 #end_trans = [0]*13 + [1] doesn't match what we have for yahmm end_trans = [0] * 14 transitions = [ start_trans, m1_trans, i1_trans, d1_trans, m2_trans, i2_trans, d2_trans, m3_trans, i3_trans, d3_trans, m4_trans, i4_trans, d4_trans, end_trans ] emissions = [ start_em, m1_em, i1_em, d1_em, m2_em, i2_em, d2_em, m3_em, i3_em, d3_em, m4_em, i4_em, d4_em, end_em ] ghmm_model = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet), transitions, emissions, initial_probs) print(ghmm_model) #print("A sample:\n\n") #print(ghmm_model.sampleSingle(30)) #yahmm: yahmm_model = yahmm.Model(name="ProfileHMM") s_d = yahmm.DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}) m_dist1 = yahmm.DiscreteDistribution({ 'A': 0.8, 'C': 0.1, 'G': 0.05, 'T': 0.05 })