def create_0order_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Build a 0-order HMM whose parameters are seeded from a MEME result.

    The model has one background state followed by one state per motif
    position.

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME (not read by this function)
    :type first_letters: dic of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    motif_length = len(motif)

    # When the motif was built from instances, motif.counts was derived from
    # them, so the number of hits is the number of instances.
    nb_hits = len(motif.instances) if motif.instances else nb_seq

    # State 0 is the background (uniform emissions); every following state
    # emits according to the motif frequencies with a pseudocount of 1.
    emissions = [[0.25, 0.25, 0.25, 0.25]]
    emissions.extend(
        [(motif.counts[letter][position] + 1.) / (nb_hits + 4.)
         for letter in "ACGT"]
        for position in range(motif_length))

    # Background row: stay in the background or enter the first motif state.
    stay_in_background = 1. - float(nb_seq) / nb_residues
    enter_motif = 1. - stay_in_background
    transitions = [
        [stay_in_background, enter_motif] + [0.] * (motif_length - 1)]

    # Core rows: each motif state moves to the next one with probability 1.
    transitions.extend(
        [0.] * (position + 1) + [1.] + [0.] * (motif_length - position - 1)
        for position in range(1, motif_length))

    # Last motif state goes back to the background.
    transitions.append([1.] + [0.] * motif_length)

    # The model always starts in the background state.
    initials = [1.] + [0.] * motif_length

    return ghmm.HMMFromMatrices(
        ghmm.Alphabet(ALPHABET),
        ghmm.DiscreteDistribution(ghmm.Alphabet(ALPHABET)),
        transitions, emissions, initials)
def train(self):
    """Fit a discrete HMM to ``self.events`` with Baum-Welch.

    The model gets one state per distinct event, and every transition,
    emission and initial probability starts out uniform.

    Returns a ``(model, alphabet)`` tuple.
    """
    # Every distinct value GHMM will ever see in the training data.
    symbols = ghmm.Alphabet(list(set(self.events)))
    size = len(symbols)
    uniform = 1.0 / (size)

    # Uniform square matrices: simple, though probably not the best start.
    transition_matrix = [[uniform] * size for _ in range(size)]
    emission_matrix = [[uniform] * size for _ in range(size)]
    start_probabilities = [1.0 / size] * size

    # The observed sequence of musical events.
    observed = ghmm.EmissionSequence(symbols, self.events)

    model = ghmm.HMMFromMatrices(
        symbols, ghmm.DiscreteDistribution(symbols),
        transition_matrix, emission_matrix, start_probabilities)
    model.baumWelch(observed)

    return (model, symbols)
def _get_posterior_proba(self, sequence_split):
    """
    Compute the posterior probabilities at each nucleotide position given
    the TFFM.

    :arg sequence_split: The sequence split into subsequences so that
        stretches of non ACGT nucleotides are isolated.
    :type sequence_split: list

    :returns: The posterior probabilities at each position of the sequence.
    :rtype: list of list

    :note: One example of a sequence_split is ["ACT", "N", "ATC"].

    """

    alphabet = ghmm.Alphabet(EXTENDED_ALPHABET)
    # Row of null probabilities used for every non ACGT position; the very
    # same list object is reused for all of those positions.
    zeros = [0.] * self.N
    probabilities = []
    for chunk in sequence_split:
        # Only the first character decides whether the chunk is scored.
        if not re.match("[ACGT]", chunk):
            probabilities.extend([zeros] * len(chunk))
            continue
        emitted = ghmm.SequenceSet(alphabet, [chunk])[0]
        probabilities.extend(self.posterior(emitted))
    return probabilities
def train(self, training_file, epsilon=0.0001, max_iter=500):
    """
    Learn the emission and transition probabilities of the TFFM from the
    sequences of a fasta file.

    :note: The underlying HMM is trained with the Baum-Welch algorithm.

    :arg training_file: The fasta file of the sequences to train the TFFM
        on.
    :type training_file: str
    :arg epsilon: The least relative improvement cut-off in likelihood
        compared to the previous Baum-Welch iteration (default: 0.0001).
    :type epsilon: float
    :arg max_iter: The maximum number of Baum-Welch iterations used to
        reestimate the probabilities (default: 500).
    :type max_iter: int

    """

    assert(os.path.isfile(training_file))

    # ALPHABET only holds upper-case letters, so normalise the case first,
    # then keep only the sequences made exclusively of A, C, G and T.
    upper_cased = (record.seq.upper()
                   for record in SeqIO.parse(training_file, "fasta"))
    sequences = [sequence for sequence in upper_cased
                 if not re.search("[^AGCT]", str(sequence))]

    training_sequences = ghmm.SequenceSet(ghmm.Alphabet(ALPHABET),
                                          sequences)
    # ghmm does not seem to weight sequences equally by default, so the
    # weights are set explicitly.
    utils.set_sequences_weight(training_sequences, 1.0)
    self.baumWelch(training_sequences, max_iter, epsilon)
def __init__(self, A, B, Pi, observations):
    """Wrap a GHMM discrete model built from the given matrices.

    A is the transition matrix, B the emission matrix, Pi the initial
    distribution and observations the symbols of the emission alphabet.
    When A and Pi disagree on the number of states an error is reported
    and the instance is left without any model attributes.
    """
    if len(A) != len(Pi):
        prettyPrint("Unable to initialize model. Unequal number of states", "error")
        return

    self.states = range(len(A))
    # The "alphabet" comprising action indices
    self.sigma = ghmm.Alphabet(observations)
    self.initA, self.initB, self.initPi = A, B, Pi
    self.ghmmModel = ghmm.HMMFromMatrices(
        self.sigma, ghmm.DiscreteDistribution(self.sigma),
        self.initA, self.initB, self.initPi)
def ghmm_from_discrete_hmm(hmm):
    """Return a GHMM model mirroring the given discrete HMM.

    The input is deep-copied first, so the matrices handed to GHMM are not
    the ones owned by the caller's object.
    """
    snapshot = deepcopy(hmm)
    symbols = ghmm.Alphabet(range(snapshot.alphabetSize))
    transition_matrix = snapshot.transitionMatrix
    initial_probabilities = snapshot.initialProbabilities
    emission_rows = []
    for distribution in snapshot.emissionDistributions:
        emission_rows.append(distribution.probabilities)
    return ghmm.HMMFromMatrices(
        emissionDomain=symbols,
        distribution=ghmm.DiscreteDistribution(symbols),
        A=transition_matrix,
        B=emission_rows,
        pi=initial_probabilities)
def __init__(self, n_hid=10, **kwargs):
    """Set up the player and its pair-of-symbols emission alphabet.

    :param n_hid: value stored as ``self._n_hid`` (default 10)
    :param **kwargs: forwarded unchanged to ``RPSPlayer.__init__``

    """
    RPSPlayer.__init__(self, **kwargs)

    self._n_hid = n_hid
    self._n_sym = len(self._rules)

    # The alphabet holds every ordered pair (i, j) of symbol indices.
    indices = range(self._n_sym)
    self._alphab = gh.Alphabet(
        [(first, second) for first in indices for second in indices])

    # conversion_array[i][j] is the alphabet's internal code for (i, j).
    lookup = []
    for first in indices:
        lookup.append([self._alphab.internal((first, second))
                       for second in indices])
    self._conversion_array = np.asarray(lookup)

    self._prediction_mode = False
def test_viterbi_against_hmm(self):
    """Check our Viterbi decoding against GHMM's on a left-to-right model.

    The emitted sequence 0..nSymbols-1 is also the expected state path.
    """
    from kerehmm.test.util import ghmm_from_discrete_hmm
    import ghmm

    model = self.new_hmm()
    model.setup_strict_left_to_right(set_emissions=True)
    domain = ghmm.Alphabet(range(model.alphabetSize))
    reference_model = ghmm_from_discrete_hmm(model)

    observations = list(range(self.nSymbols))
    print("True path and emission: {}".format(observations))
    expected_path = observations

    reference_path, reference_prob = reference_model.viterbi(
        ghmm.SequenceSet(domain, [observations]))
    path, prob = model.viterbi_path(observations)

    print("Reference path: {}".format(reference_path))
    print("Calculated path: {}".format(path))
    print("Reference prob: {}, Calculated prob: {}".format(
        reference_prob, prob))

    assert np.all(np.equal(expected_path, reference_path))
    assert np.all(np.equal(expected_path, path))
    assert np.isclose(prob, reference_prob)
def train_model(songs_data):
    """Train one note HMM per bar type.

    Input: list of data on several songs (could be a single song)
    Output: a tuple ``(note_models, note_alphabet)`` where ``note_models``
    maps each bar type to its trained model.
    """
    notes = get_notes(songs_data)

    # Every distinct note GHMM will ever see.
    note_alphabet = ghmm.Alphabet(list(set(notes)))
    size = len(note_alphabet)
    uniform = 1.0 / (size)

    # Uniform starting point for transitions, emissions and initial
    # probabilities: simple, though probably not the best initialisation.
    trans = [[uniform] * size for _ in range(size)]
    emiss = [[uniform] * size for _ in range(size)]
    pi = [1.0 / size] * size

    # The sequence of notes gathered from the music.
    note_train_seq = ghmm.EmissionSequence(note_alphabet, notes)

    bars = BarLearner.get_bars(songs_data)

    # One untrained model per distinct bar type.
    note_models = {
        bar: ghmm.HMMFromMatrices(
            note_alphabet, ghmm.DiscreteDistribution(note_alphabet),
            trans, emiss, pi)
        for bar in set(bars)
    }

    # One Baum-Welch run per bar occurrence, always on the full note
    # sequence.
    for bar in bars:
        note_models[bar].baumWelch(note_train_seq)

    return (note_models, note_alphabet)
def finalize(self):
    """Finalize the underlying model and wrap it in the matching GHMM class.

    Returns a ContinuousMixtureHMM for continuous models, a StateLabelHMM
    or DiscreteEmissionHMM for plain discrete models (no transition
    classes, not a pair HMM), and None for any other model type.
    """
    cmodel = self.HMM.finalize()
    flags = self.modeltype

    if flags & ghmmwrapper.kContinuousHMM:
        return ghmm.ContinuousMixtureHMM(
            ghmm.Float(),
            ghmm.ContinuousMixtureDistribution(ghmm.Float()),
            cmodel)

    plain_discrete = ((flags & ghmmwrapper.kDiscreteHMM)
                      and not (flags & ghmmwrapper.kTransitionClasses)
                      and not (flags & ghmmwrapper.kPairHMM))
    if not plain_discrete:
        return None

    emission_domain = ghmm.Alphabet([], cmodel.alphabet)
    if flags & ghmmwrapper.kLabeledStates:
        labelDomain = ghmm.LabelDomain([], cmodel.label_alphabet)
        return ghmm.StateLabelHMM(
            emission_domain,
            ghmm.DiscreteDistribution(emission_domain),
            labelDomain,
            cmodel)

    return ghmm.DiscreteEmissionHMM(
        emission_domain,
        ghmm.DiscreteDistribution(emission_domain),
        cmodel)
def __init__(self, allele):
    '''
    Initialize the predictor. For now, only H2-IAb is supported.

    Loads the packaged HMM model and the percentile rank calibration data
    (``cdf`` and ``bin_edges`` arrays) shipped next to this module.

    :param allele: name of the allele; must be 'H2-IAb'
    :raises ValueError: if any other allele is requested
    '''
    if (allele != 'H2-IAb'):
        raise ValueError('Unsupported allele')

    self.allele = allele

    # range of allowed peptide lengths (percentile rank calibration was
    # done for this range)
    self.minPeptideLength = 12
    self.maxPeptideLength = 24

    # define the HMM emission alphabet, including a dedicated "stop"
    # emission (a GHMM-specific tweak)
    self.alphabet = ghmm.Alphabet(ghmm.AminoAcids.listOfCharacters + ['Z'])

    # a value to replace inf in the predictions. Same value as used in
    # percentile rank calibration
    self.infSubstitute = 1000

    # GHMM has an internal limitation of 1,500,000 sequences in a set
    self.sequencesPerBlock = 1500000

    # load the HMM model and the corresponding percentile rank calibration data
    from pkg_resources import resource_filename
    hmmFile = resource_filename(__name__,
                                'models/hmm-h2iab-iedb2018-binders.xml')
    prCalibrationFile = resource_filename(
        __name__, 'models/precent-rank-model-h2iab.npz')

    self.hmm = ghmm.HMMOpen(hmmFile)

    # np.load on an .npz archive returns a handle that keeps the file open;
    # read both arrays inside a with-block so it is closed right away.
    with np.load(prCalibrationFile) as prCalibration:
        self.prCalibrationCdf = prCalibration['cdf']
        self.prCalibrationBinEdges = prCalibration['bin_edges']
pi=init)


def ghmm_from_multivariate_continuous_hmm(hmm):
    """Return a GHMM model built from a multivariate continuous HMM.

    The input is deep-copied, its transition matrix and initial
    probabilities are converted with ``tolist()``, and each emission
    distribution contributes a ``[mean, flattened variance]`` pair of
    lists. The model is created over ``ghmm.Float()`` with a
    ``MultivariateGaussianDistribution``.
    """
    hmm = deepcopy(hmm)
    domain = ghmm.Float()
    trans = hmm.transitionMatrix.tolist()
    init = hmm.initialProbabilities.tolist()
    # One [mean, flattened variance] entry per emission distribution.
    emissions = [[d.mean.tolist(), d.variance.flatten().tolist()]
                 for d in hmm.emissionDistributions]
    # print init
    # print trans
    # print emissions
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.MultivariateGaussianDistribution(domain),
                                A=trans,
                                B=emissions,
                                pi=init)


# Manual smoke run: convert a freshly built discrete test HMM to GHMM and
# print the model plus the first element of its forward and backward results
# for the sequence [0, 0, 0].
if __name__ == "__main__":
    from discrete_hmm_test import DiscreteHMMTest
    test = DiscreteHMMTest()
    hmm = test.new_hmm()
    # NOTE(review): `domain` is not referenced again in the visible code; the
    # sequence below uses the reference model's own emission domain.
    domain = ghmm.Alphabet(range(hmm.alphabetSize))
    hmm_reference = ghmm_from_discrete_hmm(hmm)
    seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, [0, 0, 0])
    print hmm_reference
    print hmm_reference.forward(seq)[0]
    print hmm_reference.backward(seq)[0]
def extractHMMFeatures(sourceFiles):
    """ Extracts HMM-similarity features from all files in a given directory

    For every path in ``sourceFiles`` that has a matching ".seq" trace file,
    the trace is grouped by the cluster label read from the matching
    ".metadata" file. One HMM is trained per cluster with Baum-Welch, and for
    every trace the log-likelihood under each cluster HMM is written, as the
    string form of a list, to the matching ".hmm" file.

    :param sourceFiles: iterable of source file paths (expected to end in ".c")
    :returns: True when processing finished (also when there was nothing to
        process), False when any exception was raised along the way
    """
    allTraces = []  # (trace, feature filename, cluster) for each data sample
    try:
        for targetFile in sourceFiles:
            # NOTE(review): str.replace substitutes every ".c" occurrence in
            # the path, not only the extension -- confirm paths never contain
            # ".c" elsewhere.
            sequenceFile = targetFile.replace(".c", ".seq")
            if os.path.exists(sequenceFile):
                # Read through a context manager so the handle is closed.
                with open(sequenceFile) as sequenceHandle:
                    instructionAlphaSequence = sequenceHandle.read()
                allTraces.append(
                    (instructionAlphaSequence,
                     targetFile.replace(".c", ".hmm"),
                     loadLabelFromFile(targetFile.replace(".c", ".metadata"))[0]))

        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature extraction. Skipping", "warning")
        else:
            # Retrieve list of clusters, in order of first appearance
            allClusters = []
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if not trace[2] in allClusters:
                    allClusters.append(trace[2])

            # Gather traces belonging to different clusters: a list of lists,
            # each holding the traces of one cluster
            clusterTraces = []
            for cluster in allClusters:
                currentCluster = [trace[0] for trace in allTraces
                                  if trace[2] == cluster]
                clusterTraces.append(currentCluster)
                prettyPrint("Retrieved %s instances for cluster %s" % (len(currentCluster), cluster))

            # Build HMM for each cluster and use it to calculate likelihoods
            # for all instances
            allHMMs = []
            for cluster, trainingSequences in zip(allClusters, clusterTraces):
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)

                # Distinct observations in order of first appearance (the
                # order defines the alphabet); the set gives O(1) membership.
                observations = []
                seenObservations = set()
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in seenObservations:
                            seenObservations.add(o)
                            observations.append(o)

                # Prepare matrices for HMM: one state per cluster, random start
                A = numpy.random.random((len(allClusters), len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters), len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)

                # Build HMM and train it using Baum-Welch algorithm
                clusterHMM = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(ghmm.SequenceSet(clusterHMM.emissionDomain, trainingSequences))

                # Add that to list of all HMM's
                allHMMs.append((clusterHMM, observations, seenObservations))

            # Finally, for every trace, calculate the feature vectors
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for clusterHMM, observations, supported in allHMMs:
                    # Keep only the observations supported by the current HMM
                    sequence = [obs for obs in trace[0] if obs in supported]
                    # Calculate the likelihood
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(observations), sequence)
                    featureVector.append(clusterHMM.loglikelihood(sequence))

                # The with-block closes the file even if the write fails.
                with open(trace[1], "w") as featureFile:
                    featureFile.write(str(featureVector))

    except Exception as e:
        # Any failure is reported and signalled through the return value.
        prettyPrint("Error encoutered: %s" % e, "error")
        return False

    return True
action='store', help='output FASTA file of shuffled sequences') parser.add_argument('--prefix', default='', action='store', help='add this prefix to the sequence ids') options = parser.parse_args() fin = open(options.infasta, 'r') if (options.outfasta != '-'): fout = open(options.outfasta, 'w') else: fout = sys.stdout sigma = ghmm.Alphabet(['A', 'C', 'G', 'T']) #import ipdb; ipdb.set_trace() # HMM with transition probabilities learned from the dinucleotide frequencies # # A->A, A->C, A->G, A->T # C->A, C->C, C->G, C->T # G->A, G->C, G->G, G->T # T->A, T->C, T->G, T->T # # emission probabilities # # A->A (1.0), A->C (0.0), A->G (0.0), A->T (0.0) # C->A (0.0), C->C (1.0), C->G (0.0), C->T (0.0) # G->A (0.0), G->C (0.0), G->G (1.0), G->T (0.0)
def convert2ghmm(h):
    """Build a GHMM discrete model from the matrices stored on ``h``.

    The emission alphabet is the 20-letter string "ACDEFGHIKLMNPQRSTVWY";
    ``h._t`` is passed as transitions, the transpose of ``h._e`` as
    emissions and ``h._i`` as initial probabilities.
    """
    amino_acids = ghmm.Alphabet("ACDEFGHIKLMNPQRSTVWY")
    distribution = ghmm.DiscreteDistribution(amino_acids)
    transitions, emissions, initials = h._t, h._e.T, h._i
    return ghmm.HMMFromMatrices(amino_acids, distribution,
                                transitions, emissions, initials)
def create_detailed_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a detailed HMM initialized from MEME result

    The model has 4 * (len(motif) + 1) states: 4 background states followed
    by 4 states per motif position. Every state emits exactly one letter
    (one-hot emission rows in A, C, G, T order).

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME
    :type first_letters: dic of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    # Starting proba with the starting nucleotides of the sequences and a '1'
    # pseudocount added
    # NOTE(review): the four background entries already sum to
    # (sum(first_letters) + 4) / (nb_seq + 4), so together with the four
    # 1 / nb_seq entries the vector may not sum to 1 -- presumably normalized
    # downstream; confirm.
    initials = [(first_letters['A'] + 1.) / (nb_seq + 4.),
                (first_letters['C'] + 1.) / (nb_seq + 4.),
                (first_letters['G'] + 1.) / (nb_seq + 4.),
                (first_letters['T'] + 1.) / (nb_seq + 4.),
                1. / nb_seq,
                1. / nb_seq,
                1. / nb_seq,
                1. / nb_seq]
    # All remaining motif states have a zero starting probability.
    initials += [0.] * 4 * (len(motif) - 1)

    # Emission proba: the same 4 one-hot rows repeated for every group of 4
    # states
    emissions = [[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.],
                 [0., 0., 0., 1.]] * (len(motif) + 1)

    # Background transitions proba
    if motif.instances:
        # The motif.counts is computed directly when creating the motif from
        # instances
        nb_hits = len(motif.instances)
    else:
        nb_hits = nb_seq
    background_to_background = 1. - float(nb_seq) / nb_residues
    background_to_foreground = 1. - background_to_background
    # Spread the background self-transition mass over the 4 background states.
    background_to_background /= 4.

    # Probability of entering the motif through each letter of its first
    # position (pseudocounted frequency scaled by background_to_foreground).
    transi = {}
    for letter in "ACGT":
        freq = (motif.counts[letter][0] + 1.) / (nb_hits + 4.)
        transi[letter] = freq * background_to_foreground

    # The 4 background rows are identical.
    transitions = []
    for __ in xrange(4):
        transitions.append([background_to_background,
                            background_to_background,
                            background_to_background,
                            background_to_background,
                            transi['A'], transi['C'], transi['G'],
                            transi['T']] + [0.] * 4 * (len(motif) - 1))

    # Pseudocounted frequencies flattened letter by letter: the entry for
    # letter number l (in ACGT order) at position p sits at index
    # l * len(motif) + p.
    pfm = [(v + 1.) / (nb_hits + 4.) for letter in 'ACGT'
           for v in motif.counts[letter]]

    # Start every motif row with zeros only.
    for position in xrange(len(motif) * 4):
        transitions.append([0.] * 4 * (len(motif) + 1))

    # Core transitions: with the +3 offset, `line + 3` runs over the 4 states
    # of motif position `position - 1` and `column + 3` over the 4 states of
    # motif position `position`; each target state receives the pfm value of
    # its own letter at that position.
    for position in xrange(1, len(motif)):
        for line in xrange(4 * (position - 1) + 1, 4 * (position - 1) + 5):
            for column in xrange(4 * position + 1, 4 * position + 5):
                index = (column - (4 * position + 1)) * len(motif) + position
                transitions[line + 3][column + 3] = pfm[index]

    # The last 4 states go back to the 4 background states uniformly.
    for index in xrange(4):
        state = len(motif) * 4 + index
        transitions[state][0] = 0.25
        transitions[state][1] = 0.25
        transitions[state][2] = 0.25
        transitions[state][3] = 0.25

    return ghmm.HMMFromMatrices(ghmm.Alphabet(ALPHABET),
                                ghmm.DiscreteDistribution(
                                    ghmm.Alphabet(ALPHABET)),
                                transitions, emissions, initials)
#author: Anas Elghafari #building a toy profile HMM in two libraries, GHMM and YAHMM, #with the purpose of comparing the viterbi calculations in both libraries from yahmm import * import ghmm random.seed(0) #needed for yahmm ?? #ghmm model: alphabet = ghmm.Alphabet(['A', 'C', 'G', 'T']) initial_probs = ([1] + [0] * 13) start_em = end_em = [0.25, 0.25, 0.25, 0.25] m1_em = [0.8, 0.1, 0.05, 0.05] m2_em = [0.8, 0.1, 0.05, 0.05] m3_em = [0.1, 0.8, 0.05, 0.05] m4_em = [0.1, 0.8, 0.05, 0.05] i1_em = i2_em = i3_em = i4_em = [0.25, 0.25, 0.25, 0.25] d1_em = d2_em = d3_em = d4_em = [0, 0, 0, 0] #transitions: m1_trans = [0, 0, 0.2, 0.2, 0.6] + ([0] * 9) m2_trans = [0] * 5 + [0.2, 0.2, 0.6] + [0] * 6 m3_trans = [0] * 8 + [0.2, 0.2, 0.6] + [0] * 3 m4_trans = [0] * 11 + [0.2, 0.2, 0.6] i1_trans = [0, 0, 0.2, 0, 0.8] + [0] * 9 i2_trans = [0] * 5 + [0.2, 0, 0.8] + [0] * 6 i3_trans = [0] * 8 + [0.2, 0, 0.8] + [0] * 3 i4_trans = [0] * 11 + [0.2, 0, 0.8] d1_trans = [0] * 2 + [0.1, 0, 0.9] + [0] * 9 d2_trans = [0] * 5 + [0.1, 0, 0.9] + [0] * 6