def _train(self,
               seq,
               trans,
               emi,
               num_possible_states,
               pseudo_transitions=False,
               start_at_zero=False):
        """Uses the given parameters to train a multinominal HMM to represent
        the given seqences of observations. Uses Baum-Welch training.
        Please override if special training is necessary for your QSR.

        :param seq: the sequence of observations represented by alphabet symbols
        :param trans: the transition matrix as a numpy array
        :param emi: the emission matrix as a numpy array
        :param num_possible_states: the total number of possible states

        :return: the via baum-welch training generated hmm
        """

        print 'Generating HMM:'
        print seq
        print '\tCreating symbols...'
        symbols = self.generate_alphabet(num_possible_states)
        if start_at_zero:
            startprob = np.zeros(num_possible_states)
            startprob[0] = 1
        else:
            startprob = np.ones(num_possible_states)
            startprob = startprob / np.sum(startprob)
        print startprob
        print '\t\t', symbols
        print '\tCreating HMM...'
        hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                                 trans.tolist(), emi.tolist(),
                                 startprob.tolist())
        print '\tTraining...'
        hmm.baumWelch(self._create_sequence_set(seq, symbols))

        if pseudo_transitions:
            print '\tAdding pseudo transitions...'
            pseudo = deepcopy(trans)
            pseudo[pseudo > 0.] = 1.
            pseudo = pseudo / (float(len(seq) + 1))

            trans_trained, emi, start = hmm.asMatrices()
            trans_trained = np.array(trans_trained) + pseudo

            hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                                     trans_trained.tolist(), emi, start)

            hmm.normalize()

        return hmm
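A minimal standalone sketch of the same training flow, assuming `gh` aliases the `ghmm` module; the toy 3-state, 3-symbol model, the uniform initial matrices, and the two training sequences below are illustrative, not taken from the original class:

import ghmm as gh
import numpy as np

n = 3                                    # toy number of states and symbols
symbols = gh.Alphabet(range(n))          # stands in for generate_alphabet(n)
trans = np.ones((n, n)) / n              # uniform initial transition matrix
emi = np.ones((n, n)) / n                # uniform initial emission matrix
start = np.ones(n) / n                   # uniform start distribution

hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                         trans.tolist(), emi.tolist(), start.tolist())
hmm.baumWelch(gh.SequenceSet(symbols, [[0, 1, 2, 1, 0], [2, 2, 1, 0, 0]]))
print(hmm.asMatrices())                  # trained [A, B, pi]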
Example #2
    def __init__(self, preprocess_args, metric, graph_structure_type, A, B, pi,
                 obs_bins, win_len, thresh, min_peak_dist):
        """
        Args:
            preprocess_args:
            metric:
            graph_structure_type: "predefined", "fully", "left_to_right"
            A: initial hidden states graph
            B: initial hidden states distribution
            pi: initial hidden states probabilities
            obs_bins: bins used in the hidden states distribution
            win_len: length of the sliding window used offline
            thresh: in the peak detection, detect peaks that are greater
                    than thresh
            min_peak_dist: in the peak detection, detect peaks that are
                           separated by at least this distance
        """

        self.preprocess_args = preprocess_args
        self.metric = metric

        self.graph_structure_type = graph_structure_type
        self.A = A
        self.B = B
        self.pi = pi
        self.obs_bins = obs_bins
        self.win_len = win_len
        self.thresh = thresh
        self.min_peak_dist = min_peak_dist

        m = len(self.B[0])  # num of symbols
        self.emission_domain = ghmm.IntegerRange(0, m)
        self.emission_distr = ghmm.DiscreteDistribution(self.emission_domain)
Example #3
    def _predict_next(self):
        """@todo: Docstring for _predict_next.
        :returns: @todo

        """
        a_init = normalize_stoch_map(np.random.rand(self._n_hid, self._n_hid))
        b_init = normalize_stoch_map(
            np.random.rand(self._n_hid, self._n_sym**2))
        pi_init = normalize_stoch_map(np.random.rand(self._n_hid))
        hmm = gh.HMMFromMatrices(self._alphab,
                                 gh.DiscreteDistribution(self._alphab), a_init,
                                 b_init, pi_init)
        obs = gh.EmissionSequence(self._alphab, self._memory)
        hmm.baumWelch(obs)

        alpha = hmm.forward(obs)[0][-1]
        trans = hmm.asMatrices()[0]
        alpha = np.dot(alpha, trans)
        next_moves_dist = np.zeros(self._n_sym**2)
        for i in range(self._n_hid):
            next_moves_dist += np.asarray(hmm.getEmission(i)) * alpha[i]
        next_moves_dist = next_moves_dist[self._conversion_array]
        next_move = np.argmax(np.sum(next_moves_dist, axis=0))

        return np.where(self._rules[next_move] == -1)[0][0]
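The forecast above is the generic one-step-ahead prediction: take the final (scaled) forward vector, push it through the transition matrix, and mix the per-state emission distributions. A condensed sketch of just that step on a toy two-state model (all values illustrative):

import ghmm as gh
import numpy as np

sigma = gh.IntegerRange(0, 2)
hmm = gh.HMMFromMatrices(sigma, gh.DiscreteDistribution(sigma),
                         [[0.9, 0.1], [0.2, 0.8]],      # transitions
                         [[0.7, 0.3], [0.1, 0.9]],      # emissions
                         [0.5, 0.5])                    # start distribution
obs = gh.EmissionSequence(sigma, [0, 0, 1, 0, 1])

alpha = np.asarray(hmm.forward(obs)[0][-1])             # filtered state distribution
alpha = np.dot(alpha, np.asarray(hmm.asMatrices()[0]))  # propagate one step
next_dist = sum(alpha[i] * np.asarray(hmm.getEmission(i)) for i in range(2))
print(next_dist)                                        # P(next symbol is 0 or 1)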
Example #4
    def _train(self, seq, trans, emi, num_possible_states):
        """Uses the given parameters to train a multinominal HMM to represent
        the given seqences of observations. Uses Baum-Welch training.
        Please override if special training is necessary for your QSR.

        :param seq: the sequence of observations represented by alphabet symbols
        :param trans: the transition matrix as a numpy array
        :param emi: the emission matrix as a numpy array
        :param num_possible_states: the total number of possible states

        :return: the via baum-welch training generated hmm
        """

        print 'Generating HMM:'
        print '\tCreating symbols...'
        symbols = self._generate_alphabet(num_possible_states)
        startprob = np.zeros(num_possible_states)
        startprob[0] = 1
        print '\t\t', symbols
        print '\tCreating HMM...'
        hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                                 trans.tolist(), emi.tolist(),
                                 startprob.tolist())
        print '\tTraining...'
        hmm.baumWelch(self._create_sequence_set(seq, symbols))

        return hmm
Example #5
    def trainHMM(self, seq, trans, emi, qtc_type='qtcc'):
        """Uses the given parameters to train a multinominal HMM to represent the given seqences"""

        if qtc_type == 'qtcb':
            state_num = 11
        elif qtc_type == 'qtcc':
            state_num = 83
        elif qtc_type == 'qtcbc':
            state_num = 92
        else:
            raise QtcException(
                "trainHMM: Unknown qtc type: {!r}".format(qtc_type))

        print 'Generating HMM:'
        print '\tCreating symbols...'
        symbols = self.generateAlphabet(state_num)
        startprob = np.zeros((state_num))
        startprob[0] = 1
        print '\t\t', symbols
        print '\tCreating HMM...'
        qtc_hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                                     trans.tolist(), emi.tolist(),
                                     startprob.tolist())
        print '\tTraining...'
        qtc_hmm.baumWelch(self.createSequenceSet(seq, symbols))

        return qtc_hmm
Example #6
    def train(self):
        # This tells GHMM every possible value that it will be seeing
        alphabet = ghmm.Alphabet(list(set(self.events)))
        alphaLen = len(alphabet)

        # Initialize the probabilities of transitioning from each state to each other
        # state. There is probably a better way to do this, but this is nice and simple.
        trans_prob = 1.0 / (alphaLen)
        trans = [[trans_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Initialize the probabilities of seeing each output from each state.
        # Again, there is probably a better way to do this, but this is simple.
        emiss_prob = 1.0 / (alphaLen)
        emiss = [[emiss_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Some grease to get GHMM to work
        pi = [1.0 / alphaLen] * alphaLen

        # The sequence of musical events gathered from the music
        train_seq = ghmm.EmissionSequence(alphabet, self.events)

        # Generate the model of the data
        m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                                 trans, emiss, pi)

        # Train the model based on the training sequence
        m.baumWelch(train_seq)

        return (m, alphabet)
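A hedged follow-up showing what the returned pair is good for, namely scoring a new sequence; the `events` list here is made up for illustration:

import ghmm

events = ['C4', 'E4', 'G4', 'C4', 'E4']            # hypothetical event stream
alphabet = ghmm.Alphabet(list(set(events)))
n = len(alphabet)
uniform = [[1.0 / n] * n for _ in range(n)]
m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                         uniform, uniform, [1.0 / n] * n)
m.baumWelch(ghmm.EmissionSequence(alphabet, events))

held_out = ghmm.EmissionSequence(alphabet, ['C4', 'E4', 'G4'])
print(m.loglikelihood(held_out))                   # higher is more plausible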
Example #7
def create_0order_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a 0-order HMM initialized from MEME result

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME
    :type first_letters: dict of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    # The first (background) state emits all letters uniformly
    emissions = [[0.25, 0.25, 0.25, 0.25]]
    # Complete the emissions with the actual motif frequencies
    if motif.instances:
        # The motif.counts is computed directly when creating the motif from
        # instances
        nb_hits = len(motif.instances)
    else:
        nb_hits = nb_seq
    for position in xrange(len(motif)):
        frequencies = []
        for letter in "ACGT":
            freq = (motif.counts[letter][position] + 1.) / (nb_hits + 4.)
            frequencies.append(freq)
        emissions.append(frequencies)

    # Background transitions
    transitions = []
    background_to_background = 1. - float(nb_seq) / nb_residues
    background_to_foreground = 1. - background_to_background
    transitions.append(
        [background_to_background, background_to_foreground] + [0.] *
        (len(motif) - 1))
    # Core transitions
    for position in xrange(1, len(motif)):
        transitions.append(
            [0.] * (position + 1) + [1.] + [0.] * (len(motif) - position - 1))
    # Final transitions now
    transitions.append([1.] + [0.] * len(motif))

    # Starting probabilities
    initials = [1.] + [0.] * len(motif)
    return ghmm.HMMFromMatrices(ghmm.Alphabet(ALPHABET),
                                ghmm.DiscreteDistribution(
                                    ghmm.Alphabet(ALPHABET)),
                                transitions, emissions, initials)
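For orientation, the transition matrix built above is a left-to-right chain hanging off a single background state; for a hypothetical 2-position motif it reduces to (b2b and b2f values illustrative):

b2b, b2f = 0.9, 0.1      # background self-loop vs. entry into the motif
transitions = [
    [b2b, b2f, 0.0],     # background -> {background, position 1}
    [0.0, 0.0, 1.0],     # position 1 -> position 2, deterministically
    [1.0, 0.0, 0.0],     # position 2 -> back to background
]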
Example #8
 def setUp(self):
     '''Create a simple dice rolling HMM'''
     self.sigma = g.IntegerRange(1, 7)
     self.A = [[0.9, 0.1], [0.3, 0.7]]
     efair = [1.0 / 6] * 6
     eloaded = [3.0 / 13, 3.0 / 13, 2.0 / 13, 2.0 / 13, 2.0 / 13, 1.0 / 13]
     self.B = [efair, eloaded]
     self.pi = [0.5] * 2
     self.m = g.HMMFromMatrices(self.sigma,
                                g.DiscreteDistribution(self.sigma), self.A,
                                self.B, self.pi)
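A typical test built on this fixture would decode which die produced each roll; a minimal sketch reusing the same matrices (the roll sequence is made up):

import ghmm as g

sigma = g.IntegerRange(1, 7)
A = [[0.9, 0.1], [0.3, 0.7]]
B = [[1.0 / 6] * 6,
     [3.0 / 13, 3.0 / 13, 2.0 / 13, 2.0 / 13, 2.0 / 13, 1.0 / 13]]
m = g.HMMFromMatrices(sigma, g.DiscreteDistribution(sigma), A, B, [0.5] * 2)

rolls = g.EmissionSequence(sigma, [1, 2, 1, 6, 6, 1, 2, 3, 1, 1])
path, logp = m.viterbi(rolls)
print(path)   # 0 = fair die, 1 = loaded die
print(logp)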
Example #9
def ghmm_from_discrete_hmm(hmm):
    hmm = deepcopy(hmm)
    domain = ghmm.Alphabet(range(hmm.alphabetSize))
    trans = hmm.transitionMatrix
    init = hmm.initialProbabilities
    emissions = [d.probabilities for d in hmm.emissionDistributions]
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.DiscreteDistribution(domain),
                                A=trans,
                                B=emissions,
                                pi=init)
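A usage sketch with a stand-in for the source object; the `DiscreteHMM`-like container below is hypothetical and only provides the four attributes the converter reads:

from collections import namedtuple

Dist = namedtuple('Dist', 'probabilities')
Source = namedtuple('Source', 'alphabetSize transitionMatrix '
                              'initialProbabilities emissionDistributions')

src = Source(alphabetSize=2,
             transitionMatrix=[[0.8, 0.2], [0.3, 0.7]],
             initialProbabilities=[0.6, 0.4],
             emissionDistributions=[Dist([0.9, 0.1]), Dist([0.2, 0.8])])
print(ghmm_from_discrete_hmm(src))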
Example #10
 def __init__(self, A, B, Pi, observations):
     if len(A) == len(Pi):
         self.states = range(len(A))
         self.sigma = ghmm.Alphabet(observations) # The "alphabet" comprising action indices
         self.initA = A
         self.initB = B
         self.initPi = Pi
         self.ghmmModel = ghmm.HMMFromMatrices(
             self.sigma, ghmm.DiscreteDistribution(self.sigma),
             self.initA, self.initB, self.initPi)
     else:
         prettyPrint("Unable to initialize model. Unequal number of states", "error")
         return
Example #11
    def finalize(self):
        cmodel = self.HMM.finalize()

        if (self.modeltype & ghmmwrapper.kContinuousHMM):
            return ghmm.ContinuousMixtureHMM(
                ghmm.Float(), ghmm.ContinuousMixtureDistribution(ghmm.Float()),
                cmodel)

        elif ((self.modeltype & ghmmwrapper.kDiscreteHMM)
              and not (self.modeltype & ghmmwrapper.kTransitionClasses)
              and not (self.modeltype & ghmmwrapper.kPairHMM)):
            emission_domain = ghmm.Alphabet([], cmodel.alphabet)
            if (self.modeltype & ghmmwrapper.kLabeledStates):
                labelDomain = ghmm.LabelDomain([], cmodel.label_alphabet)
                return ghmm.StateLabelHMM(
                    emission_domain,
                    ghmm.DiscreteDistribution(emission_domain), labelDomain,
                    cmodel)

            else:
                return ghmm.DiscreteEmissionHMM(
                    emission_domain,
                    ghmm.DiscreteDistribution(emission_domain), cmodel)
Example #12
 def trainHMM(hmmState):
     ''' Train HMM with the given chromosome. '''
     print >> sys.stderr, printTime(), "Train HMM with one chromosome."
     T = [[0.9, 0.1], [0.1, 0.9]]
     e1 = [0.1, 0.9]
     e0 = [0.9, 0.1]
     E = [e0, e1]
     pi = [0.9, 0.1]  # initial 10% are peak?
     sigma = ghmm.IntegerRange(0, 2)  # 0, 1
     m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), T, E,
                              pi)
     m.baumWelch(ghmm.EmissionSequence(sigma, hmmState))
     print >> sys.stderr, printTime(), "Train HMM finished."
     print >> sys.stderr
     return m
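Downstream, peak calls would come from decoding a new binary track with the trained model; a hedged sketch (toy data, and assuming `trainHMM` and its `printTime` helper are in scope):

import ghmm

m = trainHMM([0, 0, 1, 1, 1, 0, 0, 0, 1, 0])           # hypothetical track
sigma = ghmm.IntegerRange(0, 2)
path, logp = m.viterbi(ghmm.EmissionSequence(sigma, [0, 1, 1, 1, 0, 0]))
print(path)                                            # 1 marks putative peaks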
Example #13
    def __create_hmm_from_dict(self, dictionary, qsr_type, num_symbols):
        """Creates a hmm from the xml representation. Not nice to use tempfile
        but not otherwise possible due to hidden code and swig in ghmm.

        :param xml: The xml string

        :return: the ghmm hmm object
        """
        symbols = self.hmm_types_available[qsr_type]().generate_alphabet(
            num_symbols)
        hmm = gh.HMMFromMatrices(symbols, gh.DiscreteDistribution(symbols),
                                 dictionary[self.TRANS], dictionary[self.EMI],
                                 dictionary[self.START])

        return hmm
Example #14
def train_model(songs_data):
    """Input: list of data on several songs (could be a single song)
       Ouput: a list of models, one for each bar type. """
    note_models = {}
    notes = get_notes(songs_data)

    # This tells GHMM every possible value that it will be seeing
    note_alphabet = ghmm.Alphabet(list(set(notes)))
    note_alpha_len = len(note_alphabet)

    # Initialize the probabilities of transitioning from each state to each other
    # state. There is probably a better way to do this, but this is nice and simple.
    note_trans_prob = 1.0 / (note_alpha_len)
    trans = [[note_trans_prob for row in range(note_alpha_len)]
             for col in range(note_alpha_len)]

    # Initialize the probabilities of seeing each output from each state.
    # Again, there is probably a better way to do this, but this is simple.
    note_emiss_prob = 1.0 / (note_alpha_len)
    emiss = [[note_emiss_prob for row in range(note_alpha_len)]
             for col in range(note_alpha_len)]

    # Some grease to get GHMM to work
    pi = [1.0 / note_alpha_len] * note_alpha_len

    # The sequence of notes gathered from the music
    note_train_seq = ghmm.EmissionSequence(note_alphabet, notes)

    bars = BarLearner.get_bars(songs_data)
    for bar in set(bars):
        # Generate the model of the data
        note_models[bar] = ghmm.HMMFromMatrices(
            note_alphabet, ghmm.DiscreteDistribution(note_alphabet), trans,
            emiss, pi)

    for bar in bars:
        # Train the model based on the training sequence
        note_models[bar].baumWelch(note_train_seq)

    return (note_models, note_alphabet)
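A hedged usage sketch; `songs_data` is whatever structure `get_notes` and `BarLearner.get_bars` expect, and `sampleSingle` draws a fresh note sequence from one bar's model:

models, alphabet = train_model(songs_data)   # songs_data is hypothetical
bar = list(models)[0]                        # pick any bar type
sampled = models[bar].sampleSingle(16)       # draw 16 note symbols
print(map(alphabet.external, sampled))       # map back to note values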
Example #15
# %%
import ghmm

# %%
sigma = ghmm.IntegerRange(1, 7)

train_seq = ghmm.SequenceSet(
    sigma,
    [[1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1]])

A = [[0.99, 0.01], [0.99, 0.01]]

B = [[1.0 / 6] * 6] * 2

pi = [0.5] * 2

m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)

m.baumWelch(train_seq, 100000000, 0.000000000000001)

print(m.asMatrices())
# %%
print(map(sigma.external, m.sampleSingle(20)))
# %%
v = m.viterbi(train_seq)
print(v)

# %%
my_seq = ghmm.EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print(m.viterbi(my_seq))
Example #16
def extractHMMFeatures(sourceFiles):
    """ Extracts HMM-similarity features from all files in a given directory """
    allTraces = []  # List to store all traces for the HMM-similarity extraction
    try:
        for targetFile in sourceFiles:
            if os.path.exists(targetFile.replace(".c", ".seq")):
                instructionAlphaSequence = open(
                    targetFile.replace(".c", ".seq")).read()
                # Append a tuple of (trace, filename, cluster) for each data sample
                allTraces.append(
                    (instructionAlphaSequence,
                     targetFile.replace(".c", ".hmm"),
                     loadLabelFromFile(targetFile.replace(".c", ".metadata"))[0]))
        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature extraction. Skipping", "warning")
        else:
            allClusters = []
            # Retrieve list of clusters
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if not trace[2] in allClusters:
                    allClusters.append(trace[2])
            # Gather traces belonging to different clusters
            clusterTraces = []
            for cluster in allClusters:
                currentCluster = []
                for trace in allTraces:
                    if trace[2] == cluster:
                        currentCluster.append(trace[0])
                clusterTraces.append(currentCluster)
                prettyPrint("Retrieved %s instances for cluster %s" % (len(currentCluster), cluster))
            # Should wind up with list of lists each of which depict traces of a cluster
            allHMMs = []
            for cluster in allClusters:
                # Build HMM for each cluster and use it to calculate likelihoods for all instances
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)
                trainingSequences = clusterTraces[allClusters.index(cluster)]
                # Retrieve number of observations
                observations = []
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in observations:
                            observations.append(o)
                # Prepare matrices for HMM
                A = numpy.random.random((len(allClusters), len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters), len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)
                # Build HMM and train it using Baum-Welch algorithm
                clusterHMM = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(ghmm.SequenceSet(clusterHMM.emissionDomain, trainingSequences))
                # Add that to list of all HMM's
                allHMMs.append((clusterHMM, observations))
            # Finally, for every trace, calculate the feature vectors
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for hmm in allHMMs:
                    # Make sure the sequence contains only observations supported by the current HMM
                    sequence = []
                    for obs in trace[0]:
                        if obs in hmm[1]:
                            sequence.append(obs)
                    # Calculate the likelihood
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(hmm[1]), sequence)
                    featureVector.append(hmm[0].loglikelihood(sequence))
                # Write the feature vector once all HMMs have been scored
                featureFile = open(trace[1], "w")
                featureFile.write(str(featureVector))
                featureFile.close()
        #############################################################################

    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")
        return False

    return True
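Stripped of the file handling, the feature construction reduces to scoring each trace under every cluster HMM; a self-contained sketch with a toy alphabet and uniform models (all values illustrative):

import ghmm

sigma = ghmm.Alphabet(['a', 'b'])
uniform = [[0.5, 0.5], [0.5, 0.5]]
cluster_hmms = [ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma),
                                     uniform, uniform, [0.5, 0.5])
                for _ in range(2)]
trace = ghmm.EmissionSequence(sigma, ['a', 'b', 'a', 'a'])
feature_vector = [h.loglikelihood(trace) for h in cluster_hmms]
print(feature_vector)   # one log-likelihood per cluster HMM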
Example #17
                sigma = ghmm.IntegerRange(0, vocab_len)  # Emission range

                # Transition Matrix
                A = calculate_transition_probabilities(n_components)

                # Emission Probabilities
                B = calculate_emission_probabilities(train_data, n_components,
                                                     vocab_len)

                # Initial State Distribution
                pi = [
                    1.0 / n_components
                ] * n_components  # Equally distribute the starting probabilities

                m = ghmm.HMMFromMatrices(sigma,
                                         ghmm.DiscreteDistribution(sigma), A,
                                         B, pi)

                m.baumWelch(
                    ghmm.SequenceSet(sigma, train_data),
                    nrSteps=1000,
                    loglikelihoodCutoff=0.00005
                )  # Defaults: nrSteps=500, loglikelihoodCutoff=0.0001
                # print('Training Done')

                # print(m.asMatrices()[0])
                # print(m.asMatrices()[1])
                # print(m.asMatrices()[2])

                total_checked = 0
                total_correct = 0
Example #18
def create_detailed_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a detailed HMM initialized from MEME result

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME
    :type first_letters: dict of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    # Start probabilities: the first nucleotides of the sequences, with a
    # pseudocount of 1 added
    initials = [(first_letters['A'] + 1.) / (nb_seq + 4.),
                (first_letters['C'] + 1.) / (nb_seq + 4.),
                (first_letters['G'] + 1.) / (nb_seq + 4.),
                (first_letters['T'] + 1.) / (nb_seq + 4.),
                1. / nb_seq, 1. / nb_seq, 1. / nb_seq, 1. / nb_seq]
    initials += [0.] * 4 * (len(motif) - 1)

    # Emission probabilities
    emissions = [[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.],
                 [0., 0., 0., 1.]] * (len(motif) + 1)

    # Background transition probabilities
    if motif.instances:
        # The motif.counts is computed directly when creating the motif from
        # instances
        nb_hits = len(motif.instances)
    else:
        nb_hits = nb_seq
    background_to_background = 1. - float(nb_seq) / nb_residues
    background_to_foreground = 1. - background_to_background
    background_to_background /= 4.
    transi = {}
    for letter in "ACGT":
        freq = (motif.counts[letter][0] + 1.) / (nb_hits + 4.)
        transi[letter] = freq * background_to_foreground
    transitions = []
    for __ in xrange(4):
        transitions.append([background_to_background,
                            background_to_background,
                            background_to_background,
                            background_to_background,
                            transi['A'], transi['C'],
                            transi['G'], transi['T']]
                           + [0.] * 4 * (len(motif) - 1))
    pfm = [(v + 1.) / (nb_hits + 4.)
           for letter in 'ACGT' for v in motif.counts[letter]]
    for __ in xrange(len(motif) * 4):
        transitions.append([0.] * 4 * (len(motif) + 1))
    for position in xrange(1, len(motif)):
        for line in xrange(4 * (position - 1) + 1, 4 * (position - 1) + 5):
            for column in xrange(4 * position + 1, 4 * position + 5):
                index = (column - (4 * position + 1)) * len(motif) + position
                transitions[line + 3][column + 3] = pfm[index]
    for index in xrange(4):
        state = len(motif) * 4 + index
        transitions[state][0] = 0.25
        transitions[state][1] = 0.25
        transitions[state][2] = 0.25
        transitions[state][3] = 0.25
    return ghmm.HMMFromMatrices(ghmm.Alphabet(ALPHABET),
                                ghmm.DiscreteDistribution(
                                    ghmm.Alphabet(ALPHABET)),
                                transitions, emissions, initials)
Example #19
File: hmm.py Project: ncryer/hmm
def convert2ghmm(h):
    """Convert an HMM object to a GHMM object."""
    alphabets = ghmm.Alphabet("ACDEFGHIKLMNPQRSTVWY")
    g = ghmm.HMMFromMatrices(alphabets, ghmm.DiscreteDistribution(alphabets),
            h._t, h._e.T, h._i)
    return g
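A call sketch with a hypothetical stand-in exposing the `_t`, `_e`, and `_i` attributes the converter expects; note `_e` is stored residues-by-states here because the converter transposes it, and it assumes (as the converter itself does) that ghmm accepts array-likes:

import numpy as np

class HMMStub(object):
    _t = [[0.5, 0.5], [0.5, 0.5]]        # state transition matrix
    _e = np.full((20, 2), 0.05)          # 20 residues x 2 states; .T gives B
    _i = [0.5, 0.5]                      # initial state probabilities

g = convert2ghmm(HMMStub())
print(g)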
Example #20
start_trans = [0.5, 0.5] + [0] * 12
#end_trans = [0]*13 + [1]  doesn't match what we have for yahmm
end_trans = [0] * 14

transitions = [
    start_trans, m1_trans, i1_trans, d1_trans, m2_trans, i2_trans, d2_trans,
    m3_trans, i3_trans, d3_trans, m4_trans, i4_trans, d4_trans, end_trans
]

emissions = [
    start_em, m1_em, i1_em, d1_em, m2_em, i2_em, d2_em, m3_em, i3_em, d3_em,
    m4_em, i4_em, d4_em, end_em
]

ghmm_model = ghmm.HMMFromMatrices(alphabet,
                                  ghmm.DiscreteDistribution(alphabet),
                                  transitions, emissions, initial_probs)

print(ghmm_model)
#print("A sample:\n\n")
#print(ghmm_model.sampleSingle(30))

#yahmm:
yahmm_model = yahmm.Model(name="ProfileHMM")
s_d = yahmm.DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
m_dist1 = yahmm.DiscreteDistribution({
    'A': 0.8,
    'C': 0.1,
    'G': 0.05,
    'T': 0.05
})