import ghmm
import numpy as np
from scipy.stats import norm


def learn_likelihoods_progress(i, n, m, A, B, pi, F, X_train, nEmissionDim, g_mu, g_sig, nState):
    """For the i-th progress Gaussian (mean g_mu, std g_sig), compute the
    Gaussian-weighted state posterior and the weighted mean/std of the
    log-likelihood over growing observation windows.
    """
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    l_likelihood_mean = 0.0
    l_likelihood_mean2 = 0.0
    l_statePosterior = np.zeros(nState)

    for j in xrange(n):    

        g_post = np.zeros(nState)
        g_lhood = 0.0
        g_lhood2 = 0.0
        prop_sum = 0.0

        for k in xrange(1, m):
            final_ts_obj = ghmm.EmissionSequence(F, X_train[j][:k*nEmissionDim])
            logp = ml.loglikelihoods(final_ts_obj)[0]
            # print 'Log likelihood:', logp
            post = np.array(ml.posterior(final_ts_obj))

            k_prop = norm(loc=g_mu, scale=g_sig).pdf(k)
            g_post += post[k-1] * k_prop
            g_lhood += logp * k_prop
            g_lhood2 += logp * logp * k_prop

            prop_sum  += k_prop

        l_statePosterior += g_post / prop_sum / float(n)
        l_likelihood_mean += g_lhood / prop_sum / float(n)
        l_likelihood_mean2 += g_lhood2 / prop_sum / float(n)

    return i, l_statePosterior, l_likelihood_mean, np.sqrt(l_likelihood_mean2 - l_likelihood_mean**2)
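# The std returned above is recovered from the Gaussian-weighted first and
# second moments as sqrt(E[logp^2] - E[logp]^2). A minimal numpy-only check
# of that identity, with hypothetical values (not part of the original code):
w = np.array([0.2, 0.5, 0.3])        # normalized weights, i.e. k_prop/prop_sum
lp = np.array([-3.0, -2.5, -4.0])    # per-window log-likelihoods
mean, mean2 = np.sum(w * lp), np.sum(w * lp**2)
assert np.isclose(np.sqrt(mean2 - mean**2),
                  np.sqrt(np.sum(w * (lp - mean)**2)))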
    def set_hmm_object(self,
                       A,
                       B,
                       pi,
                       out_a_num=None,
                       vec_num=None,
                       mat_num=None,
                       u_denom=None):
        """Set HMM's hyper parameters
        """
        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                           A, B, pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                           A, B, pi)
        self.A = A
        self.B = B
        self.pi = pi

        try:
            self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num, u_denom)
        except:
            print "Install Daehyung's custom ghmm if you want partial fit functionalities."

        return self.ml
Example #3
    def __init__(self, preprocess_args, metric, graph_structure_type, A, B, pi,
                 win_len, thresh, min_peak_dist):
        """
        Args:
            preprocess_args:
            metric:
            graph_structure_type: "predefined", "fully", "left_to_right"
            A: initial hidden states graph
            B: initial hidden states distribution
            pi: initial hidden states probabilities
            win_len: window length used by the offline sliding window
            thresh: peak-detection threshold; only peaks greater than
                    thresh are detected
            min_peak_dist: minimum required separation between detected
                           peaks

        """

        self.preprocess_args = preprocess_args
        self.metric = metric

        self.graph_structure_type = graph_structure_type
        self.A = A
        self.B = B
        self.pi = pi
        self.win_len = win_len
        self.thresh = thresh
        self.min_peak_dist = min_peak_dist

        self.emission_domain = ghmm.Float()
        self.emission_distr = ghmm.GaussianDistribution(self.emission_domain)
Example #4
def newModel(states, randomize = True, startAtFirstState = False, \
            feedForward = True):
    """newModel(states, obs, sigma)
    Make a new random model.
    """
    pi = [1.0 / states] * states

    if startAtFirstState:
        pi = [0] * states
        pi[0] = 1

    aMat = numpy.zeros((states, states), float)
    bMat = numpy.zeros((states, 2), float)

    if randomize:
        for i in range(states):
            for j in range(states):
                aMat[i][j] = random.random()
                if feedForward and (j != i + 1):
                    aMat[i][j] = 0
                if feedForward and (j == i + 1):
                    aMat[i][j] = 1

            for j in range(2):
                bMat[i][j] = random.random()

    # add a small constant so no transition/emission entry is exactly zero
    aMat += 0.01
    bMat += 0.01

    m = ghmm.HMMFromMatrices(ghmm.Float(), \
                                ghmm.GaussianDistribution(ghmm.Float()), \
                                aMat, bMat, pi)
    return m
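# A hedged usage sketch for newModel, assuming the standard GHMM python
# bindings (sampleSingle returns an EmissionSequence):
m = newModel(5, randomize=True, startAtFirstState=True, feedForward=True)
m.normalize()                        # normalize rows, as obsToModel below does
seq = m.sampleSingle(20)             # draw one length-20 observation sequence
print 'sample log-likelihood:', m.loglikelihood(seq)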
    def create_model(self, flag, number_states):

        A, B, pi = self.calculate_A_B_pi(number_states, flag)

        # generate models from parameters
        model = ghmm.HMMFromMatrices(self.F,ghmm.GaussianDistribution(self.F), A, B, pi)
        #model = ghmm.HMMFromMatrices(F,ghmm.MultivariateGaussianDistribution(F), A, B, pi)
        return model
    def fit(self, X_train, A=None, B=None, pi=None, B_dict=None, verbose=False):

        if A is None:        
            if verbose: print "Generate new A matrix"                
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

        if B is None:
            if verbose: print "Generate new B matrix"                                            
            # We should think about multivariate Gaussian pdf.        
            self.mu, self.sig = self.vectors_to_mean_sigma(X_train, self.nState)

            # Emission probability matrix
            B = np.hstack([self.mu, self.sig]).tolist() # Must be [i,:] = [mu, sig]
                
        if pi is None:            
            # pi - initial probabilities per state 
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.] * self.nState
            pi[0] = 1.0

        # HMM model object
        self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), A, B, pi)
        
        ## print "Run Baum Welch method with (samples, length)", X_train.shape
        train_seq = X_train.tolist()
        final_seq = ghmm.SequenceSet(self.F, train_seq)        
        self.ml.baumWelch(final_seq, 10000)

        [self.A,self.B,self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        ## self.mean_path_plot(mu[:,0], sigma[:,0])        
        ## print "Completed to fitting", np.array(final_seq).shape
        
        # state range
        self.state_range = np.arange(0, self.nState, 1)

        # Pre-computation for PHMM variables
        self.mu_z   = np.zeros((self.nState))
        self.mu_z2  = np.zeros((self.nState))
        self.mu_z3  = np.zeros((self.nState))
        self.var_z  = np.zeros((self.nState))
        self.sig_z3 = np.zeros((self.nState))
        for i in xrange(self.nState):
            # mean and variance of the successor-state index z under the
            # transition row A[i,:]: E[z] = sum_j A[i,j]*j,
            # Var[z] = sum_j A[i,j]*j^2 - E[z]^2
            zp             = self.A[i,:]*self.state_range
            self.mu_z[i]   = np.sum(zp)
            self.mu_z2[i]  = self.mu_z[i]**2
            #self.mu_z3[i]  = self.mu_z[i]**3
            self.var_z[i]  = np.sum(zp*self.state_range) - self.mu_z[i]**2
def computeLikelihood(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=1, \
                      bPosterior=False, converted_X=False, cov_type='full'):
    '''
    This function is deprecated; please use computeLikelihoods instead.
    '''

    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F),
                                  A, B, pi)
        if cov_type.find('diag') >= 0:  # covers 'diag' and its variants
            ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    if converted_X is False:
        X_test = util.convert_sequence(X, emission=False)
        X_test = np.squeeze(X_test)
        X_test = X_test.tolist()
    else:
        X_test = X

    l_idx = []
    l_likelihood = []
    l_posterior = []

    for i in xrange(startIdx, len(X_test) / nEmissionDim):
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim])

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"

            l_idx.append(i)
            l_likelihood.append(-100000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else: l_posterior.append(l_posterior[-1])
            ## return False, False # anomaly
            continue

        l_idx.append(i)
        l_likelihood.append(logp)
        if bPosterior: l_posterior.append(post[i - 1])

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
Example #8
from copy import deepcopy

def ghmm_from_gaussian_hmm(hmm):
    # work on a copy so the source model is not mutated
    hmm = deepcopy(hmm)
    domain = ghmm.Float()
    trans = hmm.transitionMatrix.tolist()
    init = hmm.initialProbabilities.tolist()
    emissions = [map(float, [d.mean, d.variance]) for d in hmm.emissionDistributions]
    # print init
    # print trans
    # print emissions
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.GaussianDistribution(domain),
                                A=trans,
                                B=emissions,
                                pi=init)
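# Hedged usage sketch: any object exposing numpy-array transitionMatrix and
# initialProbabilities plus emissionDistributions with .mean/.variance can be
# converted. The stand-in classes below are hypothetical, only illustrating
# the expected interface (numpy assumed imported, as in the other examples).
class _Dist(object):
    def __init__(self, mean, variance):
        self.mean, self.variance = mean, variance

class _GaussianHmm(object):
    def __init__(self):
        self.transitionMatrix = numpy.array([[0.9, 0.1], [0.0, 1.0]])
        self.initialProbabilities = numpy.array([1.0, 0.0])
        self.emissionDistributions = [_Dist(0.0, 1.0), _Dist(5.0, 2.0)]

converted = ghmm_from_gaussian_hmm(_GaussianHmm())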
    def reset(self):
        """Reset the HMM object
        """
        [A, B, pi] = self.ml.asMatrices()

        if self.nEmissionDim == 1:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                           A, B, pi)
        else:
            self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                           A, B, pi)
        self.A = A
        self.B = B
        self.pi = pi
    def predict_from_single_seq(self, x, ref_num):
        '''
        Input
        @ x: observed scalar sequence of the reference dimension ref_num
             (the known steps so far)
        Output
        @ x_pred: predicted observation vector of length nEmissionDim
        '''

        # new 1-D emission matrix for the partial sequence: mean of dim
        # ref_num and its (ref_num, ref_num) variance taken from the
        # flattened covariance
        B = []
        for i in xrange(self.nState):
            B.append([
                self.B[i][0][ref_num],
                self.B[i][1][ref_num * self.nEmissionDim + ref_num]
            ])

        ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                  self.A, B, self.pi)

        if type(x) is not list: x = x.tolist()
        final_ts_obj = ghmm.EmissionSequence(self.F, x)

        try:
            (alpha, scale) = ml.forward(final_ts_obj)
        except:
            print "No alpha is available !!"
            sys.exit()

        x_pred = []
        for i in xrange(self.nEmissionDim):
            if i == ref_num:
                x_pred.append(x[-1])
            else:
                # indices into the flattened covariance: the (ref,ref)
                # variance and the (ref,i) cross-covariance
                src_cov_idx = ref_num * self.nEmissionDim + ref_num
                tgt_cov_idx = ref_num * self.nEmissionDim + i

                # conditional Gaussian mean of dim i given the observed
                # reference value, mixed over states by the forward weights:
                # m_j = mu_i + Sigma_(i,ref)/Sigma_(ref,ref) * (x - mu_ref)
                t_o = 0.0
                for j in xrange(self.nState):
                    m_j = self.B[j][0][i] + \
                      self.B[j][1][tgt_cov_idx]/self.B[j][1][src_cov_idx]*\
                      (x[-1]-self.B[j][0][ref_num])
                    t_o += alpha[-1][j] * m_j
                x_pred.append(t_o)

        return x_pred
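    # Hedged usage sketch (hypothetical values): given the first three
    # observed steps of reference dimension 0, predict the full observation
    # vector at the latest step.
    #
    #   x_pred = self.predict_from_single_seq([0.10, 0.22, 0.35], ref_num=0)
    #   # x_pred[0] == 0.35 (observed); other dims are conditional means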
def computeLikelihoods(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=2, \
                       bPosterior=False, converted_X=False, cov_type='full'):
    '''
    Input:
    - X: dimension x length
    Output:
    - idx, window-end indices, log-likelihoods (and posteriors if bPosterior)
    '''

    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F),
                                  A, B, pi)
        if cov_type == 'diag': ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    X_test = util.convert_sequence(X, emission=False)
    X_test = np.squeeze(X_test)

    l_idx = []
    l_likelihood = []
    l_posterior = []

    for i in xrange(startIdx, len(X[0])):
        final_ts_obj = ghmm.EmissionSequence(
            F, X_test[:i * nEmissionDim].tolist())

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))
            l_likelihood.append(logp)
            if bPosterior: l_posterior.append(post[i - 1])
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            ## return False, False # anomaly
            ## continue
            # we keep the state as the previous one
            l_likelihood.append(-1000000000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else: l_posterior.append(l_posterior[-1])

        l_idx.append(i)

    if bPosterior: return idx, l_idx, l_likelihood, l_posterior
    else: return idx, l_idx, l_likelihood
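# Hedged usage sketch (hypothetical trained parameters): score one test
# sequence with growing windows and inspect the likelihood profile.
#
#   A, B, pi = ml.asMatrices()                 # from a trained ghmm model
#   idx, l_idx, l_ll = computeLikelihoods(0, A, B, pi, ghmm.Float(), X,
#                                         nEmissionDim=2, nState=20)
#   # l_ll gives log P(x_1..x_i | lambda) for each window end i in l_idx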
def computeLikelihood(F, k, data, g_mu, g_sig, nEmissionDim, A, B, pi):
    if nEmissionDim >= 2:
        hmm_ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
    else:
        hmm_ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    final_ts_obj = ghmm.EmissionSequence(F, data)
    logp = hmm_ml.loglikelihoods(final_ts_obj)[0]
    post = np.array(hmm_ml.posterior(final_ts_obj))

    # a single term of the progress-Gaussian sums that
    # learn_likelihoods_progress accumulates over k
    k_prop = norm(loc=g_mu, scale=g_sig).pdf(k)
    g_post = post[k-1] * k_prop
    g_lhood = logp * k_prop
    g_lhood2 = logp * logp * k_prop
    prop_sum = k_prop

    # print np.shape(g_post), np.shape(g_lhood), np.shape(g_lhood2), np.shape(prop_sum)

    return g_post, g_lhood, g_lhood2, prop_sum
    def conditional_prob2(self, x):
        '''
        Input
        @ x: dim x length
        Output
        @ A list of per-feature log-likelihoods log P(x_i | lambda_i)

        Only a single sample is supported.
        '''
        from scipy.stats import norm, entropy

        # feature-wise conditional probability
        cond_prob = []
        for i in xrange(self.nEmissionDim):  # per feature

            # per-feature marginal model: mean of dim i and its (i, i)
            # variance taken from the flattened covariance
            B = [0] * self.nState
            for j in xrange(self.nState):
                B[j] = [
                    self.B[j][0][i], self.B[j][1][i * self.nEmissionDim + i]
                ]

            ml_src = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                          self.A, B, self.pi)

            X_test = util.convert_sequence2(x[[i]], emission=False)
            X_test = np.squeeze(X_test)
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
            logp = ml_src.loglikelihood(final_ts_obj)

            cond_prob.append(logp)

        ## # all
        ## X_test = util.convert_sequence2(x, emission=False)
        ## X_test = np.squeeze(X_test)
        ## final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
        ## cond_prob.append( self.ml.loglikelihood(final_ts_obj) )

        # note: no min-max normalization is actually applied here
        cond_prob = np.array(cond_prob)

        return cond_prob
Example #14
def _randomModels(k, states):
    """Make a set of k random models.  These models are untrained with 
    initial random values for all model matricies.
    """
    f = ghmm.Float()
    pi = [0.1] * states  # note: sums to 0.1 * states; not explicitly normalized

    aMat = numpy.zeros((states, states), float)
    bMat = numpy.zeros((states, 2), float)
    #TODO Change above for multivariate Gaussians

    models = []

    for n in range(k):
        for i in range(states):
            for j in range(states):
                aMat[i][j] = random.random()
            for j in range(2):
                bMat[i][j] = random.random()
        m = ghmm.HMMFromMatrices(f, ghmm.GaussianDistribution(f), \
                                aMat, bMat, pi)
        models.append(m)

    return models
Example #15
def obsToModel(observation, std=0.1):
    """Makes a model from a single observation vector.
    """

    aMat = numpy.zeros((len(observation), len(observation)), float)
    bMat = numpy.zeros((len(observation), 2), float)
    pi = [0.05] * len(observation)
    pi[0] = 1.0

    for i in range(len(observation)):
        bMat[i][0] = observation[i]
        bMat[i][1] = std

        for j in range(len(observation)):
            aMat[i][j] = random.random() * 0.3
            if j == i + 1:
                aMat[i][j] = 0.9

    m = ghmm.HMMFromMatrices(ghmm.Float(), \
                                ghmm.GaussianDistribution(ghmm.Float()), \
                                aMat, bMat, pi)
    m.normalize()

    return m
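# A hedged usage sketch for obsToModel: build a left-to-right model around
# one template trace and score a noisy copy of it (ghmm assumed imported, as
# in the other examples).
template = [0.0, 1.0, 2.0, 3.0]
m = obsToModel(template, std=0.5)
noisy = ghmm.EmissionSequence(ghmm.Float(), [0.1, 1.1, 1.9, 3.2])
print 'log-likelihood of noisy copy:', m.loglikelihood(noisy)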
Example #16
import ghmm
import numpy
import random

numModels = 1
states = 5
obs = 2
f = ghmm.Float()
pi = [0.1] * states

aMat = numpy.zeros((states, states), float)
bMat = numpy.zeros((states, obs), float)

for i in range(states):
    for j in range(states):
        aMat[i][j] = random.random()
    for j in range(obs):
        bMat[i][j] = random.random()
bMat[0][0] = 5
bMat[0][1] = 3
model = ghmm.HMMFromMatrices(f, ghmm.GaussianDistribution(f), aMat, bMat, pi)
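# A hedged continuation sketch: train the random model on synthetic data with
# Baum-Welch, mirroring the calls used in the other examples here.
model.normalize()   # normalize the random rows, as obsToModel does above
data = [[random.random() for _ in range(20)] for _ in range(5)]
train = ghmm.SequenceSet(f, data)
model.baumWelch(train, 10000, 0.01)
print model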



            
Example #17
def _kMeans(data, k, states, iterations = 20, stopThreshold = 0.01, \
            rOutliers = True, printBest = True, verbose = True, \
            iType = "kmeans++"):

    bestScore = -100
    bestModels = None
    bestData = None
    oldScore = -100
    models = []

    tdata = _randomAssign(data, k)
    if iType == "random":
        models = _randomModels(k, states)
        models = _trainModels(tdata, models)
    if iType == "kmeans++":
        models = _initializeGoodModels(data, k, states)
    tdata = _optimalAssign(tdata, models)
    outliers = []

    for i in range(iterations):
        models = _trainModels(tdata, models)
        score = _fitness(tdata, models)
        if verbose:
            print "  " + str(i) + ":  " + str(score)

        if (score > bestScore) or (bestScore == -100):
            bestScore = score
            bestModels = list(ghmm.HMMFromMatrices(ghmm.Float(), \
                                ghmm.GaussianDistribution(ghmm.Float()), \
                                m.asMatrices()[0], \
                                m.asMatrices()[1], \
                                m.asMatrices()[2]) for m in models)
            bestData = list(list(v) for v in tdata)
            bestOutliers = list(outliers)

        if (oldScore == -100) or (score - oldScore) > stopThreshold:
            tdata = _optimalAssign(tdata, models)
            oldScore = score

            if rOutliers:
                _removeOutliers(models, tdata)
        else:
            if verbose:
                print "Resetting all"
            tdata = _randomAssign(data, k)
            if iType == "random":
                models = _randomModels(k, states)
                models = _trainModels(tdata, models)
            if iType == "kmeans++":
                models = _initializeGoodModels(data, k, states)
            tdata = _optimalAssign(tdata, models)

            oldScore = -100

    if printBest or verbose:
        print "Average inter-cluster distance:" + str(bestScore)

    if rOutliers:
        if verbose:
            print "Number outliers found:" + str(len(bestOutliers))

        #For the best set of models and train data try to include any outliers
        #again.  Then return the models, data and outliers.
        bestData, bestOutliers = _includeOutliers(bestModels, bestData,
                                                  bestOutliers)
        bestModels = _trainModels(bestData, bestModels)
        bestData = _optimalAssign(bestData, bestModels)
        score = _fitness(bestData, bestModels)

        if printBest or verbose:
            print "Score with additional outliers:" + str(score)
        if verbose:
            print "New number of outliers:" + str(len(bestOutliers))

    import pybb.model.hmm
    bm = []
    for m in bestModels:
        bm.append(pybb.model.hmm.Hmm(m))

    return bm, bestData, bestOutliers
    def fit(self, xData1, A=None, B=None, pi=None, cov_mult=[1.0]*1, verbose=False, \
            ml_pkl='ml_temp_1d.pkl', use_pkl=False):
        ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
        X1 = np.array(xData1)

        if A is None:
            if verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

            # print 'A', A

        if B is None:
            if verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.

            mu, sig = self.vectors_to_mean_sigma(X1, self.nState)
            B = np.vstack([mu, sig * cov_mult[0]
                           ]).T.tolist()  # Must be [i,:] = [mu, sig]

        if pi is None:
            # pi - initial probabilities per state
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # print 'Generating HMM'
        # HMM model object
        self.ml = ghmm.HMMFromMatrices(self.F,
                                       ghmm.GaussianDistribution(self.F), A, B,
                                       pi)
        X_train = X1.tolist()

        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000)
        print 'Baum Welch return:', ret
        if np.isnan(ret): return 'Failure'

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        #--------------- learning for anomaly detection ----------------------------
        [A, B, pi] = self.ml.asMatrices()
        n, m = np.shape(X1)
        self.nGaussian = self.nState

        # Get average loglikelihood threshold wrt progress
        self.std_coff = 1.0
        g_mu_list = np.linspace(0, m - 1,
                                self.nGaussian)  #, dtype=np.dtype(np.int16))
        g_sig = float(m) / float(self.nGaussian) * self.std_coff

        # print 'g_mu_list:', g_mu_list
        # print 'g_sig:', g_sig

        ######################################################################################
        if os.path.isfile(ml_pkl) and use_pkl:
            with open(ml_pkl, 'rb') as f:
                d = pickle.load(f)
                self.l_statePosterior = d[
                    'state_post']  # time x state division
                self.ll_mu = d['ll_mu']
                self.ll_std = d['ll_std']
        else:
            if self.cluster_type == 'time':
                print 'Beginning parallel job'
                r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)(
                    i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                    g_mu_list[i], g_sig, self.nState)
                                        for i in xrange(self.nGaussian))
                # r = [self.learn_likelihoods_progress_par(i, n, m, A, B, pi, X_train, g_mu_list[i], g_sig) for i in xrange(self.nGaussian)]
                print 'Completed parallel job'
                l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)
            elif self.cluster_type == 'state':
                self.km = None
                self.ll_mu = None
                self.ll_std = None
                self.ll_mu, self.ll_std = self.state_clustering(X1)
                path_mat = np.zeros((self.nState, m * n))
                likelihood_mat = np.zeros((1, m * n))
                self.l_statePosterior = None

            d = dict()
            d['state_post'] = self.l_statePosterior
            d['ll_mu'] = self.ll_mu
            d['ll_std'] = self.ll_std
            ut.save_pickle(d, ml_pkl)
    def fit(self, xData, A=None, B=None, pi=None, cov_mult=None,
            ml_pkl=None, use_pkl=False, cov_type='full', fixed_trans=0,\
            shuffle=False):
        '''
        Input :
        - xData: dimension x sample x length
        Issues:
        - If NaN is returned, the cause is usually one of the following:
        -- the covariance is too low
        -- the range of xData is too small (you have to scale it up)
        '''

        # Daehyung: What is the shape and type of input data?
        if shuffle:
            X = xData
            X = np.swapaxes(X, 0, 1)
            id_list = range(len(X))
            random.shuffle(id_list)
            X = np.array(X)[id_list]
            X = np.swapaxes(X, 0, 1)
        else:
            X = [np.array(data) for data in xData]
        nData = len(xData[0])

        param_dict = {}

        # Load pre-trained HMM without training
        if use_pkl and ml_pkl is not None and os.path.isfile(ml_pkl):
            if self.verbose: print "Load HMM parameters without train the hmm"

            param_dict = ut.load_pickle(ml_pkl)
            self.A = param_dict['A']
            self.B = param_dict['B']
            self.pi = param_dict['pi']
            if self.nEmissionDim == 1:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                               self.A, self.B, self.pi)
            else:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                               self.A, self.B, self.pi)

            out_a_num = param_dict.get('out_a_num', None)
            vec_num = param_dict.get('vec_num', None)
            mat_num = param_dict.get('mat_num', None)
            u_denom = param_dict.get('u_denom', None)
            if out_a_num is not None:
                self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num,
                                           u_denom)

            return True
        else:

            if ml_pkl is None:
                ml_pkl = os.path.join(os.path.dirname(__file__),
                                      'ml_temp_n.pkl')

            if cov_mult is None:
                cov_mult = [1.0] * (self.nEmissionDim**2)

            if A is None:
                if self.verbose: print "Generating a new A matrix"
                # Transition probability matrix (Initial transition probability, TODO?)
                A = util.init_trans_mat(self.nState).tolist()

            if B is None:
                if self.verbose: print "Generating a new B matrix"
                # We should think about multivariate Gaussian pdf.

                mus, cov = util.vectors_to_mean_cov(X,
                                                    self.nState,
                                                    self.nEmissionDim,
                                                    cov_type=cov_type)
                ## print np.shape(mus), np.shape(cov)

                # cov: state x dim x dim
                for i in xrange(self.nEmissionDim):
                    for j in xrange(self.nEmissionDim):
                        cov[:, i, j] *= cov_mult[self.nEmissionDim * i + j]

                if self.verbose:
                    for i, mu in enumerate(mus):
                        print 'mu%i' % i, mu
                    ## print 'cov', cov

                # Emission probability matrix
                B = [0] * self.nState
                for i in range(self.nState):
                    if self.nEmissionDim > 1:
                        B[i] = [[mu[i] for mu in mus]]
                        B[i].append(cov[i].flatten().tolist())
                    else:
                        B[i] = [np.squeeze(mus[0][i]), float(cov[i])]
            if pi is None:
                # pi - initial probabilities per state
                ## pi = [1.0/float(self.nState)] * self.nState
                pi = [0.0] * self.nState
                pi[0] = 1.0

            # print 'Generating HMM'
            # HMM model object
            if self.nEmissionDim == 1:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                               A, B, pi)
            else:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                               A, B, pi)
            if cov_type == 'diag': self.ml.setDiagonalCovariance(1)

            # print 'Creating Training Data'
            X_train = util.convert_sequence(X)  # Training input
            X_train = X_train.tolist()
            if self.verbose: print "training data size: ", np.shape(X_train)

            if self.verbose:
                print 'Run Baum Welch method with (samples, length)', np.shape(
                    X_train)
            final_seq = ghmm.SequenceSet(self.F, X_train)
            ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
            ret = self.ml.baumWelch(final_seq,
                                    10000)  #, fixedTrans=fixed_trans)
            if np.isnan(ret):
                print 'Baum Welch return:', ret
                return 'Failure'
            print 'Baum Welch return:', ret / float(nData)

            [self.A, self.B, self.pi] = self.ml.asMatrices()
            self.A = np.array(self.A)
            self.B = np.array(self.B)

            param_dict['A'] = self.A
            param_dict['B'] = self.B
            param_dict['pi'] = self.pi

            try:
                [out_a_num, vec_num, mat_num,
                 u_denom] = self.ml.getBaumWelchParams()
                param_dict['out_a_num'] = out_a_num
                param_dict['vec_num'] = vec_num
                param_dict['mat_num'] = mat_num
                param_dict['u_denom'] = u_denom
            except:
                print "Install new ghmm!!"

            if ml_pkl is not None: ut.save_pickle(param_dict, ml_pkl)
            return ret / float(nData)
Example #20
# state 4: definitive-non-T state

if params['mixed_model']:
    Emissionmatrix = [[[params['bound'] * 100.0, 0.0], [1.0, 1.0], [1.0, 0.0]],
                      [[1.5, -1.0], [1.5, 1.5], [0.95, 0.05]],
                      [[1.5, -1.0], [1.5, 1.5], [0.75, 0.25]],
                      [[1.5, -1.0], [1.5, 1.5], [0.5, 0.5]],
                      [[1.5, -1.0], [1.5, 1.5], [0.25, 0.75]]]
    # [p1_mean, p2_mean], [p1_std, p2_std], [P(p1), P(p2)]
    model = ghmm.HMMFromMatrices(F, ghmm.GaussianMixtureDistribution(F),
                                 Transitionmatrix, Emissionmatrix, pi)
else:
    Emissionmatrix = [[params['bound'] * 100.0, 1.0], [2.0, 0.5], [1.0, 0.5],
                      [-1.0, 0.5], [-2.0, 0.5]]
    # [mean, std]
    model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F),
                                 Transitionmatrix, Emissionmatrix, pi)

print('Model before training:')
print(model)
mghmm_train = ghmm.SequenceSet(F, train_set)
model.baumWelch(mghmm_train, 10000, 0.01)
print('Model after training:')
print(model)
model.write(out_hmm)

###------------------------------------------------

###------------------------------------------------
# calculate tail length using the mghmm model and write them to output files
# dict_tl structure: {gene_name : [list of tail lengths]}