def loglikelihoods_from_seqs(self,
                                 X,
                                 bPosterior=False,
                                 bIdx=False,
                                 startIdx=1):
        '''
        X: sample x (length*dim)
        return: the likelihoods over time, one list per sample
        '''
        ll_likelihoods = []
        ll_posteriors = []
        for i in xrange(len(X)):
            l_likelihood = []
            l_posterior = []

            for j in xrange(startIdx, len(X[i]) / self.nEmissionDim):

                # X[i][:k] works for both ndarray rows and plain lists
                seq = X[i][:j * self.nEmissionDim]
                if isinstance(seq, np.ndarray): seq = seq.tolist()
                else: seq = list(seq)
                try:
                    final_ts_obj = ghmm.EmissionSequence(self.F, seq)
                except:
                    print "failed to make sequence"
                    continue

                try:
                    logp = self.ml.loglikelihood(final_ts_obj)
                    if bPosterior:
                        post = np.array(self.ml.posterior(final_ts_obj))
                except:
                    print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
                    ## return False, False # anomaly
                    continue

                l_likelihood.append(logp)
                if bPosterior: l_posterior.append(post[j - 1])

            ll_likelihoods.append(l_likelihood)
            if bPosterior: ll_posteriors.append(l_posterior)

        if bIdx:
            # the time indices evaluated for each sample (same range as above)
            ll_idx = [range(startIdx, len(X[ii]) / self.nEmissionDim)
                      for ii in xrange(len(X))]

            if bPosterior: return ll_likelihoods, ll_posteriors, ll_idx
            else: return ll_likelihoods, ll_idx
        else:
            if bPosterior: return ll_likelihoods, ll_posteriors
            else: return ll_likelihoods
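
A minimal usage sketch of the method above; everything named here is assumed for illustration: detector stands for an instance of the surrounding class with a trained model in self.ml, an emission domain in self.F, and nEmissionDim == 2, and X follows the sample x (length*dim) layout from the docstring.

import numpy as np

X = np.random.rand(5, 2 * 100)   # 5 samples, 2-dim emissions, 100 time steps
ll, post, idx = detector.loglikelihoods_from_seqs(X, bPosterior=True,
                                                  bIdx=True, startIdx=5)
print "likelihood curve of first sample:", ll[0]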
Example No. 2
def _includeOutliers(models, trainData, outliers):

    means = []
    stds = []

    for i in range(len(models)):

        mean = 0
        variance = 0

        #Calculate model mean
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            a = abs(models[i].loglikelihood(eSeq))
            #print a
            mean += a

        mean /= (len(trainData[i]) * 1.0)

        means.append(mean)

        #Calculate the model variance
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            variance += (mean - v)**2

        variance /= (len(trainData[i]) * 1.0)
        std = variance**0.5

        stds.append(std)

    #For each data element in outliers, check for the model that it most
    #fits.  If the outlier fits the model within one standard deviation
    #include it back into the data.
    for tmp in outliers[:]:  # iterate over a copy; we remove from outliers below
        eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
        best = -1
        bestModel = -1
        for j in range(len(models)):
            val = abs(models[j].loglikelihood(eSeq))

            if val < best or best == -1:
                best = val
                bestModel = j

        #Determine if the best fit is "good" enough.
        #If it is, add the outlier back into the model
        if (best - means[bestModel]) < 1 * (stds[bestModel]):
            trainData[bestModel].append(tmp)
            outliers.remove(tmp)

    return trainData, outliers
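
The reinclusion rule above keeps an outlier when its score falls within one standard deviation of the best model's mean. A toy numeric check with hypothetical values:

means = [120.0, 300.0]
stds = [15.0, 40.0]
best, bestModel = 130.0, 0   # outlier's |loglikelihood| under model 0
if (best - means[bestModel]) < 1 * stds[bestModel]:
    print "reinclude into model", bestModel   # 130 - 120 = 10 < 15, so reinclude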
Example No. 3
    def hidden_state_change_lik(self, ts):
        """
        Note: compares the distributions of different states. If they are too
        similar, they are merged during the probability computation.
        """

        obs_seq = self.get_obs_seqs(ts.y)
        emission_seq = ghmm.EmissionSequence(self.emission_domain, obs_seq)

        # scale is the per-timestep scaling vector from ghmm's scaled forward pass
        forward, scale = self.model.forward(emission_seq)
        backward = self.model.backward(emission_seq, scale)
        n = len(self.A)

        ts_hidden_state_change_lik = time_series.dist_ts(ts)
        for t in xrange(len(ts.y) - 1):
            lik = 0.0
            for i in xrange(n):
                for j in xrange(n):
                    if self.states_are_diff(i, j):
                        lik += self.xi(n, forward, backward, ts.y[t + 1], i, j,
                                       t)
            ts_hidden_state_change_lik.x.append(ts.x[t])
            ts_hidden_state_change_lik.y.append(lik)

        return ts_hidden_state_change_lik
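
The self.xi call above is presumably the standard forward-backward transition posterior. A sketch of that textbook quantity for a discrete HMM, assuming a row-stochastic transition matrix A, an emission matrix B indexed by symbol, and unscaled forward/backward arrays (ghmm's scaled arrays would need the scaling vector folded back in):

def xi(A, B, forward, backward, obs_next, i, j, t):
    # P(q_t = i, q_{t+1} = j | O): forward mass ending in state i at time t,
    # times the transition i -> j, times the emission of o_{t+1} from j,
    # times the backward mass from state j at t+1, normalized over all pairs
    n = len(A)
    num = forward[t][i] * A[i][j] * B[j][obs_next] * backward[t + 1][j]
    den = sum(forward[t][k] * A[k][l] * B[l][obs_next] * backward[t + 1][l]
              for k in xrange(n) for l in xrange(n))
    return num / den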
    def predict2(self, X, x1, x2, x3, x4):
        X = np.squeeze(X)
        X_test = X.tolist()        
        n = len(X_test)

        mu_l = np.zeros(self.nEmissionDim)
        cov_l = np.zeros(self.nEmissionDim**2)

        final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1, x2, x3, x4])

        try:
            (alpha, scale) = self.ml.forward(final_ts_obj)
        except:
            if self.verbose: print "No alpha is available !!"
            sys.exit()

        alpha = np.array(alpha)

        for j in xrange(self.nState):
            
            [[mu1, mu2, mu3, mu4], [cov11, cov12, cov13, cov14, cov21, cov22, cov23, cov24,
                                    cov31, cov32, cov33, cov34, cov41, cov42, cov43, cov44]] = self.B[j]

            mu_l[0] = x1
            mu_l[1] += alpha[n/self.nEmissionDim, j] * (mu2 + cov21/cov11*(x1 - mu1) )
            mu_l[2] += alpha[n/self.nEmissionDim, j] * (mu3 + cov31/cov21*(x2 - mu2)) # TODO Where does this come from?
            mu_l[3] += alpha[n/self.nEmissionDim, j] * (mu4 + cov41/cov31*(x3 - mu3)) # TODO Where does this come from?
            ## cov_l[0] += (cov11)*(total**2)
            ## cov_l[1] += (cov12)*(total**2)
            ## cov_l[2] += (cov21)*(total**2)
            ## cov_l[3] += (cov22)*(total**2)

        return mu_l, cov_l
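
The per-state mean updates above apply the bivariate Gaussian conditional mean E[x2 | x1] = mu2 + (cov21 / cov11) * (x1 - mu1), weighted by the forward probability of each state (the TODO'd lines chain the formula pairwise instead of conditioning on the full joint). A standalone numeric check of the bivariate case:

import numpy as np

mu = np.array([0.0, 1.0])
cov = np.array([[2.0, 0.6],
                [0.6, 1.0]])
x1 = 0.5

# conditional mean of the second coordinate given the first
cond_mu2 = mu[1] + cov[1, 0] / cov[0, 0] * (x1 - mu[0])
print "E[x2 | x1=%.1f] = %.2f" % (x1, cond_mu2)   # 1.0 + 0.3 * 0.5 = 1.15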
    def allLikelihoods(self, X1, X2, X3, X4):
        # n, m = np.shape(X1)
        X_test = self.convert_sequence(X1, X2, X3, X4, emission=False)
        # i = m - 1

        m = len(np.squeeze(X1))

        ll_likelihood = np.zeros(m)
        ll_state_idx  = np.zeros(m)
        ll_likelihood_mu  = np.zeros(m)
        ll_likelihood_std = np.zeros(m)
        for i in xrange(1, m):
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0,:i*self.nEmissionDim].tolist())
            logp = self.ml.loglikelihood(final_ts_obj)
            post = np.array(self.ml.posterior(final_ts_obj))

            # Find the best posterior distribution
            min_index, min_dist = self.findBestPosteriorDistribution(post[i-1])

            ll_likelihood[i] = logp
            ll_state_idx[i]  = min_index
            ll_likelihood_mu[i]  = self.ll_mu[min_index]
            ll_likelihood_std[i] = self.ll_std[min_index] #self.ll_mu[min_index] + ths_mult*self.ll_std[min_index]

        return ll_likelihood, ll_state_idx, ll_likelihood_mu, ll_likelihood_std
    def predict2(self, X, x1):
        X = np.squeeze(X)
        X_test = X.tolist()
        n = len(X_test)

        mu_l = np.zeros(self.nEmissionDim)
        cov_l = np.zeros(self.nEmissionDim**2)

        final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1])

        try:
            (alpha, scale) = self.ml.forward(final_ts_obj)
        except:
            print "No alpha is available !!"
            sys.exit()

        alpha = np.array(alpha)

        # with a single emission dimension there is nothing to condition on:
        # the "prediction" of the first coordinate is the observed value itself
        mu_l[0] = x1

        return mu_l, cov_l
Example No. 7
    def test_backward_against_ghmm(self):
        from kerehmm.test.util import ghmm_from_discrete_hmm
        import ghmm
        hmm = self.new_hmm(random_emissions=True, random_transitions=True)
        hmm_reference = ghmm_from_discrete_hmm(hmm)
        observation_size = 10
        observed = np.random.choice(range(self.nSymbols),
                                    size=observation_size).tolist()
        seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, observed)
        _, scale = hmm.forward(observations=observed)
        # remember that we have to convert stuff from ghmm to log scale
        _, scale_reference = map(np.array, hmm_reference.forward(seq))
        # print "Forward referece", forward
        print "Scale reference", scale_reference
        assert np.allclose(scale, scale_reference)

        # this is the reference backward array, untransformed (scaled)
        backward_reference = np.array(
            hmm_reference.backward(seq, scalingVector=scale_reference))
        print "Backward reference (scaled)", backward_reference

        backward = hmm.backward(observed, scale_coefficients=scale)

        print "Backward", backward

        assert backward.shape == backward_reference.shape

        # test values
        # print "Diff:", np.exp(backward) - backward_reference
        # backward_unscaled = np.exp(backward)
        assert np.allclose(backward, backward_reference)
Example No. 8
    def forecast(self, data, future = 1):
        """Forecast for a model the probability of each observation.

        equation is:
        p(o_t+1) = sum_j(p(o_t+1|s_t+1^j)p(s_t+1^j)
        where
        p(s_t+1^j) is found through forward algorithm
        """
        state = self.model.asMatrices()[0]
        observe = self.model.asMatrices()[1]
        ps1 = [0.0] * len(state[0])
        po1 = [0.0] * len(observe[0])

        tmp = ghmm.EmissionSequence(ghmm.Float(), data)

        ps = self.model.forward(tmp)[0][-1]

        for j in range(len(ps1)):
            for i in range(len(ps)):
                ps1[j] += state[i][j] * ps[i]

        for k in range(len(po1)):
            for j in range(len(ps1)):
                po1[k] += observe[j][k]*ps1[j]

        return po1[0]
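
A small numeric illustration of the two sums in forecast, using a hypothetical 2-state, 2-symbol model; ps stands in for the last forward vector:

import numpy as np

state = np.array([[0.9, 0.1],     # transition matrix A
                  [0.2, 0.8]])
observe = np.array([[0.7, 0.3],   # emission matrix B
                    [0.1, 0.9]])
ps = np.array([0.6, 0.4])         # p(s_t | o_1..o_t) from the forward pass

ps1 = np.dot(ps, state)           # p(s_{t+1}=j) = sum_i p(s_t=i) * A[i][j]
po1 = np.dot(ps1, observe)        # p(o_{t+1}=k) = sum_j p(s_{t+1}=j) * B[j][k]
print po1                         # [ 0.472  0.528 ]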
    def anomaly_check(self, X1, ths_mult=None):
        if self.nEmissionDim == 1: X_test = np.array([X1])
        else: X_test = self.convert_sequence(X1, emission=False)

        try:
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
            logp = self.ml.loglikelihood(final_ts_obj)
        except:
            if self.verbose:
                print "Too different input profile that cannot be expressed by emission matrix"
            return -1, 0.0  # error

        try:
            post = np.array(self.ml.posterior(final_ts_obj))
        except:
            if self.verbose:
                print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            return True, 0.0  # anomaly

        n = len(np.squeeze(X1))

        # Find the best posterior distribution
        min_index, min_dist = self.findBestPosteriorDistribution(post[n - 1])

        if isinstance(ths_mult, (list, tuple, np.ndarray)) and len(ths_mult) > 1:
            err = logp - (self.ll_mu[min_index] +
                          ths_mult[min_index] * self.ll_std[min_index])
        else:
            err = logp - (self.ll_mu[min_index] +
                          ths_mult * self.ll_std[min_index])

        if err < self.anomaly_offset: return True, err
        else: return False, err
Example No. 10
def learn_likelihoods_progress(i, n, m, A, B, pi, F, X_train, nEmissionDim, g_mu, g_sig, nState):
    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F), A, B, pi)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    l_likelihood_mean = 0.0
    l_likelihood_mean2 = 0.0
    l_statePosterior = np.zeros(nState)

    for j in xrange(n):    

        g_post = np.zeros(nState)
        g_lhood = 0.0
        g_lhood2 = 0.0
        prop_sum = 0.0

        for k in xrange(1, m):
            final_ts_obj = ghmm.EmissionSequence(F, X_train[j][:k*nEmissionDim])
            logp = ml.loglikelihoods(final_ts_obj)[0]
            # print 'Log likelihood:', logp
            post = np.array(ml.posterior(final_ts_obj))

            k_prop = norm(loc=g_mu, scale=g_sig).pdf(k)
            g_post += post[k-1] * k_prop
            g_lhood += logp * k_prop
            g_lhood2 += logp * logp * k_prop

            prop_sum  += k_prop

        l_statePosterior += g_post / prop_sum / float(n)
        l_likelihood_mean += g_lhood / prop_sum / float(n)
        l_likelihood_mean2 += g_lhood2 / prop_sum / float(n)

    return i, l_statePosterior, l_likelihood_mean, np.sqrt(l_likelihood_mean2 - l_likelihood_mean**2)
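
The k_prop weights above form a Gaussian window over time indices, and the returned standard deviation uses the moment identity std = sqrt(E[x^2] - E[x]^2). A self-contained sketch of the same weighting on a toy likelihood curve:

import numpy as np
from scipy.stats import norm

g_mu, g_sig = 20.0, 5.0
ks = np.arange(1, 40)
w = norm(loc=g_mu, scale=g_sig).pdf(ks)
w /= w.sum()                        # normalize, as dividing by prop_sum does above

logps = -0.5 * ks                   # toy per-timestep log-likelihoods
mean = np.sum(w * logps)
std = np.sqrt(np.sum(w * logps ** 2) - mean ** 2)
print mean, std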
Example No. 11
    def train(self):
        # This tells GHMM every possible value that it will be seeing
        alphabet = ghmm.Alphabet(list(set(self.events)))
        alphaLen = len(alphabet)

        # Initialize the probabilities of transitioning from each state to each other
        # state. There is probably a better way to do this, but this is nice and simple.
        trans_prob = 1.0 / (alphaLen)
        trans = [[trans_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Initialize the probabilities of seeing each output from each state.
        # Again, there is probably a better way to do this, but this is simple.
        emiss_prob = 1.0 / (alphaLen)
        emiss = [[emiss_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Some grease to get GHMM to work
        pi = [1.0 / alphaLen] * alphaLen

        # The sequence of musical events gathered from the music
        train_seq = ghmm.EmissionSequence(alphabet, self.events)

        # Generate the model of the data
        m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                                 trans, emiss, pi)

        # Train the model based on the training sequence
        m.baumWelch(train_seq)

        return (m, alphabet)
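
The same construction outside the class, on a toy event stream (the symbols are hypothetical; any hashable values work):

import ghmm

events = ['C', 'E', 'G', 'C', 'E', 'G', 'C']
alphabet = ghmm.Alphabet(list(set(events)))
n = len(alphabet)
uniform = [[1.0 / n] * n for _ in range(n)]

m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                         uniform, [row[:] for row in uniform], [1.0 / n] * n)
m.baumWelch(ghmm.EmissionSequence(alphabet, events))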
Example No. 12
    def expLikelihoods(self, X, ths_mult=None):
        if self.nEmissionDim == 1: X_test = np.array([X[0]])
        else: X_test = self.convert_sequence(X, emission=False)

        try:
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
            logp = self.ml.loglikelihood(final_ts_obj)
        except:
            print "Too different input profile that cannot be expressed by emission matrix"
            return -1, 0.0 # error

        try:
            post = np.array(self.ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            return 1.0, 0.0 # anomaly

        n = len(np.squeeze(X[0]))

        # Find the best posterior distribution
        min_index, min_dist = self.findBestPosteriorDistribution(post[n-1])

        # print 'Computing anomaly'
        # print logp
        # print self.ll_mu[min_index]
        # print self.ll_std[min_index]

        # print 'logp:', logp, 'll_mu', self.ll_mu[min_index], 'll_std', self.ll_std[min_index], 'mult_std', ths_mult*self.ll_std[min_index]

        if isinstance(ths_mult, (list, tuple, np.ndarray)) and len(ths_mult) > 1:
            ## print min_index, self.ll_mu[min_index], self.ll_std[min_index], ths_mult[min_index], " = ", (self.ll_mu[min_index] + ths_mult[min_index]*self.ll_std[min_index]) 
            return self.ll_mu[min_index] + ths_mult[min_index]*self.ll_std[min_index]
        else:
            return self.ll_mu[min_index] + ths_mult*self.ll_std[min_index]
Example No. 13
    def predict2(self, X, x1, x2):
        X = np.squeeze(X)
        X_test = X.tolist()
        n = len(X_test)

        mu_l = np.zeros(2)
        cov_l = np.zeros(4)

        final_ts_obj = ghmm.EmissionSequence(self.F, X_test + [x1, x2])

        try:
            (alpha, scale) = self.ml.forward(final_ts_obj)
        except:
            print "No alpha is available !!"
            sys.exit()

        alpha = np.array(alpha)

        for j in xrange(self.nState):

            [[mu1, mu2], [cov11, cov12, cov21, cov22]] = self.B[j]

            mu_l[0] = x1
            mu_l[1] += alpha[n / self.nEmissionDim, j] * (mu2 + cov21 / cov11 *
                                                          (x1 - mu1))
            ## cov_l[0] += (cov11)*(total**2)
            ## cov_l[1] += (cov12)*(total**2)
            ## cov_l[2] += (cov21)*(total**2)
            ## cov_l[3] += (cov22)*(total**2)

        return mu_l, cov_l
    def test(self, model, ts_obj):

        # Find Viterbi Path
        final_ts_obj = ghmm.EmissionSequence(self.F, ts_obj.tolist())
        path_obj = model.viterbi(final_ts_obj)
        
        return path_obj
Example No. 15
    def _predict_next(self):
        """@todo: Docstring for _predict_next.
        :returns: @todo

        """
        a_init = normalize_stoch_map(np.random.rand(self._n_hid, self._n_hid))
        b_init = normalize_stoch_map(
            np.random.rand(self._n_hid, self._n_sym**2))
        pi_init = normalize_stoch_map(np.random.rand(self._n_hid))
        hmm = gh.HMMFromMatrices(self._alphab,
                                 gh.DiscreteDistribution(self._alphab), a_init,
                                 b_init, pi_init)
        obs = gh.EmissionSequence(self._alphab, self._memory)
        hmm.baumWelch(obs)

        alpha = hmm.forward(obs)[0][-1]
        trans = hmm.asMatrices()[0]
        alpha = np.dot(alpha, trans)
        next_moves_dist = np.zeros(self._n_sym**2)
        for i in range(self._n_hid):
            next_moves_dist += np.asarray(hmm.getEmission(i)) * alpha[i]
        next_moves_dist = next_moves_dist[self._conversion_array]
        next_move = np.argmax(np.sum(next_moves_dist, axis=0))

        return np.where(self._rules[next_move] == -1)[0][0]
Example No. 16
    def test_forward_against_ghmm(self):
        from .util import ghmm_from_discrete_hmm
        import ghmm
        hmm = self.new_hmm(random_transitions=True, random_emissions=True)
        hmm_reference = ghmm_from_discrete_hmm(hmm)
        observation_size = 10
        observed = np.random.choice(range(self.nSymbols),
                                    size=observation_size).tolist()
        seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, observed)
        forward, scale = hmm.forward(observed)

        # remember that we have to convert stuff from ghmm to log scale
        forward_reference, scale_reference = map(np.array,
                                                 hmm_reference.forward(seq))
        print "Forward reference (scaled):\n", forward_reference
        print "Scale reference: {}".format(scale_reference)
        # for i, c in enumerate(scale_reference):
        #     forward_reference_log[i] += sum(np.log(scale_reference[:i + 1]))
        # print "Forward reference (unscaled):\n", np.exp(forward_reference_log)

        print "Forward:\n", forward
        print "Scale:\n", scale

        assert np.allclose(forward, forward_reference)
        assert np.allclose(scale, scale_reference)
Example No. 17
    def conditional_prob(self, x):
        '''
        Input
        @ x: dim x length
        Output
        @ A list of conditional probabilities P(x_t|x_s,lambda)

        Only a single sample is supported.
        '''
        from scipy.stats import norm, entropy

        # logp from all features
        X_test = util.convert_sequence2(x, emission=False)
        X_test = np.squeeze(X_test)
        final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
        logp_all = self.ml.loglikelihood(final_ts_obj)

        # feature-wise conditional probability
        cond_prob = []
        for i in xrange(self.nEmissionDim):  # per feature

            B = copy.deepcopy(self.B)  # deep copy: the per-state lists are mutated below
            for j in xrange(self.nState):
                B[j][0] = [b for idx, b in enumerate(B[j][0]) if idx != i]
                B_arr = copy.copy(B[j][1])
                B_arr = np.array(B_arr).reshape(
                    (self.nEmissionDim, self.nEmissionDim))
                B_arr = np.delete(B_arr, (i), axis=0)
                B_arr = np.delete(B_arr, (i), axis=1)
                B[j][1] = B_arr.flatten().tolist()
            ml_src = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                          self.A, B, self.pi)

            # logp from remains
            X_test = util.convert_sequence2([ x[j] for j in xrange(len(x)) if j != i ], \
                                            emission=False)
            X_test = np.squeeze(X_test)
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test.tolist())
            logp_src = ml_src.loglikelihood(final_ts_obj)

            cond_prob.append(logp_all - logp_src)

            if np.isnan(cond_prob[-1]) or np.isinf(cond_prob[-1]):
                print "NaN in conditional probabilities: ", np.shape(x)
                return None

        return np.array(cond_prob)
Example No. 18
def _sequence_from_matrix(m):
    # Conversion happens as follows: data is an n x m matrix, where n is the number
    # of samples and m is the number of features per sample. Multivariate data in
    # ghmm is represented as a single list with the samples unrolled, so the result
    # has the structure [x_11, x_12, x_13, x_21, x_22, x_23, ...] for m = 3.
    # Source: http://sourceforge.net/p/ghmm/mailman/message/20578788/
    unrolled = m.ravel().tolist()
    seq = impl.EmissionSequence(impl.Float(), unrolled)
    return seq
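
For example, under that convention a 2 x 3 matrix unrolls row-major into one flat list:

import numpy as np

m = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])
print m.ravel().tolist()   # [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]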
Example No. 19
    def test_backward_against_ghmm(self):
        from kerehmm.test.util import ghmm_from_gaussian_hmm
        import ghmm
        hmm = self.new_hmm(nDimensions=1,
                           random_emissions=True,
                           random_transitions=True,
                           lower_bounds=[0],
                           upper_bounds=[10])
        hmm_reference = ghmm_from_gaussian_hmm(hmm)
        observation_size = 5
        observed = [
            np.random.randint(0, 10) for _ in range(self.nDimensions)
            for i in range(observation_size)
        ]
        seq = ghmm.EmissionSequence(hmm_reference.emissionDomain,
                                    np.array(observed).flatten().tolist())

        # remember that we have to convert stuff from ghmm to log scale
        _, scale_reference = map(np.array, hmm_reference.forward(seq))
        # print "Forward referece", forward
        print "Scale reference", scale_reference

        # this is the reference backward array, untransformed (scaled)
        backward_reference = np.array(
            hmm_reference.backward(seq, scalingVector=scale_reference))
        print "Backward reference (scaled)", backward_reference

        # unscale the reference array
        # get the product of scale_t,scale_t+1,...,scale_T for each t.
        # coefficients = np.array([np.prod(scale_reference[i:]) for i, _ in enumerate(scale_reference)])
        coefficients = np.array([
            np.multiply.reduce(scale_reference[t + 1:])
            for t, _ in enumerate(scale_reference)
        ])
        print "Reference coefficients:", coefficients

        # multiply each backwards_reference[i] by coefficients[i]
        backward_reference[:] = (np.expand_dims(coefficients, axis=1) *
                                 backward_reference)

        # test shape
        print "Backward reference (unscaled)", backward_reference

        # this is our backward array, log transformed
        backward = hmm.backward(observed)

        print "Backward", np.exp(backward)

        assert backward.shape == backward_reference.shape

        # test values
        # print "Diff:", np.exp(backward) - backward_reference
        backward_unscaled = np.exp(backward)
        assert np.allclose(backward_unscaled, backward_reference)
Example No. 20
def decodeHMM(m, hmmStates):
    ''' Decode HMM. '''
    print >> sys.stderr, printTime(), "Decode HMM for each chromosome."
    sigma = ghmm.IntegerRange(0, 2)
    for chrom in hmmStates:
        print >> sys.stderr, printTime(), chrom
        state, score = m.viterbi(
            ghmm.EmissionSequence(sigma, hmmStates[chrom]))
        hmmStates[chrom] = state
    print >> sys.stderr, printTime(), "Decode HMM finished."
    print >> sys.stderr
Example No. 21
    def predict(self, X):
        '''
        Input
        @ X: dimension x sample x length (#samples with known time steps)
        Output
        @ observation distribution: mu, var (#samples x 1 lists)
        '''

        # sample x some length
        X_test = util.convert_sequence(X, emission=False)

        mu_l = []
        cov_l = []

        for i in xrange(len(X_test)):

            # Past profile
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[i].tolist())

            try:
                # alpha: (sequence length) x (number of hidden states);
                # alpha[t, i] is the scaled forward probability of being
                # in state i at time t
                (alpha, scale) = self.ml.forward(final_ts_obj)
                alpha = np.array(alpha)
                scale = np.array(scale)
            except:
                print "No alpha is available !!"
                sys.exit()
                ## continue

            # round to suppress numerical noise in the forward variables
            f = lambda x: round(x, 12)
            for j in range(len(alpha)):
                alpha[j] = map(f, alpha[j])

            n = len(X_test[i])
            t_mu = np.zeros(self.nEmissionDim)
            t_cov = np.zeros(self.nEmissionDim * self.nEmissionDim)
            t_sum = 0.0
            for j in xrange(self.nState):  # N+1

                total = np.sum(
                    self.A[:, j] *
                    alpha[n / self.nEmissionDim - 1, :])  #* scaling_factor
                [mu, cov] = self.B[j]

                t_mu += np.array(mu) * total
                t_cov += np.array(cov) * (total**2)
                t_sum += total

            mu_l.append(t_mu.tolist())
            cov_l.append(t_cov.tolist())

        return mu_l, cov_l
Example No. 22
    def predict(self, X):
        X = np.squeeze(X)
        X_test = X.tolist()

        mu_l = np.zeros(3)
        cov_l = np.zeros(9)

        final_ts_obj = ghmm.EmissionSequence(self.F, X_test)

        try:
            # alpha: (sequence length) x (number of hidden states);
            # alpha[t, i] is the scaled forward probability of being
            # in state i at time t
            (alpha, scale) = self.ml.forward(final_ts_obj)
            alpha = np.array(alpha)
        except:
            print "No alpha is available !!"
            return mu_l, cov_l

        # round to suppress numerical noise in the forward variables
        f = lambda x: round(x, 12)
        for i in range(len(alpha)):
            alpha[i] = map(f, alpha[i])

        n = len(X_test)
        pred_numerator = 0.0

        for j in xrange(self.nState):  # N+1
            total = np.sum(
                self.A[:, j] *
                alpha[n / self.nEmissionDim - 1, :])  #* scaling_factor
            [[mu1, mu2, mu3],
             [cov11, cov12, cov13, cov21, cov22, cov23, cov31, cov32,
              cov33]] = self.B[j]

            ## print mu1, mu2, cov11, cov12, cov21, cov22, total
            pred_numerator += total

            mu_l[0] += mu1 * total
            mu_l[1] += mu2 * total
            mu_l[2] += mu3 * total
            cov_l[0] += cov11 * (total**2)
            cov_l[1] += cov12 * (total**2)
            cov_l[2] += cov13 * (total**2)
            cov_l[3] += cov21 * (total**2)
            cov_l[4] += cov22 * (total**2)
            cov_l[5] += cov23 * (total**2)
            cov_l[6] += cov31 * (total**2)
            cov_l[7] += cov32 * (total**2)
            cov_l[8] += cov33 * (total**2)

        return mu_l, cov_l
Example No. 23
    def loglikelihood(self, X):
        X = np.squeeze(X)
        X_test = X.tolist()        

        final_ts_obj = ghmm.EmissionSequence(self.F, X_test)

        try:    
            p = self.ml.loglikelihood(final_ts_obj)
        except:
            if self.verbose: print 'Likelihood error!!!!'
            sys.exit()

        return p
Example No. 24
def _removeOutliers(models, trainData, outliers):
    needTrain = False

    for i in range(len(models)):

        mean = 0
        variance = 0

        #Calculate model mean
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            a = abs(models[i].loglikelihood(eSeq))
            mean += a

        try:
            mean /= (len(trainData[i]) * 1.0)
        except:
            continue

        #Calculate the model variance
        for tmp in trainData[i]:
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            variance += (mean - v)**2

        variance /= (len(trainData[i]) * 1.0)
        std = variance**0.5

        for tmp in trainData[i][:]:  # iterate over a copy; we remove items below
            eSeq = ghmm.EmissionSequence(ghmm.Float(), tmp)
            v = abs(models[i].loglikelihood(eSeq))
            if (v - mean) > (2 * std):
                trainData[i].remove(tmp)
                outliers.append(tmp)
                needTrain = True

    if needTrain:
        models = _trainModels(trainData, models)

    # trainData and outliers are modified in place; return the possibly
    # retrained models so the caller sees the update
    return models
Example No. 25
def computeLikelihood(idx, A, B, pi, F, X, nEmissionDim, nState, startIdx=1, \
                      bPosterior=False, converted_X=False, cov_type='full'):
    '''
    This function will be deprecated. Please, use computeLikelihoods.
    '''

    if nEmissionDim >= 2:
        ml = ghmm.HMMFromMatrices(F, ghmm.MultivariateGaussianDistribution(F),
                                  A, B, pi)
        if cov_type.find('diag') >= 0:
            ml.setDiagonalCovariance(1)
    else:
        ml = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F), A, B, pi)

    if converted_X is False:
        X_test = util.convert_sequence(X, emission=False)
        X_test = np.squeeze(X_test)
        X_test = X_test.tolist()
    else:
        X_test = X

    l_idx = []
    l_likelihood = []
    l_posterior = []

    for i in xrange(startIdx, len(X_test) / nEmissionDim):
        final_ts_obj = ghmm.EmissionSequence(F, X_test[:i * nEmissionDim])

        try:
            logp = ml.loglikelihood(final_ts_obj)
            if bPosterior: post = np.array(ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"

            l_idx.append(i)
            l_likelihood.append(-100000000)
            if bPosterior:
                if len(l_posterior) == 0: l_posterior.append(list(pi))
                else: l_posterior.append(l_posterior[-1])
            ## return False, False # anomaly
            continue

        l_idx.append(i)
        l_likelihood.append(logp)
        if bPosterior: l_posterior.append(post[i - 1])

    if bPosterior:
        return idx, l_idx, l_likelihood, l_posterior
    else:
        return idx, l_idx, l_likelihood
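
The idx argument is returned untouched, which suggests the function is meant for a worker pool whose results must be re-matched to inputs. A hedged sketch of that call pattern, assuming A, B, pi, F, X, nEmissionDim, and nState are defined as in the surrounding module:

from multiprocessing import Pool

pool = Pool(4)
jobs = [pool.apply_async(computeLikelihood,
                         (i, A, B, pi, F, X[i], nEmissionDim, nState))
        for i in xrange(len(X))]
for job in jobs:
    idx, l_idx, l_likelihood = job.get()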
Example No. 26
    def anomaly_check(self, X1, X2=None, X3=None, ths_mult=None):
        if self.nEmissionDim == 1: X_test = np.array([X1])
        else: X_test = self.convert_sequence(X1, X2, X3, emission=False)

        try:
            final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
            logp = self.ml.loglikelihood(final_ts_obj)
        except:
            print "Too different input profile that cannot be expressed by emission matrix"
            return -1, 0.0  # error

        try:
            post = np.array(self.ml.posterior(final_ts_obj))
        except:
            print "Unexpected profile!! GHMM cannot handle too low probability. Underflow?"
            return 1.0, 0.0  # anomaly

        n = len(np.squeeze(X1))

        # Find the best posterior distribution
        min_dist = float('inf')
        min_index = 0
        print 'Loglikelihood', logp
        print 'Last posterior', post[n - 1]
        print 'Computing entropies'
        for j in xrange(self.nGaussian):
            dist = entropy(post[n - 1], self.l_statePosterior[j])
            print 'Index:', j, 'Entropy:', dist
            if min_dist > dist:
                min_index = j
                min_dist = dist

        print 'Computing anomaly'
        print logp
        print self.ll_mu[min_index]
        print self.ll_std[min_index]

        if isinstance(ths_mult, (list, tuple, np.ndarray)) and len(ths_mult) > 1:
            err = logp - (self.ll_mu[min_index] +
                          ths_mult[min_index] * self.ll_std[min_index])
        else:
            err = logp - (self.ll_mu[min_index] +
                          ths_mult * self.ll_std[min_index])

        print 'Error', err

        if err < 0.0: return 1.0, 0.0  # anomaly
        else: return 0.0, err  # normal
Example No. 27
def trainHMM(hmmState):
    ''' Train HMM with the given chromosome. '''
    print >> sys.stderr, printTime(), "Train HMM with one chromosome."
    T = [[0.9, 0.1], [0.1, 0.9]]
    e1 = [0.1, 0.9]
    e0 = [0.9, 0.1]
    E = [e0, e1]
    pi = [0.9, 0.1]  # initial 10% are peak?
    sigma = ghmm.IntegerRange(0, 2)  # 0, 1
    m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), T, E,
                             pi)
    m.baumWelch(ghmm.EmissionSequence(sigma, hmmState))
    print >> sys.stderr, printTime(), "Train HMM finished."
    print >> sys.stderr
    return m
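
A sketch of how trainHMM and decodeHMM (defined earlier) might be chained, assuming hmmStates maps chromosome names to 0/1 state lists:

hmmStates = {'chr1': [0, 0, 1, 1, 1, 0, 0, 1, 1, 0],
             'chr2': [0, 1, 1, 0, 0, 0, 1, 1, 1, 1]}   # hypothetical input
m = trainHMM(hmmStates['chr1'])   # fit on a single chromosome
decodeHMM(m, hmmStates)           # replace each 0/1 track with its Viterbi path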
Example No. 28
    def state_clustering(self, X1, X2, X3, X4):
        n,m = np.shape(X1)

        print n,m
        x   = np.arange(0., float(m))*(1./43.)
        state_mat  = np.zeros((self.nState, m*n))
        likelihood_mat = np.zeros((1, m*n))

        count = 0           
        for i in xrange(n):

            for j in xrange(1,m):            

                x_test1 = X1[i:i+1,:j]
                x_test2 = X2[i:i+1,:j]            
                x_test3 = X3[i:i+1,:j]            
                x_test4 = X4[i:i+1,:j]            
                X_test = self.convert_sequence(x_test1, x_test2, x_test3, x_test4, emission=False)

                final_ts_obj = ghmm.EmissionSequence(self.F, X_test[0].tolist())
                ## path,_    = self.ml.viterbi(final_ts_obj)        
                post      = self.ml.posterior(final_ts_obj)
                logp      = self.ml.loglikelihood(final_ts_obj)

                state_mat[:, count] = np.array(post[j-1])
                likelihood_mat[0,count] = logp
                count += 1

        # k-means
        init_center = np.eye(self.nState, self.nState)
        self.km = KMeans(self.nState, init=init_center)
        idx_list = self.km.fit_predict(state_mat.transpose())

        # mean and variance of likelihoods
        l = []
        for i in xrange(self.nState):
            l.append([])

        for i, idx in enumerate(idx_list):
            l[idx].append(likelihood_mat[0][i]) 

        l_mean = []
        l_std = []
        for i in xrange(self.nState):
            l_mean.append( np.mean(l[i]) )
            l_std.append( np.std(l[i]) )
                
        return l_mean, l_std
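
Seeding KMeans with the identity matrix above places one initial center on each one-hot posterior, so cluster k collects the timesteps dominated by hidden state k. A small standalone check (scikit-learn expects n_init=1 when an explicit init array is given):

import numpy as np
from sklearn.cluster import KMeans

posteriors = np.array([[0.9, 0.1, 0.0],
                       [0.8, 0.1, 0.1],
                       [0.1, 0.8, 0.1],
                       [0.0, 0.1, 0.9]])
km = KMeans(n_clusters=3, init=np.eye(3), n_init=1)
print km.fit_predict(posteriors)   # [0 0 1 2]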
Example No. 29
        def viterbi(self, hmm, obj):
            hmm_ = self._get_hmm(hmm)

            if isinstance(obj, SequenceSet):
                obj = [array_flatten(s[:]) for s in obj]
                obj = ghmm.SequenceSet(DOMAIN, obj)
                res = hmm_.viterbi(obj)
                # ghmm returns a scalar when the sequence set has length 1,
                # but we want an array
                if len(obj) == 1:
                    res = [[res[0]], [res[1]]]
            else:
                obj = ghmm.EmissionSequence(DOMAIN, array_flatten(obj[:]))
                res = hmm_.viterbi(obj)

            return res
Example No. 30
    def dist(self, data):
        """dist(pattern)

        Calculate the distance between a piece of data and the model.
        This is an unusual distance function: the closest match has the
        greatest value, and the value can be positive or negative.
        """
        eSeq = ghmm.EmissionSequence(ghmm.Float(), data)
        tmp = self.model.loglikelihood(eSeq)

        try:
            tmp / 1  # raises if loglikelihood returned a non-number
        except Exception, e:
            print "In exception"
            tmp = -1000