def train(self, model, flag):
    # Number of training sequences per category (integer division).
    index = np.size(self.Fmat_train, 0) // np.size(categories)
    if flag == 'Missed':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[0:index])
    elif flag == 'Good':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[index:2 * index])
    elif flag == 'High':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[2 * index:3 * index])
    elif flag == 'Caught':
        final_ts = ghmm.SequenceSet(self.F, self.Fmat_train[3 * index:4 * index])

    model.baumWelch(final_ts)
    return model
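For orientation, a minimal sketch of how this slicing behaves, assuming self.F is a ghmm.Float() domain, categories has four entries, and Fmat_train stores an equal number of 1-D observation lists per category (all names here are stand-ins for the originals):

import ghmm
import numpy as np

categories = ['Missed', 'Good', 'High', 'Caught']   # assumed category order
F = ghmm.Float()
# Toy data: two short observation sequences per category.
Fmat_train = [[0.1, 0.2, 0.3]] * (2 * len(categories))
index = np.size(Fmat_train, 0) // np.size(categories)  # sequences per category
good_slice = Fmat_train[index:2 * index]                # the 'Good' block
final_ts = ghmm.SequenceSet(F, good_slice)
print(len(final_ts))  # 2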
Example #2
    def train(self, training_file, epsilon=0.0001, max_iter=500):
        """
        Train the TFFM using the fasta sequences to learn emission and
        transition probabilities.

        :note: The underlying HMM is trained with the Baum-Welch
            algorithm.

        :arg training_file: The fasta file of the sequences to train the TFFM
            on.
        :type training_file: str
        :arg epsilon: The minimal relative improvement in likelihood over the
            previous Baum-Welch iteration below which training stops
            (default: 0.0001).
        :type epsilon: float
        :arg max_iter: The maximum number of Baum-Welch iterations used to
            re-estimate the probabilities (default: 500).
        :type max_iter: int

        """

        assert(os.path.isfile(training_file))
        # Only upper case is allowed in the ALPHABET, need to convert
        sequences = []
        for record in SeqIO.parse(training_file, "fasta"):
            sequence = record.seq.upper()
            # Only considering sequences with ACGTs
            if not re.search("[^AGCT]", str(sequence)):
                sequences.append(sequence)
        training_sequences = ghmm.SequenceSet(ghmm.Alphabet(ALPHABET),
                                              sequences)
        # Need to give the same weight to all the sequences since it does not
        # seem to be done by default by ghmm.
        utils.set_sequences_weight(training_sequences, 1.0)
        self.baumWelch(training_sequences, max_iter, epsilon)
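The ACGT filter above can be exercised in isolation; a small standalone sketch, assuming ALPHABET is ['A', 'C', 'G', 'T']:

import re

records = ["ACGT", "ACGN", "acgt"]
kept = [r.upper() for r in records if not re.search("[^AGCT]", r.upper())]
print(kept)  # ['ACGT', 'ACGT'] -- the record containing 'N' is dropped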
Example #3
    def _get_posterior_proba(self, sequence_split):
        """
        Get the posterior probabilities at each nucleotide position given the
        TFFM.

        :arg sequence_split: The sequence split into subsequences so that
            non-ACGT nucleotides are excluded.
        :type sequence_split: list

        :returns: The posterior probabilities at each position of the sequence.
        :rtype: list of list

        :note: One example of a sequence_split is ["ACT", "N", "ATC"].

        """

        ghmm_extended_alphabet = ghmm.Alphabet(EXTENDED_ALPHABET)
        posterior_proba = []
        # null probabilities for non ACGT nucleotides.
        null_proba = [0.] * self.N
        for sequence in sequence_split:
            if re.match("[ACGT]", sequence):
                emission_sequence = ghmm.SequenceSet(ghmm_extended_alphabet,
                                                     [sequence])[0]
                posterior_proba.extend(self.posterior(emission_sequence))
            else:
                for __ in xrange(len(sequence)):
                    posterior_proba.append(null_proba)
        return posterior_proba
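To see the padding behaviour without a trained TFFM, the same branching can be run with the posterior call stubbed out (N and fake_posterior are stand-ins for the instance's state count and posterior method):

import re

N = 2                         # assumed number of HMM states
null_proba = [0.] * N

def fake_posterior(seq):      # stand-in for self.posterior(...)
    return [[1.0 / N] * N for _ in seq]

posterior_proba = []
for sequence in ["ACT", "N", "ATC"]:
    if re.match("[ACGT]", sequence):
        posterior_proba.extend(fake_posterior(sequence))
    else:
        posterior_proba.extend([null_proba] * len(sequence))
print(len(posterior_proba))  # 7 rows: 3 + 1 + 3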
Example #4
    def __gestures_markov_sequences_from_demos(self, gesture):
        emission_sequences = []
        for demo_index in xrange(gesture.demonstration_count):
            training_data = gesture.get_training_data(demo_index=demo_index)
            seq = self.__get_sequence_values_from_training_data(training_data)
            emission_sequences.append(seq)

        return ghmm.SequenceSet(self.F, emission_sequences)
Example #5
    def _create_sequence_set(self, qsr_seq, symbols):
        """Creating a sequence set for training

        :param qsr_seq: the observation sequence of symbols according to the alphabet, as a list of lists
        :param symbols: the alphabet of possible symbols

        :return: the sequence set for the given observations
        """
        return gh.SequenceSet(symbols, qsr_seq)
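A hedged usage sketch for this helper, with a toy three-symbol alphabet standing in for the real QSR symbols:

import ghmm as gh

symbols = gh.Alphabet(['a', 'b', 'c'])
qsr_seq = [['a', 'b', 'b'], ['c', 'a']]     # two observation sequences
sset = gh.SequenceSet(symbols, qsr_seq)
print(len(sset))  # 2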
Example #6
    def trainModel(self, trainingData):
        # Expecting training data in the form of a list of malwareSignatures
        if len(trainingData) < 1:
            prettyPrint("Empty training set provided", "error")
            return False
        # Now use the Baum-Welch algorithm
        self.ghmmModel.baumWelch(ghmm.SequenceSet(self.ghmmModel.emissionDomain, trainingData))
        #print self.ghmmModel
        return True
Example #7
    def fit(self, X_train, A=None, B=None, pi=None, B_dict=None, verbose=False):

        if A is None:        
            if verbose: print "Generate new A matrix"                
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

        if B is None:
            if verbose: print "Generate new B matrix"                                            
            # We should think about multivariate Gaussian pdf.        
            self.mu, self.sig = self.vectors_to_mean_sigma(X_train, self.nState)

            # Emission probability matrix
            B = np.hstack([self.mu, self.sig]).tolist() # Must be [i,:] = [mu, sig]
                
        if pi is None:            
            # pi - initial probabilities per state 
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.] * self.nState
            pi[0] = 1.0

        # HMM model object
        self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), A, B, pi)
        
        ## print "Run Baum Welch method with (samples, length)", X_train.shape
        train_seq = X_train.tolist()
        final_seq = ghmm.SequenceSet(self.F, train_seq)        
        self.ml.baumWelch(final_seq, 10000)

        [self.A,self.B,self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        ## self.mean_path_plot(mu[:,0], sigma[:,0])        
        ## print "Completed to fitting", np.array(final_seq).shape
        
        # state range
        self.state_range = np.arange(0, self.nState, 1)

        # Pre-computation for PHMM variables
        self.mu_z   = np.zeros((self.nState))
        self.mu_z2  = np.zeros((self.nState))
        self.mu_z3  = np.zeros((self.nState))
        self.var_z  = np.zeros((self.nState))
        self.sig_z3 = np.zeros((self.nState))
        for i in xrange(self.nState):
            zp             = self.A[i,:]*self.state_range
            self.mu_z[i]   = np.sum(zp)
            self.mu_z2[i]  = self.mu_z[i]**2
            #self.mu_z3[i]  = self.mu_z[i]**3
            self.var_z[i]  = np.sum(zp*self.state_range) - self.mu_z[i]**2
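The closing loop above precomputes, per state i, the mean and variance of the successor-state index under the transition row A[i, :]. A toy check of that arithmetic, using made-up numbers:

import numpy as np

A = np.array([[0.9, 0.1],
              [0.2, 0.8]])
state_range = np.arange(0, 2, 1)
zp = A[0, :] * state_range
mu_z = np.sum(zp)                             # E[z]   = 0*0.9 + 1*0.1 = 0.1
var_z = np.sum(zp * state_range) - mu_z ** 2  # E[z^2] - E[z]^2 = 0.1 - 0.01
print(mu_z, var_z)  # 0.1 0.09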
Example #8
    def toSequenceSetBlocks(self, peptideList):
        ''' Converts a list of peptides given as strings to GHMM format.

        As GHMM has a limitation of max 1,500,000 sequences per sequence set,
        longer peptide lists are split into blocks. The block size is
        configurable via self.sequencesPerBlock to facilitate testing.

        Returns a list of sequence sets, preserving the original order
        of the peptides.
        '''

        # split into blocks
        lenPeptides = len(peptideList)
        numFullBlocks = lenPeptides // self.sequencesPerBlock
        lenLastBlock = lenPeptides % self.sequencesPerBlock

        sequenceSets = []

        # full blocks (if any)
        for i in range(0, numFullBlocks):
            rangeStart = i * self.sequencesPerBlock
            rangeEnd = rangeStart + self.sequencesPerBlock
            sequenceBlock = ghmm.SequenceSet(
                self.alphabet,
                [list(p) for p in peptideList[rangeStart:rangeEnd]])
            sequenceSets.append(sequenceBlock)

        # the last partial block (if any)
        rangeStart = numFullBlocks * self.sequencesPerBlock
        rangeEnd = lenPeptides
        if (lenLastBlock > 0):
            sequenceBlock = ghmm.SequenceSet(
                self.alphabet,
                [list(p) for p in peptideList[rangeStart:rangeEnd]])
            sequenceSets.append(sequenceBlock)

        return sequenceSets
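The block arithmetic is easy to check without GHMM; a sketch with a block size of 2 (sequencesPerBlock stands in for the instance attribute):

peptideList = ["ACD", "EFG", "HIK", "LMN", "PQR"]
sequencesPerBlock = 2
numFullBlocks = len(peptideList) // sequencesPerBlock  # 2 full blocks
lenLastBlock = len(peptideList) % sequencesPerBlock    # 1 leftover sequence
print(numFullBlocks, lenLastBlock)  # 2 1 -> three sequence sets in total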
Example #9
        def viterbi(self, hmm, obj):
            hmm_ = self._get_hmm(hmm)

            if isinstance(obj, SequenceSet):
                obj = [array_flatten(s[:]) for s in obj]
                obj = ghmm.SequenceSet(DOMAIN, obj)
                res = hmm_.viterbi(obj)
                # ghmm collapses the result to a bare (path, prob) pair when
                # the sequence set has length == 1, but we want lists
                if len(obj) == 1:
                    res = [[res[0]], [res[1]]]
            else:
                obj = ghmm.EmissionSequence(DOMAIN, array_flatten(obj[:]))
                res = hmm_.viterbi(obj)

            return res
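The length-one branch compensates for that collapsing; isolated, the normalization is just the following (res here is a made-up return value):

res = ([0, 1, 1], -3.2)     # (state path, log prob) for the single sequence
res = [[res[0]], [res[1]]]  # wrap so callers always get lists of paths/probs
print(res)                  # [[[0, 1, 1]], [-3.2]]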
Example #10
    def train(self, model, flag, accum_idx):
        input_data = []
        start = 0
        end = 0
        if flag == exp_list[0]:
            start = 0
            end = accum_idx[0]
        elif flag == exp_list[1]:
            start = accum_idx[0]
            end = accum_idx[1]
        elif flag == exp_list[2]:
            start = accum_idx[1]
            end = accum_idx[2]
        elif flag == exp_list[3]:
            start = accum_idx[2]
            end = accum_idx[3]
        for i in range(start, end):
            input_data.append(self.Fmat_train[i].flatten().tolist())

        '''
        if (flag == exp_list[1]):
            pp.figure(3)
            print len(self.Fmat_train), start, end
            for i in range(start, end):
                #pp.plot(np.transpose(self.Fmat_train[i])[0], np.transpose(self.Fmat_train[i])[1], 'o')
                pp.plot(np.transpose(self.Fmat_train[i])[0])
            pp.show()
            abc
        '''

        final_ts = ghmm.SequenceSet(self.F, input_data)

        #final_ts = model.sample(10, 50)

        # print the input data
        print final_ts

        model.baumWelch(final_ts)
        # print the optimized model
        #print model
        if math.isnan(model.getInitial(0)):
            print 'model has nan'
            abc  # undefined name: deliberately raises NameError to halt here
        #abc
        return model
Example #11
    def train(self, X):
        """Uses GHMM's implementation of Baum-Welch to train an HMM"""
        try:
            if len(X) < 1:
                prettyPrint("Empty training set provided", "warning")
                return False
            # Now use the Baum-Welch algorithm
            self.ghmmModel.baumWelch(
                ghmm.SequenceSet(self.ghmmModel.emissionDomain, X))
            self.isTrained = True
            if verboseON():
                print "Trained model: %s" % self.ghmmModel

        except Exception as e:
            prettyPrintError(e)
            return False

        return True
Example #12
    def test_viterbi_against_hmm(self):
        from kerehmm.test.util import ghmm_from_discrete_hmm
        import ghmm

        hmm = self.new_hmm()
        hmm.setup_strict_left_to_right(set_emissions=True)
        domain = ghmm.Alphabet(range(hmm.alphabetSize))
        hmm_reference = ghmm_from_discrete_hmm(hmm)
        seq = list(range(self.nSymbols))
        print "True path and emission: {}".format(seq)
        true_path = seq
        reference_path, reference_prob = hmm_reference.viterbi(
            ghmm.SequenceSet(domain, [seq]))
        path, prob = hmm.viterbi_path(seq)
        print "Reference path: {}".format(reference_path)
        print "Calculated path: {}".format(path)
        print "Reference prob: {}, Calculated prob: {}".format(
            reference_prob, prob)
        assert np.all(np.equal(true_path, reference_path))
        assert np.all(np.equal(true_path, path))
        assert np.isclose(prob, reference_prob)
Example #13
def perform_optimization(hidden_mm, trajs, lag_time, sliding_window=True):
    """Optimize a hidden markov model given a list of trajectories.

    Use the Baum-Welch algorithm for learning the transition matrix, fixing
    emission probabilities.
    """

    # Domains for our multivariate gaussians
    domain = hidden_mm.emissionDomain

    # Do sliding window
    if sliding_window:
        # A naive way of doing this is by making many trajectories
        slides = xrange(lag_time)
        lagged_trajs = list()
        for i in xrange(len(trajs)):
            traj = trajs[i]
            for slide in slides:
                lagged_trajs.append(traj[slide::lag_time])
    else:
        lagged_trajs = [t[::lag_time] for t in trajs]

    # Prepare the trajectories by flattening them to 1D
    prepared_trajs = [t.flatten().tolist() for t in lagged_trajs]
    # Build the c-style sequences object manually
    (seq_c, lengths) = ghmmhelper.list2double_matrix(prepared_trajs)
    lengths_c = ghmmwrapper.list2int_array(lengths)
    cseq = ghmmwrapper.ghmm_cseq(seq_c, lengths_c, len(trajs))

    # Make a SequenceSet wrapper around the c-style object
    train_seq = ghmm.SequenceSet(domain, cseq)
    # Perform the Baum Welch optimization
    likelihood = hidden_mm.baumWelch(train_seq,
                                     nrSteps=10000000,
                                     loglikelihoodCutoff=1.0e-5)

    print "Final baum welch likelihood: {}".format(likelihood)

    return likelihood, hidden_mm
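The sliding-window step turns each trajectory into lag_time strided copies; a toy illustration with lag_time = 2:

traj = [0, 1, 2, 3, 4, 5]
lag_time = 2
lagged = [traj[slide::lag_time] for slide in range(lag_time)]
print(lagged)  # [[0, 2, 4], [1, 3, 5]]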
Example #14
    def partial_fit(self, xData, learningRate=0.2, nrSteps=1, max_iter=100):
        ''' Online update of HMM using online Baum-Welch algorithm
        '''

        X = [np.array(data) for data in xData]
        nData = len(X[0])

        # print 'Creating Training Data'
        X_train = util.convert_sequence(X)  # Training input
        X_train = X_train.tolist()

        if self.verbose:
            print 'Run Baum Welch method with (samples, length)', np.shape(
                X_train)
        if learningRate < 1e-5: learningRate = 1e-5

        final_seq = ghmm.SequenceSet(self.F, X_train)
        for i in xrange(max_iter):
            ret = self.ml.baumWelch(final_seq,
                                    nrSteps=nrSteps,
                                    learningRate=learningRate)

            if np.isnan(ret):
                print 'Baum Welch return:', ret
                return 'Failure'
            if i > 0:
                if abs(last_ret - ret) < 1.0:
                    print "Partial fitting is converged to ", ret, " from ", last_ret
                    break
            last_ret = ret

        print 'Baum Welch return:', ret / float(nData)

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        return ret
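The stopping rule compares successive Baum-Welch returns and exits once they differ by less than 1.0; isolated, with made-up likelihood values:

rets = [-520.0, -410.5, -409.9, -409.8]
last_ret = None
for i, ret in enumerate(rets):
    if i > 0 and abs(last_ret - ret) < 1.0:
        print("converged to %s from %s" % (ret, last_ret))
        break
    last_ret = ret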
Example #15
def _trainModels(tdata, models):
    """Train models using every data element designated from the _assign
    functions.  
    
    Note: this function is independent of the type of data split used.
    """
    for i in range(len(models)):

        #Create a sequence set used for training from the multiple observations
        seqSet = ghmm.SequenceSet(ghmm.Float(), [])
        for tmpData in tdata[i]:
            seqSet.merge(ghmm.EmissionSequence(ghmm.Float(), tmpData))

        #Make average sequence
        s = numpy.array(tdata[i])
        nm = hmmsup.obsToModel(s.mean(axis=0), max(s.std(axis=0)))
        nm.normalize()
        nm.baumWelch(seqSet)
        models[i] = nm
        #models[i].baumWelch(seqSet)#, loglikelihoodCutoff = 0.000001)
        hmmsup.normalizeAMat(models[i])
        hmmsup.normalizePiMat(models[i])
    return models
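A minimal sketch of the empty-set-plus-merge idiom used above, with toy continuous data:

import ghmm

seqSet = ghmm.SequenceSet(ghmm.Float(), [])
for tmpData in [[0.1, 0.2, 0.3], [0.4, 0.5]]:
    seqSet.merge(ghmm.EmissionSequence(ghmm.Float(), tmpData))
print(len(seqSet))  # 2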
Example #16
    def fit(self, xData1, A=None, B=None, pi=None, cov_mult=[1.0]*1, verbose=False, \
            ml_pkl='ml_temp_1d.pkl', use_pkl=False):
        ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
        X1 = np.array(xData1)

        if A is None:
            if verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

            # print 'A', A

        if B is None:
            if verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.

            mu, sig = self.vectors_to_mean_sigma(X1, self.nState)
            B = np.vstack([mu, sig * cov_mult[0]
                           ]).T.tolist()  # Must be [i,:] = [mu, sig]

        if pi is None:
            # pi - initial probabilities per state
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # print 'Generating HMM'
        # HMM model object
        self.ml = ghmm.HMMFromMatrices(self.F,
                                       ghmm.GaussianDistribution(self.F), A, B,
                                       pi)
        X_train = X1.tolist()

        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000)
        print 'Baum Welch return:', ret
        if np.isnan(ret): return 'Failure'

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        #--------------- learning for anomaly detection ----------------------------
        [A, B, pi] = self.ml.asMatrices()
        n, m = np.shape(X1)
        self.nGaussian = self.nState

        # Get average loglikelihood threshold wrt progress
        self.std_coff = 1.0
        g_mu_list = np.linspace(0, m - 1,
                                self.nGaussian)  #, dtype=np.dtype(np.int16))
        g_sig = float(m) / float(self.nGaussian) * self.std_coff

        # print 'g_mu_list:', g_mu_list
        # print 'g_sig:', g_sig

        ######################################################################################
        if os.path.isfile(ml_pkl) and use_pkl:
            with open(ml_pkl, 'rb') as f:
                d = pickle.load(f)
                self.l_statePosterior = d[
                    'state_post']  # time x state division
                self.ll_mu = d['ll_mu']
                self.ll_std = d['ll_std']
        else:
            if self.cluster_type == 'time':
                print 'Beginning parallel job'
                r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)(
                    i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                    g_mu_list[i], g_sig, self.nState)
                                        for i in xrange(self.nGaussian))
                # r = [self.learn_likelihoods_progress_par(i, n, m, A, B, pi, X_train, g_mu_list[i], g_sig) for i in xrange(self.nGaussian)]
                print 'Completed parallel job'
                l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)
            elif self.cluster_type == 'state':
                self.km = None
                self.ll_mu = None
                self.ll_std = None
                self.ll_mu, self.ll_std = self.state_clustering(X1)
                path_mat = np.zeros((self.nState, m * n))
                likelihood_mat = np.zeros((1, m * n))
                self.l_statePosterior = None

            d = dict()
            d['state_post'] = self.l_statePosterior
            d['ll_mu'] = self.ll_mu
            d['ll_std'] = self.ll_std
            ut.save_pickle(d, ml_pkl)
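The progress Gaussians are centred with np.linspace over the sequence length; for instance, with m = 10 observations and nGaussian = 5 (toy numbers):

import numpy as np

m, nGaussian, std_coff = 10, 5, 1.0
g_mu_list = np.linspace(0, m - 1, nGaussian)    # [0.  2.25 4.5  6.75 9. ]
g_sig = float(m) / float(nGaussian) * std_coff  # 2.0
print(g_mu_list, g_sig)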
Example #17
    def fit(self,
            xData1,
            xData2,
            xData3,
            A=None,
            B=None,
            pi=None,
            cov_mult=[100.0] * 9,
            verbose=False,
            ml_pkl='ml_temp.pkl',
            use_pkl=False):
        ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
        X1 = np.array(xData1)
        X2 = np.array(xData2)
        X3 = np.array(xData3)

        if A is None:
            if verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

            # print 'A', A

        if B is None:
            if verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.

            mu1, mu2, mu3, cov = self.vectors_to_mean_cov(
                X1, X2, X3, self.nState)
            cov[:, 0, 0] *= cov_mult[0]  #1.5 # to avoid No convergence warning
            cov[:, 1, 0] *= cov_mult[1]  #5.5 # to avoid No convergence warning
            cov[:, 2, 0] *= cov_mult[2]
            cov[:, 0, 1] *= cov_mult[3]
            cov[:, 1, 1] *= cov_mult[4]
            cov[:, 2, 1] *= cov_mult[5]
            cov[:, 0, 2] *= cov_mult[6]
            cov[:, 1, 2] *= cov_mult[7]
            cov[:, 2, 2] *= cov_mult[8]

            print 'mu1:', mu1
            print 'mu2:', mu2
            print 'mu3:', mu3
            print 'cov', cov

            # Emission probability matrix
            B = [0.0] * self.nState
            for i in range(self.nState):
                B[i] = [[mu1[i], mu2[i], mu3[i]],
                        [
                            cov[i, 0, 0], cov[i, 0, 1], cov[i, 0, 2],
                            cov[i, 1, 0], cov[i, 1, 1], cov[i, 1, 2],
                            cov[i, 2, 0], cov[i, 2, 1], cov[i, 2, 2]
                        ]]
        if pi is None:
            # pi - initial probabilities per state
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # HMM model object
        self.ml = ghmm.HMMFromMatrices(
            self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)
        X_train = self.convert_sequence(X1, X2, X3)  # Training input
        X_train = X_train.tolist()

        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000)
        print 'Baum Welch return:', ret

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)
        # print 'B\'s shape:', self.B.shape, self.B[0].shape, self.B[1].shape
        # print B[0]
        # print B[1]

        #--------------- learning for anomaly detection ----------------------------
        [A, B, pi] = self.ml.asMatrices()
        n, m = np.shape(X1)
        self.nGaussian = self.nState

        # Get average loglikelihood threshold wrt progress
        self.std_coff = 1.0
        g_mu_list = np.linspace(0, m - 1,
                                self.nGaussian)  #, dtype=np.dtype(np.int16))
        g_sig = float(m) / float(self.nGaussian) * self.std_coff

        print 'g_mu_list:', g_mu_list
        print 'g_sig:', g_sig

        ######################################################################################
        if os.path.isfile(ml_pkl) and use_pkl:
            with open(ml_pkl, 'rb') as f:
                d = pickle.load(f)
                self.l_statePosterior = d[
                    'state_post']  # time x state division
                self.ll_mu = d['ll_mu']
                self.ll_std = d['ll_std']
        else:
            n_jobs = -1
            print 'Beginning parallel job'
            r = Parallel(n_jobs=n_jobs)(delayed(learn_likelihoods_progress)(
                i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                g_mu_list[i], g_sig, self.nState)
                                        for i in xrange(self.nGaussian))
            print 'Completed parallel job'
            l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)

            d = dict()
            d['state_post'] = self.l_statePosterior
            d['ll_mu'] = self.ll_mu
            d['ll_std'] = self.ll_std
            with open(ml_pkl, 'wb') as f:
                pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #18
                      [[1.5, -1.0], [1.5, 1.5], [0.75, 0.25]],
                      [[1.5, -1.0], [1.5, 1.5], [0.5, 0.5]],
                      [[1.5, -1.0], [1.5, 1.5], [0.25, 0.75]]]
    # [p1_mean, p2_mean], [p1_std, p2_std], [P(p1), P(p2)]
    model = ghmm.HMMFromMatrices(F, ghmm.GaussianMixtureDistribution(F),
                                 Transitionmatrix, Emissionmatrix, pi)
else:
    Emissionmatrix = [[params['bound'] * 100.0, 1.0], [2.0, 0.5], [1.0, 0.5],
                      [-1.0, 0.5], [-2.0, 0.5]]
    # [mean, std]
    model = ghmm.HMMFromMatrices(F, ghmm.GaussianDistribution(F),
                                 Transitionmatrix, Emissionmatrix, pi)

print('Model before training:')
print(model)
mghmm_train = ghmm.SequenceSet(F, train_set)
model.baumWelch(mghmm_train, 10000, 0.01)
print('Model after training:')
print(model)
model.write(out_hmm)

###------------------------------------------------

###------------------------------------------------
# calculate tail length using the mghmm model and write them to output files
# dict_tl structure: {gene_name : [list of tail lengths]}
pwrite(f_log, '\nCalculating tail-lengths and writing outputs...' + timer())
lst_tl = []  # for storing gene_name, tail-length pairs
counting = 0
rounds = 0
counting_sum = 0
Example #19
    def createSequenceSet(self, qtc, symbols):
        return gh.SequenceSet(symbols, qtc)
Example #20
    def train(self, hmm, sset):
        sset = [array_flatten(s) for s in sset]
        hmm_ = self._get_hmm(hmm)
        hmm_.baumWelch(ghmm.SequenceSet(DOMAIN, sset))
        hmm.A, hmm.B, hmm.pi = hmm_.asMatrices()
Example #21
def _sequence_set_from_list(l):
    # Conversion is similar to _sequence_from_data but here data is a list.
    unrolled = [matrix.ravel().tolist() for matrix in l]
    seq = impl.SequenceSet(impl.Float(), unrolled)
    return seq
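ravel() flattens each matrix row-major before the set is built; a quick check:

import numpy as np

l = [np.arange(4).reshape(2, 2), np.ones((1, 3))]
unrolled = [matrix.ravel().tolist() for matrix in l]
print(unrolled)  # [[0, 1, 2, 3], [1.0, 1.0, 1.0]]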
Example #22
# %%
import ghmm

# %%
sigma = ghmm.IntegerRange(1, 7)

train_seq = ghmm.SequenceSet(
    sigma,
    [[1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1]])

A = [[0.99, 0.01], [0.99, 0.01]]

B = [[1.0 / 6] * 6] * 2

pi = [0.5] * 2

m = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, pi)

m.baumWelch(train_seq, 100000000, 0.000000000000001)

print(m.asMatrices())
# %%
print(map(sigma.external, m.sampleSingle(20)))
# %%
v = m.viterbi(test_seq)  # test_seq is not defined in this excerpt
print v

# %%
my_seq = ghmm.EmissionSequence(sigma, [1] * 20 + [6] * 10 + [1] * 40)
print m.viterbi(my_seq)
Example #23
def extractHMMFeatures(sourceFiles):
    """ Extracts HMM-similarity features from all files in a given directory """
    allTraces = [] # List to store all traces for the HMM-similarity extraction     
    try:
        for targetFile in sourceFiles:
            if os.path.exists(targetFile.replace(".c", ".seq")):
                instructionAlphaSequence = open(targetFile.replace(".c", ".seq")).read()
                allTraces.append( (instructionAlphaSequence, targetFile.replace(".c", ".hmm"), loadLabelFromFile(targetFile.replace(".c", ".metadata"))[0])) #TODO: Append a tuple of (trace, filename, cluster) for each data sample
        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature extraction. Skipping", "warning")
        else:
            allClusters = []
            # Retrieve list of clusters
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if not trace[2] in allClusters:
                    allClusters.append(trace[2])
            # Gather traces belonging to different clusters
            clusterTraces = []
            for cluster in allClusters:
                currentCluster = []
                for trace in allTraces:
                    if trace[2] == cluster:
                        currentCluster.append(trace[0])
                clusterTraces.append(currentCluster)
                prettyPrint("Retrieved %s instances for cluster %s" % (len(currentCluster), cluster))
            # Should wind up with list of lists each of which depict traces of a cluster
            allHMMs = []
            for cluster in allClusters:
                # Build HMM for each cluster and use it to calculate likelihoods for all instances
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)
                trainingSequences =  clusterTraces[ allClusters.index(cluster) ]
                # Retrieve number of observations
                observations = []
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in observations:
                            observations.append(o)
                # Prepare matrices for HMM
                A = numpy.random.random((len(allClusters), len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters), len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)
                # Build HMM and train it using Baum-Welch algorithm
                clusterHMM = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(ghmm.SequenceSet(clusterHMM.emissionDomain, trainingSequences))
                # Add that to list of all HMM's
                allHMMs.append((clusterHMM, observations))
            # Finally, for every trace, calculate the feature vectors
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for hmm in allHMMs:
                    # Make sure sequences contains observations supported by the current HMM
                    sequence = []
                    for obs in trace[0]:
                        if obs in hmm[1]:
                            sequence.append(obs)
                    # Calculate the likelihood
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(hmm[1]), sequence)
                    featureVector.append(hmm[0].loglikelihood(sequence))
                # Write the completed feature vector once per trace
                featureFile = open(trace[1], "w")
                featureFile.write(str(featureVector))
                featureFile.close()
        #############################################################################

    except Exception as e:
        prettyPrint("Error encoutered: %s" % e, "error")
        return False
        
    return True 
Example #24
                # Emission Probabilities
                B = calculate_emission_probabilities(train_data, n_components,
                                                     vocab_len)

                # Initial State Distribution
                pi = [
                    1.0 / n_components
                ] * n_components  # Equally distribute the starting probabilities

                m = ghmm.HMMFromMatrices(sigma,
                                         ghmm.DiscreteDistribution(sigma), A,
                                         B, pi)

                m.baumWelch(
                    ghmm.SequenceSet(sigma, train_data),
                    nrSteps=1000,
                    loglikelihoodCutoff=0.00005
                )  # Defaults: nrSteps=500, loglikelihoodCutoff=0.0001
                # print('Training Done')

                # print(m.asMatrices()[0])
                # print(m.asMatrices()[1])
                # print(m.asMatrices()[2])

                total_checked = 0
                total_correct = 0

                threshold_checked = 0
                threshold_correct = 0
Example #25
    def fit(self, xData, A=None, B=None, pi=None, cov_mult=None,
            ml_pkl=None, use_pkl=False, cov_type='full', fixed_trans=0,\
            shuffle=False):
        '''
        Input :
        - xData: dimension x sample x length
        Issues:
        - If NaN is returned, the reason can be one of followings,
        -- lower cov
        -- small range of xData (you have to scale it up.)
        '''

        # Daehyung: What is the shape and type of input data?
        if shuffle:
            X = xData
            X = np.swapaxes(X, 0, 1)
            id_list = range(len(X))
            random.shuffle(id_list)
            X = np.array(X)[id_list]
            X = np.swapaxes(X, 0, 1)
        else:
            X = [np.array(data) for data in xData]
        nData = len(xData[0])

        param_dict = {}

        # Load pre-trained HMM without training
        if use_pkl and ml_pkl is not None and os.path.isfile(ml_pkl):
            if self.verbose: print "Load HMM parameters without train the hmm"

            param_dict = ut.load_pickle(ml_pkl)
            self.A = param_dict['A']
            self.B = param_dict['B']
            self.pi = param_dict['pi']
            if self.nEmissionDim == 1:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                               self.A, self.B, self.pi)
            else:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                               self.A, self.B, self.pi)

            out_a_num = param_dict.get('out_a_num', None)
            vec_num = param_dict.get('vec_num', None)
            mat_num = param_dict.get('mat_num', None)
            u_denom = param_dict.get('u_denom', None)
            if out_a_num is not None:
                self.ml.setBaumWelchParams(out_a_num, vec_num, mat_num,
                                           u_denom)

            return True
        else:

            if ml_pkl is None:
                ml_pkl = os.path.join(os.path.dirname(__file__),
                                      'ml_temp_n.pkl')

            if cov_mult is None:
                cov_mult = [1.0] * (self.nEmissionDim**2)

            if A is None:
                if self.verbose: print "Generating a new A matrix"
                # Transition probability matrix (Initial transition probability, TODO?)
                A = util.init_trans_mat(self.nState).tolist()

            if B is None:
                if self.verbose: print "Generating a new B matrix"
                # We should think about multivariate Gaussian pdf.

                mus, cov = util.vectors_to_mean_cov(X,
                                                    self.nState,
                                                    self.nEmissionDim,
                                                    cov_type=cov_type)
                ## print np.shape(mus), np.shape(cov)

                # cov: state x dim x dim
                for i in xrange(self.nEmissionDim):
                    for j in xrange(self.nEmissionDim):
                        cov[:, i, j] *= cov_mult[self.nEmissionDim * i + j]

                if self.verbose:
                    for i, mu in enumerate(mus):
                        print 'mu%i' % i, mu
                    ## print 'cov', cov

                # Emission probability matrix
                B = [0] * self.nState
                for i in range(self.nState):
                    if self.nEmissionDim > 1:
                        B[i] = [[mu[i] for mu in mus]]
                        B[i].append(cov[i].flatten().tolist())
                    else:
                        B[i] = [np.squeeze(mus[0][i]), float(cov[i])]
            if pi is None:
                # pi - initial probabilities per state
                ## pi = [1.0/float(self.nState)] * self.nState
                pi = [0.0] * self.nState
                pi[0] = 1.0

            # print 'Generating HMM'
            # HMM model object
            if self.nEmissionDim == 1:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.GaussianDistribution(self.F), \
                                               A, B, pi)
            else:
                self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), \
                                               A, B, pi)
            if cov_type == 'diag': self.ml.setDiagonalCovariance(1)

            # print 'Creating Training Data'
            X_train = util.convert_sequence(X)  # Training input
            X_train = X_train.tolist()
            if self.verbose: print "training data size: ", np.shape(X_train)

            if self.verbose:
                print 'Run Baum Welch method with (samples, length)', np.shape(
                    X_train)
            final_seq = ghmm.SequenceSet(self.F, X_train)
            ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
            ret = self.ml.baumWelch(final_seq,
                                    10000)  #, fixedTrans=fixed_trans)
            if np.isnan(ret):
                print 'Baum Welch return:', ret
                return 'Failure'
            print 'Baum Welch return:', ret / float(nData)

            [self.A, self.B, self.pi] = self.ml.asMatrices()
            self.A = np.array(self.A)
            self.B = np.array(self.B)

            param_dict['A'] = self.A
            param_dict['B'] = self.B
            param_dict['pi'] = self.pi

            try:
                [out_a_num, vec_num, mat_num,
                 u_denom] = self.ml.getBaumWelchParams()
                param_dict['out_a_num'] = out_a_num
                param_dict['vec_num'] = vec_num
                param_dict['mat_num'] = mat_num
                param_dict['u_denom'] = u_denom
            except:
                print "Install new ghmm!!"

            if ml_pkl is not None: ut.save_pickle(param_dict, ml_pkl)
            return ret / float(nData)
Example #26
    def fit(self, xData, A=None, B=None, pi=None, cov_mult=None,
            ml_pkl=None, use_pkl=False):

        if ml_pkl is None:
            ml_pkl = os.path.join(os.path.dirname(__file__), 'ml_temp_n.pkl')            
        
        if cov_mult is None:
            cov_mult = [1.0]*(self.nEmissionDim**2)

        # Daehyung: What is the shape and type of input data?
        X = [np.array(data) for data in xData]

        if A is None:
            if self.verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

        if B is None:
            if self.verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.  

            mus, cov = self.vectors_to_mean_cov(X, self.nState)

            for i in xrange(self.nEmissionDim):
                for j in xrange(self.nEmissionDim):
                    cov[:, j, i] *= cov_mult[self.nEmissionDim*i + j]

            if self.verbose:
                for i, mu in enumerate(mus):
                    print 'mu%i' % i, mu
                print 'cov', cov
                
            # Emission probability matrix
            B = [0] * self.nState
            for i in range(self.nState):
                B[i] = [[mu[i] for mu in mus]]
                B[i].append(cov[i].flatten())
        if pi is None:
            # pi - initial probabilities per state 
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # print 'Generating HMM'
        # HMM model object
        self.ml = ghmm.HMMFromMatrices(self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)
        # print 'Creating Training Data'
        X_train = self.convert_sequence(X) # Training input
        X_train = X_train.tolist()
        
        if self.verbose: print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000)
        print 'Baum Welch return:', ret
        if np.isnan(ret): return 'Failure'

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        #--------------- learning for anomaly detection ----------------------------
        [A, B, pi] = self.ml.asMatrices()
        n, m = np.shape(X[0])
        self.nGaussian = self.nState

        if self.check_method == 'change' or self.check_method == 'globalChange':
            # Get maximum change of loglikelihood over whole time
            ll_delta_logp = []
            for j in xrange(n):    
                l_logp = []                
                for k in xrange(1, m):
                    final_ts_obj = ghmm.EmissionSequence(self.F, X_train[j][:k*self.nEmissionDim])
                    logp         = self.ml.loglikelihoods(final_ts_obj)[0]

                    l_logp.append(logp)
                l_delta_logp = np.array(l_logp[1:]) - np.array(l_logp[:-1])                    
                ll_delta_logp.append(l_delta_logp)

            self.l_mean_delta = np.mean(abs(np.array(ll_delta_logp).flatten()))
            self.l_std_delta = np.std(abs(np.array(ll_delta_logp).flatten()))

            if self.verbose: 
                print "mean_delta: ", self.l_mean_delta, " std_delta: ", self.l_std_delta
        
        
        if self.check_method == 'global' or self.check_method == 'globalChange':
            # Get average loglikelihood threshold over whole time

            l_logp = []
            for j in xrange(n):
                for k in xrange(1, m):
                    final_ts_obj = ghmm.EmissionSequence(self.F, X_train[j][:k*self.nEmissionDim])
                    logp         = self.ml.loglikelihoods(final_ts_obj)[0]

                    l_logp.append(logp)

            self.l_mu = np.mean(l_logp)
            self.l_std = np.std(l_logp)
            
        elif self.check_method == 'progress':
            # Get average loglikelihood threshold wrt progress

            if os.path.isfile(ml_pkl) and use_pkl:
                if self.verbose: print 'Load detector parameters'
                d = ut.load_pickle(ml_pkl)
                self.l_statePosterior = d['state_post'] # time x state division
                self.ll_mu            = d['ll_mu']
                self.ll_std           = d['ll_std']
            else:
                if self.cluster_type == 'time':                
                    if self.verbose: print 'Beginning parallel job'
                    self.std_coff  = 1.0
                    g_mu_list = np.linspace(0, m-1, self.nGaussian) #, dtype=np.dtype(np.int16))
                    g_sig = float(m) / float(self.nGaussian) * self.std_coff
                    r = Parallel(n_jobs=-1)(delayed(learn_likelihoods_progress)(i, n, m, A, B, pi, self.F, X_train,
                                                                           self.nEmissionDim, g_mu_list[i], g_sig, self.nState)
                                                                           for i in xrange(self.nGaussian))
                    if self.verbose: print 'Completed parallel job'
                    l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)

                elif self.cluster_type == 'state':
                    self.km = None                    
                    self.ll_mu = None
                    self.ll_std = None
                    self.ll_mu, self.ll_std = self.state_clustering(X)
                    path_mat  = np.zeros((self.nState, m*n))
                    likelihood_mat = np.zeros((1, m*n))
                    self.l_statePosterior=None
                    
                d = dict()
                d['state_post'] = self.l_statePosterior
                d['ll_mu'] = self.ll_mu
                d['ll_std'] = self.ll_std
                ut.save_pickle(d, ml_pkl)
Example #27
    def fit(self,
            xData1,
            xData2=None,
            A=None,
            B=None,
            pi=None,
            cov_mult=(1.0, 1.0, 1.0, 1.0),
            verbose=False,
            ml_pkl='ml_temp.pkl',
            use_pkl=False):
        ml_pkl = os.path.join(os.path.dirname(__file__), ml_pkl)
        X1 = np.array(xData1)
        X2 = np.array(xData2)

        if A is None:
            if verbose: print "Generating a new A matrix"
            # Transition probability matrix (Initial transition probability, TODO?)
            A = self.init_trans_mat(self.nState).tolist()

        if B is None:
            if verbose: print "Generating a new B matrix"
            # We should think about multivariate Gaussian pdf.

            mu1, mu2, cov = self.vectors_to_mean_cov(X1, X2, self.nState)
            cov[:, 0, 0] *= cov_mult[0]  #1.5 # to avoid No convergence warning
            cov[:, 1, 0] *= cov_mult[1]  #5.5 # to avoid No convergence warning
            cov[:, 0, 1] *= cov_mult[2]  #5.5 # to avoid No convergence warning
            cov[:, 1, 1] *= cov_mult[3]  #5.5 # to avoid No convergence warning

            # Emission probability matrix
            B = [0.0] * self.nState
            for i in range(self.nState):
                B[i] = [[mu1[i], mu2[i]],
                        [
                            cov[i, 0, 0], cov[i, 0, 1], cov[i, 1, 0], cov[i, 1,
                                                                          1]
                        ]]

        if pi is None:
            # pi - initial probabilities per state
            ## pi = [1.0/float(self.nState)] * self.nState
            pi = [0.0] * self.nState
            pi[0] = 1.0

        # HMM model object
        self.ml = ghmm.HMMFromMatrices(
            self.F, ghmm.MultivariateGaussianDistribution(self.F), A, B, pi)
        X_train = self.convert_sequence(X1, X2)  # Training input
        X_train = X_train.tolist()

        print 'Run Baum Welch method with (samples, length)', np.shape(X_train)
        final_seq = ghmm.SequenceSet(self.F, X_train)
        ## ret = self.ml.baumWelch(final_seq, loglikelihoodCutoff=2.0)
        ret = self.ml.baumWelch(final_seq, 10000)
        print 'Baum Welch return:', ret

        [self.A, self.B, self.pi] = self.ml.asMatrices()
        self.A = np.array(self.A)
        self.B = np.array(self.B)

        #--------------- learning for anomaly detection ----------------------------
        [A, B, pi] = self.ml.asMatrices()
        n, m = np.shape(X1)
        self.nGaussian = self.nState

        if self.check_method == 'change' or self.check_method == 'globalChange':
            # Get maximum change of loglikelihood over whole time
            ll_delta_logp = []
            for j in xrange(n):
                l_logp = []
                for k in xrange(1, m):
                    final_ts_obj = ghmm.EmissionSequence(
                        self.F, X_train[j][:k * self.nEmissionDim])
                    logp = self.ml.loglikelihoods(final_ts_obj)[0]

                    l_logp.append(logp)
                l_delta_logp = np.array(l_logp[1:]) - np.array(l_logp[:-1])
                ll_delta_logp.append(l_delta_logp)

            self.l_mean_delta = np.mean(abs(np.array(ll_delta_logp).flatten()))
            self.l_std_delta = np.std(abs(np.array(ll_delta_logp).flatten()))

            print "mean_delta: ", self.l_mean_delta, " std_delta: ", self.l_std_delta

        if self.check_method == 'global' or self.check_method == 'globalChange':
            # Get average loglikelihood threshold over whole time

            l_logp = []
            for j in xrange(n):
                for k in xrange(1, m):
                    final_ts_obj = ghmm.EmissionSequence(
                        self.F, X_train[j][:k * self.nEmissionDim])
                    logp = self.ml.loglikelihoods(final_ts_obj)[0]

                    l_logp.append(logp)

            self.l_mu = np.mean(l_logp)
            self.l_std = np.std(l_logp)

        elif self.check_method == 'progress':
            # Get average loglikelihood threshold wrt progress
            self.std_coff = 1.0
            g_mu_list = np.linspace(
                0, m - 1, self.nGaussian)  #, dtype=np.dtype(np.int16))
            g_sig = float(m) / float(self.nGaussian) * self.std_coff

            ######################################################################################
            if os.path.isfile(ml_pkl) and use_pkl:
                with open(ml_pkl, 'rb') as f:
                    d = pickle.load(f)
                    self.l_statePosterior = d[
                        'state_post']  # time x state division
                    self.ll_mu = d['ll_mu']
                    self.ll_std = d['ll_std']
            else:
                n_jobs = -1
                r = Parallel(n_jobs=n_jobs)(
                    delayed(learn_likelihoods_progress)(
                        i, n, m, A, B, pi, self.F, X_train, self.nEmissionDim,
                        g_mu_list[i], g_sig, self.nState)
                    for i in xrange(self.nGaussian))
                l_i, self.l_statePosterior, self.ll_mu, self.ll_std = zip(*r)

                d = dict()
                d['state_post'] = self.l_statePosterior
                d['ll_mu'] = self.ll_mu
                d['ll_std'] = self.ll_std
                with open(ml_pkl, 'wb') as f:
                    pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #28
    def get_seqs_set(self, obs_seqs):
        obs_seqs = self.get_obs_seqs(obs_seqs)
        obs_seqs_set = ghmm.SequenceSet(self.emission_domain, obs_seqs)
        return obs_seqs_set