def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")
        best_model = None
        for n in range(self.min_n_components, self.max_n_components+1):
            try:
                other_words_score = 0.0
                quantity = 0.0
                model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                this_word_score = model.score(self.X, self.lengths)
                for word in self.hwords:
                    if word != self.this_word:
                        quantity += 1
                        X, lengths = self.hwords[word]
                        other_words_score += model.score(X, lengths)
                # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                score = this_word_score - (other_words_score / quantity)
                if score > best_score:
                    best_score = score
                    best_model = model
            except:
                continue
        return best_model
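For reference, the criterion maximized above is a DIC-style score: logL on this word minus the average logL over the competing words. A minimal worked sketch of the arithmetic only (all values invented):

# Hedged sketch of the scoring arithmetic; the log-likelihoods are made up.
this_word_score = -1200.0                          # logL of the model on its own word
other_words_score = -1500.0 + -1450.0 + -1480.0    # summed logL on 3 competing words
quantity = 3.0
score = this_word_score - (other_words_score / quantity)
print(score)  # -1200 - (-1476.67) = 276.67; higher = more discriminative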
def mainHMM(filePrefix):
    X_train, length_train, X_test, length_test = loadOneRoute(filePrefix)
    # Run Gaussian HMM
    print "fitting to HMM and decoding ..."
    model = GaussianHMM(n_components=4, covariance_type="diag", n_iter=2000).fit(X_train[:, 0:5], length_train)
    hidden_states = model.predict(X_test[:, 0:5], length_test)
    print "done"

    print(hidden_states[0:20])
    print(hidden_states[20:40])
    print(hidden_states[40:60])
    print(hidden_states[60:80])

    # Print trained parameters and plot
    print("Transition matrix")
    print(model.transmat_)
    print("Start Prob")
    print(model.startprob_)

    print("Means and vars of each hidden state")
    for i in range(model.n_components):
        print("{0}th hidden state".format(i))
        print("mean = ", model.means_[i])
        print("var = ", np.diag(model.covars_[i]))


    print(np.array(hidden_states).reshape((sum(length_test), 1)))
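loadOneRoute and its data files are not shown; below is a self-contained sketch of the same fit/predict/inspect pattern on synthetic data (an illustration, not the original pipeline):

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
# two synthetic regimes with different means, 5 features each
X_train = np.vstack([rng.normal(0, 1, (100, 5)), rng.normal(5, 1, (100, 5))])
model = GaussianHMM(n_components=2, covariance_type="diag", n_iter=100).fit(X_train)
print(model.transmat_)              # learned transition matrix
print(model.predict(X_train)[:20])  # decoded hidden-state sequence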
 def addModel(self, nom, data, nbEtats, n_iter, startprob_prior=None, transmat_prior=None):
     '''
     Adds a model to tabModels.

     Parameters:
     nom = name of the model
     data = three-dimensional array representing a cluster of movements, each movement itself holding positions
     nbEtats = number of hidden states for each model
     n_iter = number of iterations for the Baum-Welch algorithm
     startprob_prior = prior initial-state matrix
     transmat_prior = prior state-transition matrix
     '''
     model = GaussianHMM(nbEtats, covariance_type="diag", n_iter=n_iter, startprob_prior=startprob_prior, transmat_prior=transmat_prior)      
     model.fit(data)
     verif_set_transMat(model)
     taille = len(self.tabModels)
     if(taille == 0):
         self.tabModels.append([nom])
         self.tabModels[0].append(model)
         return
     for i in range(taille):        
         if(self.tabModels[i][0] == nom):
             self.tabModels[i].append(model)
             return
     self.tabModels.append([nom])
     self.tabModels[-1].append(model)
Example #4
def fit_batch(traj_data, n_components=2, subsample_factor=1,
              features=['speed', 'rotation'], **kwargs):
    '''
    Fits model to concatenated traj_data
    Args:
        traj_data - list of paths of training dataset (trajectory csv)
        n_components - number of hidden states
        subsample_factor - subsample factor to apply to all files
        features - columns to fit model to
        **kwargs passed to GaussianHMM
    Returns:
        model - fitted model
    '''
    # Concatenate data
    feature_list = []
    lengths_list = []
    for path in traj_data:
        X, l = features_from_csv(path, features=features,
                                 subsample_factor=subsample_factor)
        feature_list.append(X)
        lengths_list.append(l)
    print('Concatenating features...')
    X = np.vstack(feature_list)
    l = np.hstack(lengths_list)

    # Fit HMM
    print('Fitting model...')
    model = GaussianHMM(n_components, **kwargs)
    model.fit(X, lengths=l)

    return model
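A hypothetical call; the CSV paths below are placeholders, and features_from_csv is assumed from the surrounding project:

paths = ['traj/run1.csv', 'traj/run2.csv']   # hypothetical trajectory files
model = fit_batch(paths, n_components=3, subsample_factor=2,
                  covariance_type='diag', n_iter=500)
print(model.means_)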
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        record = float("inf")

        min_seq = min([len(seq) for seq in self.sequences])    
        self.max_n_components = min(self.max_n_components, min_seq)

        hmm_model = self.base_model(self.n_constant)
        for num in range(self.min_n_components,self.max_n_components+1,1):
            #print(num)
            try: 
                model = GaussianHMM(n_components= num, n_iter=1000).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                # p is the number of free parameters, N is the number of data points
                p = num*num + 2* num* len(self.X[0]) -1
                BIC = -2* logL + p * np.log(len(self.X))
                if BIC < record:
                    record = BIC
                    hmm_model = model  
            except:
                continue
                # print("failure on {} with {} states".format(self.this_word, num))         
        return hmm_model
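The free-parameter count p = n*n + 2*n*d - 1 used above is n(n-1) transition probabilities plus (n-1) start probabilities plus n*d means and n*d diagonal variances. A quick numeric check with made-up values:

import numpy as np
n, d, N = 5, 2, 120           # states, features, data points (invented)
p = n * n + 2 * n * d - 1     # 25 + 20 - 1 = 44 free parameters
logL = -800.0                 # hypothetical in-sample log-likelihood
BIC = -2 * logL + p * np.log(N)
print(BIC)                    # 1600 + 44 * log(120) ~= 1810.6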
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        record = float("-inf")

        min_seq = min([len(seq) for seq in self.sequences])    
        self.max_n_components = min(self.max_n_components, min_seq)

        hmm_model = self.base_model(self.n_constant)
        for num in range(self.min_n_components,self.max_n_components+1,1):
            #print(num)
            try: 
                model = GaussianHMM(n_components= num, n_iter=1000).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                tmp = 0
                for word in self.hwords:
                    X, lengths = self.hwords[word]
                    tmp += model.score(X,lengths)
                DIC = logL - (tmp-logL) /(len(self.hwords)-1)   
                if DIC > record:
                    record = DIC
                    hmm_model = model   
            except:
                continue
                # print("failure on {} with {} states".format(self.this_word, num))          
        return hmm_model
Example #7
	def fit(self):

		if self.verbose:
			print "[Clustering] Clearing old model and segmentation"
		
		self.segmentation = []
		self.model = []


		new_segments = []
		new_model = []

		g = GaussianHMM(n_components=self.n_components)

		all_demos = self._demonstrations[0]
		lens = [np.shape(self._demonstrations[0])[0]]
		for i in range(1, len(self._demonstrations)):
			all_demos = np.concatenate([all_demos,self._demonstrations[i]])
			lens.append(np.shape(self._demonstrations[i])[0])

		g.fit(all_demos,lens) 
			
		for d in self._demonstrations:
			new_segments.append(self.findTransitions(g.predict(d)))
			#print g.predict(d)
			new_model.append(g)

		self.segmentation = new_segments
		self.model = new_model
 def fit_HMM(self,error_metric):
     print "Looking for optimal number of states and fitting HMM"
     for i in xrange(2,5):
         candidate = GaussianHMM(n_components=i, covariance_type="full", n_iter=1000)
         candidate.fit(self.X_train)
         if error_metric == HMM_MAD:
             error = HMM_MAD(candidate,self.X_test)
             if i == 2:
                 best_guess = error
                 best_model = candidate
                 opt_n_states = i
             else:
                 if error < best_guess:
                     opt_n_states = i
                     best_model = candidate
                     best_guess = error
         else:
             error = error_metric(candidate,self.X_test)
             if i == 2:
                 best_guess = error
                 best_model = candidate
                 opt_n_states = i
             else:
                 if error > best_guess:
                     opt_n_states = i
                     best_model = candidate
                     best_guess = error
     self.model = best_model
     self.n_states = opt_n_states
     print "Done. Lowest error of {} achieved with {} states".format(best_guess, opt_n_states)
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('inf')
        best_model = None
        for n in range(self.min_n_components, self.max_n_components+1):
            try:
                model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                p = n ** 2 + 2 * n * len(self.X[0])  - 1
                N = len(self.X)
                score = -2 * logL + p * np.log(N)
                if score < best_score:
                    best_score = score
                    best_model = model
            except:
                continue
        return best_model
Example #10
def main(args):
    x, X = loadDiffRows(args.diffFile)
    # exclude 't' from init_params so the left-to-right transmat_ set below
    # survives fitting (fit() would otherwise re-initialize it)
    model = GaussianHMM(n_components=3,
                        covariance_type="diag",
                        n_iter=100000000000,
                        init_params="smc")
    model.transmat_ = numpy.array([[0.5, 0.5, 0.0],
                                   [0.0, 0.5, 0.5],
                                   [0.0, 0.0, 1.0]])
    model.fit(X)
    print(model.transmat_)
    model.transmat_[0][2] = 0.
    model.transmat_[1][0] = 0.
    model.transmat_[2][0] = 0.
    model.transmat_[2][1] = 0.
    
    exp = args.outFile.split('/')[-1].split('_')[0]
    with open(args.outFile, 'w') as fout:
        print('exp\tbin\treads\tstate', file=fout)
        for seq in X:
            hiddenStates = model.predict(seq)
            for idx,v in enumerate(zip(x,hiddenStates)):
                r,h = v
                print(exp + '\t' + str(idx) + '\t'
                      + str(r) + '\t' + str(h),
                      file=fout)
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        best_model = None
        if len(self.sequences) >= 3:
            n_splits = min(3, len(self.sequences))
            splits = KFold(n_splits)
            for n in range(self.min_n_components, self.max_n_components+1):
                try:
                    # reset the running average for each candidate n, so scores
                    # do not leak across different numbers of components
                    sum_score = 0.0
                    counter = 0.0
                    for train_index, test_index in splits.split(self.sequences):
                        # used forum code to get train/test X,Lengths respectively: https://discussions.udacity.com/t/selectorcv-crashes/400125
                        train_X, train_lengths = combine_sequences(train_index, self.sequences)
                        test_X, test_lengths = combine_sequences(test_index, self.sequences)
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                                random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
                        score = model.score(test_X, test_lengths)
                        sum_score += score
                        counter += 1
                    # used average score from udacity forum: https://discussions.udacity.com/t/my-selectorcv-class/349110
                    average_score = sum_score / counter
                    if average_score > best_score:
                        best_score = average_score
                        best_model = model
                except:
                    continue
        # for models with length less than 3
        else:
            best_score_1 = float('inf')
            best_model = None
            for n in range(self.min_n_components, self.max_n_components+1):
                try:
                    model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    logL = model.score(self.X, self.lengths)
                    # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                    p = n ** 2 + 2 * n * len(self.X[0])  - 1
                    N = len(self.X)
                    score_1 = -2 * logL + p * np.log(N)
                    if score_1 < best_score_1:
                        best_score_1 = score_1
                        best_model = model
                except:
                    continue
        return best_model
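combine_sequences comes from the course's asl_utils helpers; a minimal sketch of the behavior this selector assumes (concatenate the indexed sequences and report their lengths):

import numpy as np

def combine_sequences(indices, sequences):
    # Assumed behavior: stack the selected sequences into one observation
    # array and return the per-sequence lengths that hmmlearn expects.
    chosen = [np.asarray(sequences[i]) for i in indices]
    X = np.vstack(chosen)
    lengths = [len(s) for s in chosen]
    return X, lengths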
Example #12
def fit_hmm(df, n_components, features=['speed', 'rotation'],
            **kwargs):
    '''
    Fits a Gaussian HMM to the velocity data
    Args:
        df - dataframe containing positional data to be processed
        n_components - number of hidden states
        features - features to use in model fitting
        **kwargs passed to GaussianHMM
    Returns:
        model
    '''
    X, lengths = get_features(df, features=features)
    model = GaussianHMM(n_components, **kwargs)
    model.fit(X, lengths=lengths)

    return model
Example #13
        def setup():

            def load_patterns(file):
                # Read one whitespace-delimited observation vector per line
                # and stack the rows into a single 2-D array.
                with open(file, 'r') as f:
                    rows = [np.reshape(list(map(float, line.split())), (1, -1))
                            for line in f if line.strip()]
                return np.vstack(rows)

            hidden = 1

            self.go_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('go.bin'))

            self.back_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('back.bin'))

            self.right_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('right.bin'))

            self.left_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('left.bin'))

            self.stop_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('stop.bin'))
Example #14
    def train(self, data, n_components):
        print("Training Data: %s" % data)
        self.data = data
        self.model = GaussianHMM(n_components, algorithm='viterbi', covariance_type='diag')
        X = np.reshape(data, (len(data), 1))
        self.model = self.model.fit(X)

        self.hidden_states = self.model.predict(X)
        print("Sequence of States: %s" % self.hidden_states)
Example #15
class HMM:
    __slots__ = [
        "model"
    ]

    def __init__(self):
        pass


    def draw(self, data):
        figure()
        plot(range(len(data)),data,alpha=0.8,color='red')
        show()


    def train(self, data, n_components):
        print("Training Data: %s" % data)
        self.data = data
        self.model = GaussianHMM(n_components, algorithm='viterbi', covariance_type='diag')
        X = np.reshape(data, (len(data), 1))
        self.model = self.model.fit(X)

        self.hidden_states = self.model.predict(X)
        print("Sequence of States: %s" % self.hidden_states)


    def eval(self, obs):
        print("Testing Data: %s" % obs)
        X = np.reshape(obs, (len(obs),1))
        print("Eval: %s" % str(self.model.score(X)))


    def plot(self):
        fig = figure(facecolor="white")
        ax = fig.add_subplot(111)

        for i in range(self.model.n_components):
            # use fancy indexing to plot data in each state
            idx = (self.hidden_states == i)
            ax.plot(np.array(range(len(self.data)))[idx], np.array(self.data)[idx], '.', label="State %d" % (i+1))

        ax.legend()
        show()
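Hypothetical usage of the class above on a toy one-dimensional series (assumes numpy and the pylab plotting helpers are imported):

h = HMM()
h.train([0.1, 0.2, 0.15, 5.0, 5.2, 4.9], n_components=2)
h.eval([0.12, 5.1])
h.plot()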
    def test_backward_with_hmmlearn(self):
        r = np.random.randn
        obs = [np.array([[-600 + r(), 100 + r()], [-300 + r(), 200 + r()], [0 + r(), 300 + r()]]) for _ in xrange(10)]
        hmm = GaussianHMM(n_components=3)
        hmm.fit(obs)

        # Calculate bwdlattice using hmmlearn algorithm
        framelogprob = hmm._compute_log_likelihood(obs[0])
        start = timeit.default_timer()
        bwdlattice1 = hmm._do_backward_pass(framelogprob)
        print('hmmlearn took %fs' % (timeit.default_timer() - start))

        # Calculate bwdlattice using fhmm algorithm with #chains = 1. This should yield the exact same results
        start = timeit.default_timer()
        bwdlattice2 = np.zeros(bwdlattice1.shape)
        fhmmc._backward(obs[0].shape[0], 1, hmm.n_components, [(x,) for x in xrange(hmm.n_components)],
                        hmm._log_startprob.reshape(1, 3), hmm._log_transmat.reshape(1, 3, 3), framelogprob, bwdlattice2)
        print('fhmm took %fs' % (timeit.default_timer() - start))
        self.assertTrue(np.allclose(bwdlattice1, bwdlattice2))
class HmmClassifier():
    def __init__(self, referenceSeqs, inputSeq):
        self.referenceSeqs = referenceSeqs
        self.inputSeq = inputSeq

        # feel free to change this model
        self.model = GaussianHMM(n_components=2, covariance_type="full", n_iter=2000)

    def predict(self):
        probs = []
        for referenceSeq in self.referenceSeqs:
            #print "reference: {}".format(referenceSeq)
            self.model.fit(referenceSeq)
            hidden_states = self.model.predict(referenceSeq)
            prob = self.model.score(self.inputSeq)
            probs.append(prob)

        # return the index of the max prob
        return probs.index(max(probs))
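Hypothetical usage with synthetic sequences; each reference is a 2-D array of observations, and predict returns the index of the best-matching reference:

import numpy as np

refs = [np.random.randn(50, 1), np.random.randn(50, 1) + 3.0]
query = np.random.randn(20, 1) + 3.0
clf = HmmClassifier(refs, query)
print(clf.predict())  # expected to be 1 for this shifted query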
def calculate_hmm_g(training_set, test_set, taxonomy, cursor, connection, settings):
    da_id_taxonomy = find_da_id(taxonomy, cursor)
    states, start_probability, transition_probability = start_transition_probability_extraction(training_set, taxonomy)
    n_states = len(states)

    feature_list = extract_features_training_set_gaus(training_set, taxonomy, settings)
    n_features = len(feature_list[states[0]][0])
    mean = calculate_means(states, feature_list, n_features)
    covariance = calculate_covariance(states, feature_list, n_features)
    # covariance = diag_cov(states, feature_list, n_features, mean)

    model = GaussianHMM(n_components=n_states, covariance_type='full')
    model.startprob_ = start_probability
    model.transmat_ = transition_probability
    model.means_ = mean
    model.covars_ = covariance

    test_seq, con_pathes = extract_features_test_set_gaus(test_set, taxonomy, settings)
    da_predictions(test_seq, model, con_pathes, states, da_id_taxonomy, taxonomy, cursor, connection)
Example #19
    def __init__(self, n_components=1, covariance_type='diag', min_covar=1e-3, startprob_prior=1.0,
                 transmat_prior=1.0, means_prior=0, means_weight=0, covars_prior=1e-2, covars_weight=1,
                 algorithm="viterbi", random_state=None, n_iter=5, tol=1e-2, verbose=False,
                 params="stmc", init_params="stmc", states_prior=None, fp_state=None):
        GaussianHMM.__init__(self, n_components=n_components, covariance_type=covariance_type,
                             min_covar=min_covar, startprob_prior=startprob_prior, transmat_prior=transmat_prior,
                             means_prior=means_prior, means_weight=means_weight,
                             covars_prior=covars_prior, covars_weight=covars_weight,
                             algorithm=algorithm, random_state=random_state,
                             n_iter=n_iter, tol=tol, verbose=verbose,
                             params=params, init_params=init_params)

        self.covariance_type = covariance_type
        self.min_covar = min_covar
        self.means_prior = means_prior
        self.means_weight = means_weight
        self.covars_prior = covars_prior
        self.covars_weight = covars_weight
        self.states_prior = states_prior
        self.fp_state = fp_state
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        record = float("-inf")

        min_seq = min([len(seq) for seq in self.sequences])    
        self.max_n_components = min(self.max_n_components, min_seq)
        hmm_model = self.base_model(self.n_constant)
        if len(self.sequences) == 1:
            return hmm_model
        elif len(self.sequences) == 2:
            split_method = KFold(n_splits=2)
            #self.max_n_components = 3
        else:
            split_method = KFold(n_splits=3,random_state=self.random_state)
        
        
        for num in range(self.min_n_components,self.max_n_components+1,1):
            #print(num)
            logL = 0
            cnt = 0
            
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                #print("Train fold indices:{} Test fold indices:{}".format(cv_train_idx, cv_test_idx))  # view indices of the folds
                X, lengths = combine_sequences(cv_train_idx,self.sequences)
                try:
                    model = GaussianHMM(n_components= num, n_iter=1000).fit(X, lengths)
                    X, lengths = combine_sequences(cv_test_idx,self.sequences)
                    logL += model.score(X, lengths)
                    cnt += 1
                except:
                    continue
                    #print("failure on {} with {} states".format(self.this_word, num))                      
            if cnt > 0 and logL/cnt > record:
                record = logL/cnt
                hmm_model = model
        return hmm_model
    def calculate_weights(self, date, amount):
        if self.stacked == False:
            for elements in self.tradingDates:
                if elements.get('dt') >= self.start_date and elements.get('dt') <= date :
                    self.trainingDates.append(elements['dt'])
            for assetCode in self.asset_codes:
                assetValues = []
#                 for each_date in self.trainingDates:
#                     assetValues.append(StockData.objects.filter(dt=each_date,ticker=assetCode).values("price_close")[0]['price_close'])
                assetValues = [StockData.objects.filter(dt=each_date,ticker=assetCode).values("price_close")[0]['price_close'] for each_date in self.trainingDates]    
                self.historical_Data[assetCode] = assetValues
            self.stacked = True
        else:
            assetValues = []
            for assetCode in self.asset_codes:
                self.historical_Data[assetCode].append(StockData.objects.filter(dt=date,ticker=assetCode).values("price_close")[0]['price_close'])    
        
        target = {'money': amount}    
        for assetCode in self.asset_codes:
            close_v = np.array(self.historical_Data[assetCode])
            diff = np.diff(close_v)
            X = np.column_stack([diff])
            model = GaussianHMM(n_components=2, covariance_type="diag", n_iter=1000).fit(X)
            hidden_states = model.predict(X)
            stableProb = 0
            if hidden_states[len(hidden_states) - 1] == 1:
                stableProb = model.transmat_[1][1]
            else:
                stableProb = 0
            target[assetCode] = stableProb
            target['money'] -= stableProb * close_v[len(close_v) - 1]
            
        self.weight = []
        self.weight.append(target['money'])
#         for assetCode in self.asset_codes:
#             self.weight.append(target[assetCode])
        self.weight += [target[assetCode] for assetCode in self.asset_codes]    
        return self.weight
def hmmtest(trade_data, test_data):
    # pack diff and volume for training
    # delete records containing infinity
    X = test_data[test_data['Strategy_Gross_Return_RDP_5'] != float("inf")]
    ###############################################################################
    # Run Gaussian HMM
    #print("fitting to HMM and decoding ...", end='')
    n_components = 4
    covariance_type = 'full'
    n_iter = 1000
    
    # make an HMM instance and execute fit
    model = GaussianHMM(n_components=n_components, covariance_type=covariance_type, n_iter=n_iter).fit(X)
    #model= GMMHMM(n_components=4,n_mix=3,covariance_type="diag", n_iter=100).fit(X)
    # model = MultinomialHMM(n_components=4, n_iter=100).fit(X)
    # predict the optimal sequence of internal hidden state
    hidden_states = model.predict(X)
    
    #print("done\n")
    
    ###############################################################################
    # print trained parameters and plot
    #print("Transition matrix")
    #print(model.transmat_)
    #print()
    
    print("means and vars of each hidden state")
    for i in range(model.n_components):
        print("%dth hidden state" % i)
        print("mean = ", model.means_[i])
        print("var = ", np.diag(model.covars_[i]))
        
        
    plotHmmState(model, hidden_states, trade_data)
    
    return model
Example #23
def bench_gaussian_hmm(size):
    title = "benchmarking Gaussian HMM on a sample of size {0}".format(size)
    print(title.center(36, " "))
    ghmm = GaussianHMM(n_components=2)
    # startprob_ and transmat_ must also be set before sampling in current hmmlearn
    ghmm.startprob_ = np.array([0.5, 0.5])
    ghmm.transmat_ = np.array([[0.9, 0.1], [0.1, 0.9]])
    ghmm.means_ = np.array([[42.0], [24.0]])
    ghmm.covars_ = np.array([[1.0], [1.0]])

    with timed_step("generating sample"):
        sample, _states = ghmm.sample(size)

    with timed_step("fitting"):
        fit = GaussianHMM(n_components=2).fit(sample)

    with timed_step("estimating states"):
        fit.predict(sample)
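timed_step is not shown; a plausible sketch, assuming it is a simple timing context manager:

import timeit
from contextlib import contextmanager

@contextmanager
def timed_step(name):
    # Assumed helper: print wall-clock time for the wrapped block.
    start = timeit.default_timer()
    yield
    print("{0}: {1:.3f}s".format(name, timeit.default_timer() - start))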
 def __create_model__(self, num_states, measurements, measurement_lengths):
     # with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     warnings.filterwarnings("ignore", category=RuntimeWarning)
     try:
         hmm_model = GaussianHMM(n_components=num_states, 
                                 covariance_type="diag", 
                                 n_iter=1000,
                                 random_state=self.random_seed, 
                                 verbose=False)\
                     .fit(measurements, measurement_lengths)
         if self.verbose:
             print("model created for {} with {} states".format(self.this_word, num_states))
         return hmm_model
     except Exception as e:
         if self.verbose:
             traceback.print_exc()
             print("failure on {} with {} states".format(self.this_word, num_states))
         raise
Example #25
 def base_model(self, num_states):
     # with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     # warnings.filterwarnings("ignore", category=RuntimeWarning)
     try:
         hmm_model = GaussianHMM(n_components=num_states,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)
         if self.verbose:
             print("model created for {} with {} states".format(
                 self.this_word, num_states))
         return hmm_model
     except:
         if self.verbose:
             print("failure on {} with {} states".format(
                 self.this_word, num_states))
         return None
def cmodel(company, dt1, dt2, num_of_states):
    
    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2) #Here we set the time range

    # Unpack the quotes !
    dates = np.array([q[0] for q in quotes], dtype=int)
    close_v = np.array([q[2] for q in quotes])

    # Take diff of close value and shift by 1
    
    diff = np.diff(close_v)  

    
    dates = dates[1:]
    close_v = close_v[1:]
    
    X = np.column_stack([diff])    
    
    # Create HMM instance and fit 
    model = GaussianHMM(n_components=num_of_states, covariance_type="full", n_iter=1000).fit(X)
    #print ("Model Covars: ", model.covars_)

    expected_days = 1
    tr_mls = 1
    
    if (num_of_states > 1):
        #Identify the most likely last hidden state
        
        try:
            hidden_probs = model.predict_proba(X)
        except:
            model = GaussianHMM(n_components=num_of_states, covariance_type="diag", n_iter=1000).fit(X)
            hidden_probs = model.predict_proba(X)
            
        lstate_prob = hidden_probs[-1] 
        mls = lstate_prob.argmax()

        # self transition probability for the most likely last hidden state
        tr_mls = model.transmat_[mls][mls]

        # we make use of the geometric series formula to calculate the number
        # of days expected to stay at the current state
        expected_days = (1.0 / (1 - tr_mls))
    
    # we save the model for future use
    fname = str(company)+"_"+str(num_of_states)+"_states_model_final.pkl"
    joblib.dump(model, os.path.join('./sims_final', fname)) 
    
    #return expected days
    return expected_days, tr_mls
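The expected_days formula is the mean of a geometric distribution: with self-transition probability p, the expected stay in the current state is 1/(1-p). A quick check with an invented probability:

tr_mls = 0.9                          # hypothetical self-transition probability
expected_days = 1.0 / (1 - tr_mls)
print(expected_days)                  # 10.0 days expected in the current state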
Example #27
    def averageLL(self, n):
        # Set best model to nonexistant
        best_mod = None
        bestLogL = float("-inf")

        # Split sequences (default to 3 unless self.sequences length is too short)

        if len(self.sequences) < 2:
            try:
                mod_n = GaussianHMM(n_components=n,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                logL = mod_n.score(self.X, self.lengths)
            except:
                logL = float("-inf")
                mod_n = None
            return logL, mod_n

        n_splits = min(3, len(self.sequences))
        split_method = KFold(n_splits=n_splits)

        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):

            # for a given split of test/train data, train model and test log loss
            Xtrain, lengths_train = combine_sequences(cv_train_idx,
                                                      self.sequences)
            Xtest, lengths_test = combine_sequences(cv_test_idx,
                                                    self.sequences)

            try:
                mod_n = GaussianHMM(n_components=n,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(Xtrain, lengths_train)
                logL = mod_n.score(Xtest, lengths_test)

                # If this model is better than previous best, set this as the new best
                if logL > bestLogL:
                    bestLogL = logL
                    best_mod = mod_n

            # in case model create or score throws exception, ignore this model and test next one
            except:
                logL = float("-inf")
                mod_n = None

        return bestLogL, best_mod
Example #28
def fit_and_apply_hmm(normal, infected, chosen, data):
    # define sliding window size and number of components
    win, components = 4, 5
    # uncomment the next line to find the optimal window size and number of components
    # it takes some time though...
    # win, components = find_optimal_params(chosen)

    win_data = get_windows(chosen, win)

    # learn a Gaussian Hidden Markov Model with `components` hidden states from the infected host data
    hmm = GaussianHMM(n_components=components)
    hmm.fit(win_data)
    # store the log-likelihood of the host that trained the model
    modeled_log_likelihood = hmm.decode(win_data)[0]

    hosts_log_likelihood = {}

    # compute log-likelihood of data sequence of normal IPs
    for ip in normal:
        # get the flows of that host only
        host_data = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
        size = len(host_data) - win
        # if host has enough flows for creating a window
        if size > 0:
            # create sliding windows sequences
            normal_data = get_windows(host_data, win)
            # get the log-likelihood of the sequential data
            hosts_log_likelihood[ip] = hmm.decode(normal_data)[0]
        else:
            hosts_log_likelihood[ip] = 0

    # repeat procedure for all infected IPs
    for ip in infected:
        # get the flows of that host only
        host_data = data[(data['src_ip'] == ip) | (data['dst_ip'] == ip)]
        size = len(host_data) - win
        # if host has enough flows for creating a window
        if size > 0:
            # create sliding windows sequences
            infected_data = get_windows(host_data, win)
            # get the log-likelihood of the sequential data
            hosts_log_likelihood[ip] = hmm.decode(infected_data)[0]
        else:
            hosts_log_likelihood[ip] = 0
    return hosts_log_likelihood, modeled_log_likelihood
Example #29
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score, best_n_components = None, None

        for n_components in range(self.min_n_components,
                                  self.max_n_components + 1):
            scores, n_splits = [], 3
            if (len(self.sequences) < 3):
                try:
                    model = GaussianHMM(n_components=n_components,
                                        n_iter=1000).fit(self.X, self.lengths)
                    logL = model.score(self.X, self.lengths)
                    if (best_score is None or logL > best_score):
                        best_score, best_n_components = logL, n_components
                except Exception as e:
                    # Skip cross-validation for current n_components
                    continue
            else:
                split_method = KFold(random_state=self.random_state,
                                     n_splits=n_splits)

                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    X_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    try:
                        model = GaussianHMM(n_components=n_components,
                                            n_iter=1000).fit(
                                                X_train, lengths_train)
                        logL = model.score(X_test, lengths_test)
                        scores.append(logL)
                    except Exception as e:
                        break

                training_successful = len(scores) == n_splits

                if (not training_successful): continue

                avg = np.average(scores)
                if (best_score is None or avg > best_score):
                    best_score, best_n_components = avg, n_components

        if best_score is None:
            best_n_components = 3

        return self.base_model(best_n_components)
    def get_model(self):

        self.pipe_pca = make_pipeline(
            StandardScaler(), PrincipalComponentAnalysis(n_components=3),
            GaussianHMM(n_components=3, covariance_type='full',
                        random_state=7))

        self.pipe_pca.fit(self.train[['return'] + self.features])
        model = self.pipe_pca.steps[2][1]

        results = []
        for i in range(3):
            result = [i, model.means_[i][0], np.diag(model.covars_[i])[0]]
            results.append(result)

        results = pd.DataFrame(results)
        results.columns = ['state', 'train_mean', 'train_var']
        self.results = results.set_index('state')

        self.get_renamed_states()
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        if len(self.sequences) == 1:
            isSplit = False
        else:
            isSplit = True
            nFolds = min(3, len(self.sequences))
            split_method = KFold(n_splits=nFolds)
        for nStates in range(self.min_n_components,
                             self.max_n_components + 1):
            try:
                cv_score = 0
                if isSplit:
                    for train_idx, test_idx in split_method.split(
                            self.sequences):
                        train_X, train_lengths = combine_sequences(
                            train_idx, self.sequences)
                        test_X, test_lengths = combine_sequences(
                            test_idx, self.sequences)
                        cv_model = GaussianHMM(n_components=nStates,
                                               covariance_type="diag",
                                               n_iter=1000,
                                               random_state=self.random_state,
                                               verbose=False).fit(
                                                   train_X, train_lengths)
                        cv_score += cv_model.score(test_X, test_lengths)
                    avg_score = cv_score / nFolds
                else:
                    cv_model = GaussianHMM(n_components=nStates,
                                           covariance_type="diag",
                                           n_iter=1000,
                                           random_state=self.random_state,
                                           verbose=False).fit(
                                               self.X, self.lengths)
                    avg_score = cv_model.score(self.X, self.lengths)

                if avg_score > best_score:
                    best_score = avg_score
                    best_nStates = nStates
                    logging.debug(
                        "CV better score for {} with {} states".format(
                            self.this_word, nStates))
            except ValueError:
                logging.debug("CV ValueError on {} with {} states".format(
                    self.this_word, nStates))
                return self.base_model(nStates)
        return self.base_model(best_nStates)
Example #32
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        max_dic_score = float("-inf")
        best_model = None

        for n in range(self.min_n_components, self.max_n_components + 1):
            anti_probabilities = []
            try:
                model = GaussianHMM(n_components=n,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                log_l = model.score(self.X, self.lengths)
            except:
                continue
            for word in self.words:
                if word != self.this_word:
                    try:
                        anti_model = GaussianHMM(
                            n_components=n,
                            covariance_type="diag",
                            n_iter=1000,
                            random_state=self.random_state,
                            verbose=False)
                        x, lengths = self.hwords[word]
                        anti_model.fit(x, lengths)
                        anti_probabilities.append(anti_model.score(x, lengths))
                    except:
                        continue

            dic_score = log_l - np.mean(anti_probabilities)
            if dic_score > max_dic_score:
                max_dic_score = dic_score
                best_model = self.base_model(n)

        return best_model
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_model = None
        best_score = float("-inf")
        for n in range(self.min_n_components, self.max_n_components + 1):

            if len(self.sequences) > 2:

                split_method = KFold(n_splits=min(3, len(self.sequences)))
                # collect fold scores across the whole split, not per fold
                logL = []

                for train_index, test_index in split_method.split(
                        self.sequences):

                    X_train, train_length = combine_sequences(
                        train_index, self.sequences)
                    X_test, test_length = combine_sequences(
                        test_index, self.sequences)

                    try:
                        hmm_model = GaussianHMM(n_components=n,
                                                covariance_type="diag",
                                                n_iter=1000,
                                                random_state=self.random_state,
                                                verbose=False).fit(
                                                    X_train, train_length)

                        logL.append(hmm_model.score(X_test, test_length))

                    except:
                        pass

                if logL and np.mean(logL) > best_score:
                    best_score = np.mean(logL)
                    best_model = hmm_model

        if not best_model:
            best_model = self.base_model(self.n_constant)

        return best_model
def UpdateHMM(beginDate, endDate):
    #beginDate = '20110401'
    #endDate = '20140401'
    data = DataAPI.MktIdxdGet(ticker='000001',
                              beginDate=beginDate,
                              endDate=endDate,
                              field=[
                                  'tradeDate', 'closeIndex', 'lowestIndex',
                                  'highestIndex', 'turnoverVol'
                              ],
                              pandas="1")  #1指数日行情数据
    data1 = DataAPI.FstTotalGet(exchangeCD=u"XSHE",
                                beginDate=beginDate,
                                endDate=endDate,
                                field=['tradeVal'],
                                pandas="1")  #1获取深圳交易所的融资融券数据
    data2 = DataAPI.FstTotalGet(exchangeCD=u"XSHG",
                                beginDate=beginDate,
                                endDate=endDate,
                                field=['tradeVal'],
                                pandas="1")
    tradeVal = data1 + data2  # combined margin-trading totals
    tradeDate = pd.to_datetime(data['tradeDate'][5:])  # list of trade dates
    volume = data['turnoverVol'][5:]  # trading volume
    closeIndex = data['closeIndex']  # closing prices
    deltaIndex = np.log(np.array(data['highestIndex'])) - np.log(
        np.array(data['lowestIndex']))  # intraday log high-low spread
    deltaIndex = deltaIndex[5:]
    logReturn1 = np.array(np.diff(np.log(closeIndex)))  # 1-day log returns
    logReturn1 = logReturn1[4:]
    logReturn5 = np.log(np.array(closeIndex[5:])) - np.log(
        np.array(closeIndex[:-5]))  # 5-day log return difference
    logReturnFst = np.array(np.diff(np.log(tradeVal['tradeVal'])))[4:]
    closeIndex = closeIndex[5:]
    X = np.column_stack(
        [logReturn1, logReturn5, deltaIndex, volume,
         logReturnFst])  # stack the feature arrays into one 2-D array
    # Make an HMM instance and execute fit
    model = GaussianHMM(n_components=3, covariance_type="diag",
                        n_iter=800).fit(X)
    return model
Example #35
    def __init__(self, feature_metadata, minfraction, scale, kernelfn, tau=None, sigma=None, ncomponents=1, n_iter=1):
        super(CasmlApproximator, self).__init__()

        self._minfraction = minfraction
        self._scale = scale
        self._kernelfn = kernelfn
        self._new_sequence = True

        #: Contains all the existing CasmlAppoximations created by
        #: this CasmlApproximator. The keys serve as both queries and
        #: bases (queries are a superset of bases), so a datum may be
        #: None if the associated key is just a basis, not a query.
        self._queries = weakref.WeakValueDictionary()
        """:type: dict[tuple[MDPState, MDPAction], Approximation]"""
        #: The subset of keys of queries that are also bases.
        #: The order in which the bases have been received is preserved
        self._bases = set()
        """:type: set[tuple[MDPState, Hashable]"""
        self._fit_X = []
        """:type: list[ndarray]"""

        #: The case base maintaining the observations in the form
        #:     c = <s, a, ds>, where ds = s_{i+1} - s_i
        #: to identify possible successor states.
        self._basiscb = CaseBase(feature_metadata,
                                 retention_method=self._RetentionMethod,
                                 retention_method_params=(tau, sigma), name='basiscb')
        """:type: CaseBase"""
        del feature_metadata['delta_state']
        #: Invariant: contains all the keys in queries
        self._querycb = CaseBase(feature_metadata, name='querycb')
        """:type: CaseBase"""
        #: The hidden Markov model maintaining the observations in the form
        #:     seq = <s_{i}, s_{i+1}>
        #: to reason on the transition probabilities of successor states.
        self._hmm = GaussianHMM(ncomponents, n_iter=n_iter)  # , covariance_type='full'
        # self._hmm = GaussianHMM(ncomponents)
        """:type: GaussianHMM"""

        self._not_add_bases = 0
        self._not_add_count = 0
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        component_model_scores = pd.DataFrame({'components_num': [], 'mean_log_likelihood': []})
        i = 0
        for num_states in range(self.min_n_components,self.max_n_components+1):
            try:
                hmm_model = self.base_model(num_states)
                logP = hmm_model.score(self.X, self.lengths)
            except:
                continue
            # DIC = 0
            if hmm_model is not None:
                sumlogP = 0
                M = 0
                for word in self.hwords:
                    if word == self.this_word:
                        continue
                    # Increase M even if the model can't score a word, so that
                    # unscorable words still penalize the model
                    M += 1
                    try:
                        other_X, other_lengths = self.hwords[word]
                        sumlogP += hmm_model.score(other_X, other_lengths)
                    except:
                        continue
            DIC = logP - sumlogP / max(M, 1)
            i += 1
            component_model_scores.loc[i] = [num_states, DIC]
        # Best model parameters
        try:
            best_num_states = component_model_scores.loc[
                component_model_scores['mean_log_likelihood'].idxmax(), 'components_num']
            best_num_states = int(best_num_states)
            best_hmm_model = GaussianHMM(n_components=best_num_states, covariance_type="diag", n_iter=1000,
                                         random_state=self.random_state,verbose=False).fit(self.X, self.lengths)
            return best_hmm_model
        except:
            return None
Example #37
    def select(self):

        # Use these variables to store best model
        bestDIC = None
        bestModel = None

        # Iterate over all possible models
        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):

            try:
                # Create new Gaussian HMM
                hmm_model = GaussianHMM(n_components=num_states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=self.verbose)

                if self.verbose:
                    print("model created for {} with {} states".format(
                        self.this_word, num_states))

                # Fit model with current data
                hmm_model.fit(self.X, self.lengths)

                # Calculate logL
                logL = hmm_model.score(self.X, self.lengths)

                otherScores = 0

                # Calculate likelihood SUM for all other words
                for otherWord in self.hwords:
                    if otherWord != self.this_word:
                        otherScores += hmm_model.score(*self.hwords[otherWord])

                # Calculate DIC using the formula DIC = log(P(X(i))) - 1/(M-1) * SUM(log(P(X(all but i))))
                dic = logL - (1.0 / (len(self.hwords) - 1)) * otherScores

                # Find model with highest DIC
                if bestDIC is None or dic > bestDIC:
                    bestModel = hmm_model
                    bestDIC = dic
            except:
                if self.verbose:
                    print("failure on {} with {} states".format(
                        self.this_word, num_states))

        return bestModel
Example #38
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        best_model = GaussianHMM()
        best_score = float("inf")

        for n_components in range(self.min_n_components,
                                  self.max_n_components + 1):
            try:
                BIC, model = self.score(n_components)
                if (BIC < best_score):
                    best_score = BIC
                    best_model = model
            except:
                pass

        return best_model
Example #39
def profileFeature(data, kmeans_hour, kmeans_day, labels_hour, labels_day,
                   temp8760):
    staticFeatures = [
        data.max(),
        data.min(),
        data.median(),
        data.mean(),
        data.std(),
        np.mean(np.fft.fft(data)),
        np.std(np.fft.fft(data)), kmeans_hour, kmeans_day
    ]
    n_hidden_states = 5
    hmm_hour = GaussianHMM(n_components=n_hidden_states)
    hmm_hour.fit(labels_hour.reshape(-1, 1))
    transmat_hour = hmm_hour.transmat_  # 转移特性矩阵
    entropy_hour = getEntropy(labels_hour)  # 行为信息熵
    hmm_day = GaussianHMM(n_components=n_hidden_states)
    hmm_day.fit(labels_day.reshape(-1, 1))
    transmat_day = hmm_day.transmat_  # 转移特性矩阵
    entropy_day = getEntropy(labels_day)  # 行为信息熵
    dynamicFeatures = [transmat_hour, entropy_hour, transmat_day, entropy_day]
    plotTempFeature(data, temp8760)
    return staticFeatures, dynamicFeatures
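getEntropy is defined elsewhere; a plausible sketch, assuming it is the Shannon entropy of the empirical label distribution (an assumption, not the original helper):

import numpy as np

def getEntropy(labels):
    # Shannon entropy (in bits) of the label frequencies.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))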
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")
        best_model = None


        if len(self.sequences) < 2:
            return best_model

        kf = KFold(n_splits=self.splits())

        for index_components in range(self.min_n_components, self.max_n_components +1 ):
            summing_score = 0
            count = 0

            for cv_train, cv_test in kf.split(self.sequences):
                new_model = None
                try:
                    train_X, train_lengths = combine_sequences(cv_train, self.sequences)
                    test_X, test_lengths = combine_sequences(cv_test, self.sequences)

                    new_model = GaussianHMM(n_components = index_components, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(train_X, train_lengths)

                    summing_score += new_model.score(test_X, test_lengths)
                    count += 1
                except:
                    pass

            if count > 0:
                new_score = summing_score / count

                if new_score > best_score:
                    best_score = new_score
                    best_model = new_model

        return best_model
Example #41
        def get_model_cv(self, num_of_state, split_method=None):

            l_list = []
            seqs = self.sequences

            # When sample size is too small to have fold return cv as logL
            if not split_method:
                try:
                    fullX, fulllengths = sequence_2_Xlengths(seqs)
                    m = GaussianHMM(n_components=num_of_state,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(fullX, fulllengths)
                    l = m.score(fullX, fulllengths)
                    return l
                except:
                    return float("-inf")

            for cv_train_idx, cv_test_idx in split_method.split(seqs):
                try:

                    train_sequences = [seqs[k] for k in cv_train_idx]
                    trainX, trainlengths = sequence_2_Xlengths(train_sequences)

                    test_sequences = [seqs[k] for k in cv_test_idx]
                    testX, testlengths = sequence_2_Xlengths(test_sequences)

                    m = GaussianHMM(n_components=num_of_state,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(trainX, trainlengths)

                    l = m.score(testX, testlengths)
                    l_list.append(l)

                except:
                    pass

            # Check the case if all model cannot score
            if len(l_list) == 0:
                cv = float("-inf")
            else:
                cv = np.mean(l_list)

            return cv
Example #42
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Variables to hold update scores
        best_bic = math.inf
        best_model = GaussianHMM()

        # Iterate across a range of model states
        for num_hidden_states in range(self.min_n_components,
                                       self.max_n_components + 1):

            try:
                # Fit a model based on state
                model = GaussianHMM(n_components=num_hidden_states, n_iter=100)
                model.fit(self.X, self.lengths)

                # Values needed to for BIC
                # From the slides: http://www2.imm.dtu.dk/courses/02433/doc/ch6_slides.pdf
                # BIC = −2 log L + p log N,

                # Get WITHIN sample logL
                logL = model.score(self.X, self.lengths)

                # Compute the number of parameters
                num_parameters = num_hidden_states * num_hidden_states + 2 * num_hidden_states * len(
                    self.X[0]) - 1

                # Compute overall BIC formula
                current_bic = (-2) * logL + num_parameters * math.log(
                    len(self.X))

                # Keep the model with the lowest BIC seen so far
                if current_bic <= best_bic:
                    best_model, best_bic = model, current_bic

            except:
                continue

        return best_model
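# For reference, the parameter count used in select() above matches the
# standard free-parameter count for a diagonal-covariance GaussianHMM; the
# helper below is a hypothetical breakdown, not part of the original code:
def num_free_parameters(n_states, n_features):
    startprob = n_states - 1                  # initial state probabilities
    transmat = n_states * (n_states - 1)      # transition probabilities
    means = n_states * n_features             # Gaussian means
    variances = n_states * n_features         # diagonal covariances
    # equals n_states ** 2 + 2 * n_states * n_features - 1, as used above
    return startprob + transmat + means + variances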
def train_all(df):
    models = {}
    words = df["gesture"].unique()
    for word in words:
        dataword = df[df["gesture"] == word]
        speakers = dataword["speaker"].unique()
        lengths = []
        for speaker in speakers:
            lengths.append(len(dataword[dataword["speaker"] == speaker]))
        dataword = dataword.drop(columns=[
            dataword.columns[56], dataword.columns[57], dataword.columns[58]
        ])
        dataword = (dataword - dataword.min()) / (dataword.max() -
                                                  dataword.min())
        dataword = dataword.fillna(0.0)
        # Note: BIC-based selection of the best model was sketched here, but
        # this version simply fits a fixed 11-state model per gesture.
        models[word] = GaussianHMM(n_components=11,
                                   covariance_type="spherical",
                                   n_iter=1000).fit(dataword, lengths)
    return models
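# A sketch of how the returned per-gesture models could be used for
# recognition (`frames` and `lengths` are hypothetical inputs, preprocessed
# the same way as in train_all): score the observation under every model and
# keep the best-scoring word.
def classify(models, frames, lengths):
    best_word, best_logL = None, float("-inf")
    for word, model in models.items():
        try:
            logL = model.score(frames, lengths)
        except Exception:
            continue
        if logL > best_logL:
            best_word, best_logL = word, logL
    return best_word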
Beispiel #44
 def cv_model(self, num_states, training_X, training_lengths):
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     #training_X, training_lengths = combine_sequences(training_fold_idx,
     #                                                 self.X)
     try:
         hmm_model = GaussianHMM(n_components=num_states,
                                 covariance_type="diag",
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(training_X,
                                                    training_lengths)
         if self.verbose:
             print("training model created for {} with {} states based on\
                   dataset {}".format(self.this_word, num_states,
                                      training_X))
         return hmm_model
     except:
         if self.verbose:
             print("model creation failed for {} with {} states based on\
                   dataset {}".format(self.this_word, num_states,
                                      training_X))
         return None
    def fit(self, tr_seqs):
        self.tr_seqs = tr_seqs
        self.n_classes = len(tr_seqs)

        self.models = []
        for class_seqs in tr_seqs:
            lengths = [seq.shape[0] for seq in class_seqs]
            X = np.vstack(class_seqs)

            print(X.shape, len(lengths))

            start_prob = np.ones(self.n_components)
            start_prob /= np.sum(start_prob)
            transmat = np.ones((self.n_components, self.n_components))
            for i in range(self.n_components):
                transmat[i, :] /= transmat[i, :].sum()

            trained = False
            with warnings.catch_warnings():
                warnings.filterwarnings('error')
                while not trained:
                    try:
                        model = GaussianHMM(n_components=self.n_components,
                                            covariance_type='diag', n_iter=50,
                                            startprob_prior=start_prob,
                                            transmat_prior=transmat)\
                                .fit(X, lengths)
                        trained = True
                    except RuntimeWarning as w:
                        print(w)
                        print(start_prob)
                        print(transmat)
                        print(lengths)
                        print(X)
                        start_prob = np.random.random(self.n_components)
                        transmat = np.random.random(
                            (self.n_components, self.n_components))

            self.models.append(model)
Beispiel #46
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        max_score = None
        max_model = None

        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                all_score = 0.0
                qty = 0
                final_model = None
                if (len(self.sequences) >= 2):
                    # Generate K folds
                    folds = min(len(self.sequences),3)
                    split_method = KFold(shuffle=True, n_splits=folds)
                    parts = split_method.split(self.sequences)
                    for cv_train_idx, cv_test_idx in parts:
                        # KFold indices for training
                        X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                        # KFold indices for testing
                        X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                        # Fit model with train data
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(X_train, lengths_train)
                        # Get score using test data
                        all_score = all_score+model.score(X_test,lengths_test)
                        qty = qty+1
                    # Calculate score
                    score = all_score / qty
                else:
                    # too few sequences to fold; fit and score on all the data
                    final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    score = final_model.score(self.X, self.lengths)
                # Keep model with best score
                if max_score is None or max_score < score:
                    max_score = score
                    if final_model is None:
                        final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                                  random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    max_model = final_model

            except:
                pass

        return max_model
Beispiel #47
def create_combined_hmm(model):
    list_pi = [model[appliance].startprob_ for appliance in model]
    list_A = [model[appliance].transmat_ for appliance in model]
    list_means = [model[appliance].means_.flatten().tolist()
                  for appliance in model]

    pi_combined = compute_pi_fhmm(list_pi)
    A_combined = compute_A_fhmm(list_A)
    [mean_combined, cov_combined] = compute_means_fhmm(list_means)

    combined_model = GaussianHMM(n_components=len(pi_combined), covariance_type='full')
    combined_model.startprob_ = pi_combined
    combined_model.transmat_ = A_combined
    combined_model.covars_ = cov_combined
    combined_model.means_ = mean_combined
    
    return combined_model
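# The compute_* helpers are not shown in this example. For a factorial HMM
# the usual construction is a Kronecker product over the per-appliance
# chains, with each combined state emitting the sum of the component means;
# the sketch below follows that convention (the unit covariance is a
# placeholder assumption, not taken from the original code):
import itertools
import numpy as np

def compute_pi_fhmm(list_pi):
    result = list_pi[0]
    for pi in list_pi[1:]:
        result = np.kron(result, pi)
    return result

def compute_A_fhmm(list_A):
    result = list_A[0]
    for A in list_A[1:]:
        result = np.kron(result, A)
    return result

def compute_means_fhmm(list_means):
    # every combined state emits the sum of its components' means
    combinations = list(itertools.product(*list_means))
    mean_combined = np.array([sum(c) for c in combinations]).reshape(-1, 1)
    cov_combined = np.tile(np.identity(1), (len(combinations), 1, 1))
    return [mean_combined, cov_combined]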
 def select(self):
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     # Model selection based on DIC scores
     DIC = []  #track the DIC
     hidden_states = []  #track the number of hidden_states
     rest_words = list(self.words)  #list
     rest_words.remove(self.this_word)
     for num_hidden_states in range(
             self.min_n_components, self.max_n_components +
             1):  #for each possible number of hidden states
         try:  #if the hmmlearn library can train or score the model
             rest_logL = 0
             hmm_model = self.base_model(num_states=num_hidden_states)
             logL = hmm_model.score(self.X, self.lengths)
             rest_num_scorable_words = 0
             for word in rest_words:
                 X, lengths = self.hwords[word]
                 try:  #if the hmmlearn library can score the model
                     rest_logL = rest_logL + hmm_model.score(X, lengths)
                     rest_num_scorable_words = rest_num_scorable_words + 1
                 except:  #if the hmmlearn library cannot score the model
                     print('{0} is not scorable!'.format(word))
             DIC.append(logL - rest_logL / rest_num_scorable_words)
             hidden_states.append(num_hidden_states)
         except:  #if the hmmlearn library cannot train or score the model
             pass
     #now see which number of hidden states gave the largest DIC
     try:
         optimal_num_hidden_states = hidden_states[DIC.index(max(DIC))]
         optimal_hmm_model = GaussianHMM(
             n_components=optimal_num_hidden_states,
             covariance_type="diag",
             n_iter=1000,
             random_state=self.random_state,
             verbose=False).fit(self.X, self.lengths)
         return optimal_hmm_model
     except ValueError:  # no model could be trained for any number of hidden states
         pass
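# For reference, the criterion implemented above is the Discriminative
# Information Criterion (commonly attributed to Biem, 2003):
#     DIC = log P(X_this_word | model)
#           - (1 / M) * sum over the M scorable other words of
#             log P(X_other | model)
# i.e. the likelihood of the target word minus the average anti-likelihood
# of the remaining words under the same model; the largest DIC wins.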
Beispiel #49
    def fit(self, data):
        """
        Estimates model parameters by initializing a Gaussian HMM for each class label and fitting data for that model

        :param data: matrix with the dimensions [number of datapoints][2][1 or 2]
        In the first matrix dimension, each datapoint will be stored. In the second dimension, at index 0, the veracity
        label of a given rumour will be stored. At index 1, the features will be stored. The third dimension will be of
        size 1 or 2, depending on whether only SDQC labels are used for the prediction, or timestamps are also included
        as features.
        :return: the HMM model, with sub-models fitted for each data label
        """
        classes = dict()

        feature_count = len(data[1][1][0])

        # partition data in labels
        for datapoint in data:
            if datapoint[0] not in classes:
                classes[datapoint[0]] = []
            classes[datapoint[0]].append(datapoint[1])

        # Make and fit model for each label
        for veracity_label, sdqc_labels in classes.items():
            lengths = [len(x) for x in sdqc_labels]
            thread_flat = np.array(flatten(sdqc_labels)).reshape(
                -1, feature_count)
            if veracity_label not in self.models:
                if self.model_type == 'gaussian':
                    self.models[veracity_label] = GaussianHMM(
                        n_components=self.components).fit(thread_flat,
                                                          lengths=lengths)
                elif self.model_type == 'multinomial':
                    # If timestamps are used, the MultinomialHMM ignores these, as it does not support float values
                    thread_flat = [[int(x[0])] for x in thread_flat]
                    self.models[veracity_label] = MultinomialHMM(
                        n_components=self.components).fit(thread_flat,
                                                          lengths=lengths)
        return self
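# `flatten` is used above but not defined in this snippet; a minimal helper
# consistent with its use (removing one level of nesting) would be:
def flatten(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]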
Beispiel #50
def fitHMM(logAnnualQ_cut):
    # initialize matrices to store moments, transition probabilities,
    # stationary distribution and quantiles of Gaussian HMM for each site
    nSites = np.shape(logAnnualQ_cut)[1]
    mus = np.zeros([2, nSites])
    sigmas = np.zeros([2, nSites])
    P = np.zeros([2, 2, nSites])
    pi = np.zeros([2, nSites])

    for i in range(nSites):
        # fit to last 2/3 of historical record
        hmm_model = GaussianHMM(n_components=2, n_iter=1000).fit(
            np.reshape(logAnnualQ_cut[35::, i],
                       [len(logAnnualQ_cut[35::, i]), 1]))

        # find means (mus) and standard deviations (sigmas) of Gaussian mixture distributions
        mus[:, i] = np.reshape(hmm_model.means_, hmm_model.means_.size)
        sigmas[:, i] = np.reshape(
            np.sqrt(
                np.array([
                    np.diag(hmm_model.covars_[0]),
                    np.diag(hmm_model.covars_[1])
                ])), hmm_model.means_.size)

        # find transition probabilities, P
        P[:, :, i] = hmm_model.transmat_

        # relabel so that state 0 is the lower-mean (dry) state
        if mus[0, i] > mus[1, i]:
            mus[:, i] = np.flipud(mus[:, i])
            sigmas[:, i] = np.flipud(sigmas[:, i])
            P[:, :, i] = np.fliplr(np.flipud(P[:, :, i]))

        # find stationary distribution, pi
        eigenvals, eigenvecs = np.linalg.eig(np.transpose(P[:, :, i]))
        one_eigval = np.argmin(np.abs(eigenvals - 1))
        pi[:, i] = eigenvecs[:, one_eigval] / np.sum(eigenvecs[:, one_eigval])

    return mus, sigmas, P, pi
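# Sanity check (a usage sketch, not part of the original code): the
# stationary distribution returned by fitHMM should satisfy pi = pi @ P for
# every site, since pi is the eigenvector of P.T with eigenvalue 1,
# normalised to sum to one.
import numpy as np

def check_stationary(P, pi, atol=1e-8):
    n_sites = P.shape[2]
    return all(np.allclose(pi[:, i] @ P[:, :, i], pi[:, i], atol=atol)
               for i in range(n_sites))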
Beispiel #51
 def select(self):
     
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     best_score = float('-inf')
     average_score = float('-inf')
     best_model = None
     for n_components in range(self.min_n_components, self.max_n_components + 1):
         #going into K-folds
         #1. define kfold
         #2. define model
         #3. fit model with train, score with testing, record scores
         inside_scores = []
         if len(self.lengths) <= 2:
             #GaussianHMM takes in a numpy array and a list
             #print('short')
             try:
                 #print(n_components, self.this_word, 'with length: ', len(self.lengths))
                 model = GaussianHMM(n_components = n_components, covariance_type = 'diag', n_iter = 1000,
                                     verbose = self.verbose, random_state = self.random_state).fit(self.X, self.lengths)
                 inside_scores.append(model.score(self.X, self.lengths))
             except:
                 print('some error with ', self.this_word)
             
         else:
             #print('long, kfold')
             kf = KFold(n_splits=min(3, len(self.sequences)))  # avoid n_splits > number of sequences
             for train, test in kf.split(self.sequences):
                 x_train, length_train = combine_sequences(train, self.sequences)
                 x_test, length_test = combine_sequences(test, self.sequences)
                 #GaussianHMM takes in a numpy array and a list
                 try:
                     #print(n_components, self.this_word, 'with length: ', len(self.lengths))
                     model = GaussianHMM(n_components = n_components, covariance_type = 'diag', n_iter = 1000,
                                         verbose = self.verbose, random_state = self.random_state).fit(x_train, length_train)
                     inside_scores.append(model.score(x_test, length_test))
                 except:
                     #print('some error with ', self.this_word)
                     pass
                 
         if inside_scores:
             average_score = np.mean(inside_scores)
             if average_score > best_score:
                 best_model = model
                 best_score = average_score
     #print('best score: ',best_score)
     return best_model
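# `combine_sequences` is not defined in these snippets; a minimal
# reconstruction consistent with how it is called here and in the KFold
# selector above (gather the selected sequences, then flatten them into the
# (X, lengths) pair that GaussianHMM expects):
import numpy as np

def combine_sequences(split_indices, sequences):
    selected = [sequences[i] for i in split_indices]
    lengths = [len(seq) for seq in selected]
    X = np.concatenate([np.asarray(seq) for seq in selected])
    return X, lengths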
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        # default score and a fallback model in case no candidate succeeds
        dic_score = float("-inf")
        saved_model = self.base_model(self.n_constant)
        # iterating through min to max number of components to find the best one
        for i in range(self.min_n_components, self.max_n_components + 1):
            try:
                # getting the model
                hmm_model = GaussianHMM(n_components=i,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            self.X, self.lengths)
                # getting the log likelihood of the current example
                logL_i = hmm_model.score(self.X, self.lengths)
            except:
                # if fitting or scoring fails, move on to the next component count
                continue
            logL_rest = 0
            # score every other word under this word's model for the DIC penalty term
            for key in self.words:
                if key == self.this_word:
                    continue
                X_temp, lengths_temp = self.hwords[key]
                try:
                    # accumulate the log likelihood of the other words under
                    # the model trained for this word (the anti-likelihood)
                    logL_rest += hmm_model.score(X_temp, lengths_temp)
                except:
                    continue
            coeff = 1 / (len(self.words) - 1)
            dic_current = logL_i - coeff * logL_rest
            # comparing for the best score
            if dic_current > dic_score:
                saved_model = hmm_model
                dic_score = dic_current

        return saved_model
Beispiel #53
class HMMGoalModel(object):
    def __init__(self, per_data, per_lens=None, n_states=None):
        if per_lens is None:
            per_lens = list(map(len, per_data))

        if len(per_data.shape) > 2:
            per_data = per_data.reshape(-1, per_data.shape[-1])

        if n_states is None:
            components = [2, 4, 6, 8, 10]

            hmms = [GaussianHMM(n_components=c) for c in components]

            for g in hmms:
                g.fit(per_data, per_lens)
            scores = [aic(g, per_data, per_lens) for g in hmms]

            # AIC: lower is better, so keep the minimum-scoring model
            best_score, self.hmm = min(zip(scores, hmms), key=lambda p: p[0])
        else:
            self.hmm = GaussianHMM(n_components=n_states)
            self.hmm.fit(per_data, per_lens)

        ll = self.hmm.score(per_data, per_lens)
        print "Goal HMM n_components", self.hmm.n_components, "Log likelihood", ll

        upper_idxs = [per_lens[0] - 1]
        start_idxs = [0]
        for i in range(1, len(per_lens)):
            upper_idxs.append(upper_idxs[i - 1] + per_lens[i])
            start_idxs.append(start_idxs[i - 1] + per_lens[i - 1])

        self.final_states = np.array(self.hmm.predict(per_data,
                                                      per_lens))[upper_idxs]
        print(self.final_states)
        self.T = int(np.mean(per_lens))
        self.n_components = self.hmm.n_components

    def is_success(self, per_trj):
        per_trj = np.array(per_trj)
        states = self.hmm.predict(per_trj)
        final_state = states[-1]
        return final_state in self.final_states

    def sample(self, t=None):
        t = self.T if t is None else t
        return self.hmm.sample(t)
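# `aic` is not defined in this snippet; a plausible sketch for a
# diagonal-covariance hmmlearn GaussianHMM (AIC = 2k - 2 logL; lower is
# better, which is why the minimum-scoring model is kept in __init__):
def aic(model, X, lengths):
    logL = model.score(X, lengths)
    n, d = model.n_components, model.means_.shape[1]
    # free parameters: startprob + transmat + means + diagonal covariances
    k = (n - 1) + n * (n - 1) + 2 * n * d
    return 2 * k - 2 * logL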
Beispiel #54
 def base_model(self, num_states):
     # with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     # warnings.filterwarnings("ignore", category=RuntimeWarning)
     #covariance_type = "full"
     covariance_type = "diag"
     #if(len(self.sequences) < 3): # GaussianHMM value error issues if this is less than 3
     #    covariance_type = 'diag'
     try:
         hmm_model = GaussianHMM(n_components=num_states,
                                 covariance_type=covariance_type,
                                 n_iter=1000,
                                 random_state=self.random_state,
                                 verbose=False).fit(self.X, self.lengths)
         if self.verbose:
             print("model created for {} with {} states".format(
                 self.this_word, num_states))
         return hmm_model
     except:
         if self.verbose:
             print("failure on {} with {} states".format(
                 self.this_word, num_states))
         return None
Beispiel #55
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM

# Load data from the input file
input_file = 'CNY.csv'
data = np.loadtxt(input_file, delimiter=',')

# Extract the values we need
closing_values = np.array(data[:, 6])
volume_of_shares = np.array(data[:, 8])[:-1]  # trimmed to align with np.diff below

# Compute the daily percentage change of the closing price
diff_percentage = 100.0 * np.diff(closing_values) / closing_values[:-1]

# Stack the percentage change with the trading volume
X = np.column_stack((diff_percentage, volume_of_shares))

# Create and train the Gaussian HMM
print("Training the Gaussian HMM ...")
model = GaussianHMM(n_components=5, covariance_type='diag', n_iter=1000)
model.fit(X)

# Generate data from the model
num_samples = 500
samples, _ = model.sample(num_samples)
plt.plot(np.arange(num_samples), samples[:, 0], c='black')
plt.figure()
plt.plot(np.arange(num_samples), samples[:, 1], c='red')
plt.show()
Beispiel #56
feature_test = test_df[feature_name]
activity_test = test_df[activity_name]

# .as_matrix() was removed from pandas; .to_numpy() is the replacement
data_feature = feature_data.to_numpy()
data_label = activity_data.to_numpy()

test_feature = feature_test.to_numpy()
test_label = activity_test.to_numpy()

lengths = data_feature.shape[0]

# --- Run Gaussian HMM --- #
print "fitting to HMM and decoding ..."

# --- Make an HMM instance and execute fit --- #
model = GaussianHMM(n_components=5, covariance_type="diag", n_iter=1000).fit(data_feature)

# --- Predict the optimal sequence of internal hidden states for the training data --- #
# --- the following generates figure #1: the state sequence predicted from the DATA csv --- #
hidden_states = model.predict(data_feature)

time_axis = np.asarray(range(len(hidden_states)))

# --- fancy plots of different states in HMM --- #
fig1_data,axs = plt.subplots(model.n_components, sharex=True, sharey=True)
fig1_data.suptitle('Estimated State Sequence for Training Data')
colours = cm.rainbow(np.linspace(0, 1, model.n_components))
for i, (ax, colour) in enumerate(zip(axs, colours)):
    # --- Use fancy indexing to plot the data belonging to each state --- #
    mask = hidden_states == i
    ax.plot(time_axis[mask], data_feature[:, 1][mask], ".", c=colour)
Beispiel #57
import numpy as np
import matplotlib.pyplot as plt
from hmmlearn.hmm import GaussianHMM

from convert_to_timeseries import convert_data_to_timeseries

# Load data from the input file
input_file = 'data_hmm.txt'
data = np.loadtxt(input_file, delimiter=',')

# Arrange the training data
X = np.column_stack([data[:, 2]])

# Create and train the Gaussian HMM
print("Training the Gaussian HMM")
num_components = 4
model = GaussianHMM(n_components=num_components,
                    covariance_type='diag', n_iter=1000)
model.fit(X)

# Predict the hidden states of the HMM
hidden_states = model.predict(X)

# Compute the mean and variance of each hidden state
print("Means and variances of the hidden states")
for i in range(model.n_components):
    print("Hidden state: {}".format(i + 1))
    print("Mean: {:.3f}".format(model.means_[i][0]))
    print("Variance: {:.3f}".format(np.diag(model.covars_[i])[0]))

# Generate data from the model
num_samples = 1000
samples, _ = model.sample(num_samples)
Beispiel #58
class CasmlApproximator(FunctionApproximator):
    """

    """

    class _RetentionMethod(RetentionMethod):
        """The retention method for the transition case base implementation for :class:`Casml`.

        Determines whether the new problem-solving experience is stored in
        memory, depending on the revision outcome and the CBR policy
        regarding case retention.

        Parameters
        ----------
        owner : CaseBase
            A pointer to the owning case base.
        tau : float, optional
            The maximum permitted error when comparing most similar solution.
            Default is 0.8.
        sigma : float, optional
            The maximum permitted error when comparing actual with estimated
            transitions. Default is 0.2
        plot_retention_method : callable, optional
            Callback function plotting the retention step. Default is None.

        Notes
        -----
        The Casml retention method for the transition case base considers query cases as
        predicted correctly if both:

        1. the difference between the actual and the estimated transitions are less
           than or equal to the permitted error :math:`\\sigma`:

           .. math::

              d(\\text{case}.\\Delta_\\text{state}, T(s_{i-1}, a_{i-1})) <= \\sigma

        2. and the query case is within the maximum permitted error :math:`\\tau` of
           the most similar solution case:

           .. math::

              d(\\text{case}, 1\\text{NN}(C_T, \\text{case})) <= \\tau

        """

        def __init__(self, owner, tau=None, sigma=None, plot_retention_method=None):
            super(CasmlApproximator._RetentionMethod, self).__init__(owner, plot_retention_method,
                                                                     {'tau': tau, 'sigma': sigma})

            self._tau = tau if tau is not None else 0.8
            """:type: float"""

            self._sigma = sigma if sigma is not None else 0.2
            """:type: float"""

        def execute(self, features, matches, plot=True):
            """Execute the retention step.

            Parameters
            ----------
            features : list[tuple[str, ndarray]]
                A list of features of the form (`feature_name`, `data_points`).
            matches : dict[str, dict[int, tuple[float, ndarray]]]
                The solution identified through the similarity measure.
            plot: bool, optional
                Plot the data during the retention step.

            Returns
            -------
            int :
                The case id if the case was retained, -1 otherwise.

            """
            f = dict(features)

            do_add = True
            if matches:
                for id_, val in matches['state'].iteritems():
                    delta_error = np.linalg.norm(self._owner.get_feature('delta_state', id_).value - f['delta_state'])
                    if delta_error <= self._sigma:
                        # At least one of the cases in the case base correctly estimated the query case,
                        # the query case does not add any new information, do not add.
                        do_add = False
                        break

            basis_id = -1
            if do_add or matches['state'].values()[0][0] > self._tau:
                basis_id = self._owner.insert(features, matches)

            if plot:
                self.plot_data(features, matches)

            return basis_id

    class Approximation(FunctionApproximator.Approximation):
        """

        """

        def __init__(self, approximator, state, act, kernelfn):
            super(CasmlApproximator.Approximation, self).__init__(state)

            self._act = act

            self._approximator = approximator
            """:type: CasmlApproximator"""

            self._kernelfn = kernelfn
            self._sum = 0.0

            self._neighbors = []
            """:type: list"""
            self._deltas = []
            """:type: list"""

            self.update(state.features, act.features)

        def __del__(self):
            assert (self.state, Hashable(self._act.features)) not in self._approximator._queries
            # noinspection PyTypeChecker
            # if not next((True for elem in self._approximator._fit_X if np.all(elem == self.state.features)), False):
            if (self.state, Hashable(self._act.features)) not in self._approximator._bases:
                self._approximator._querycb.remove([('state', self.state.features), ('act', self._act.features)])

        def include(self, d, state, delta):
            assert d >= 0

            val = (d, state)
            if len(self._neighbors) <= 0 or val < self._neighbors[-1]:
                # noinspection PyTypeChecker
                # if not next((True for (dist, v) in self._neighbors if dist == d and np.all(v == state)), False):
                ind = bisect.bisect_left(self._neighbors, val)
                bisect.insort(self._neighbors, val)
                self._deltas.insert(ind, delta)
                self._compute_weights()
                self.dispatch('average_change')
            else:
                assert self._sum > 0.0
                w = self._kernelfn(d)
                if w / self._sum >= self._approximator._minfraction:
                    self._neighbors.append(val)
                    self._deltas.append(delta)
                    self._compute_weights()
                    self.dispatch('average_change')

        def update(self, state, act):
            neighbors = dict(self._approximator._basiscb.retrieve([('state', state), ('act', act)]))
            if 'state' in neighbors:
                self._deltas = [self._approximator._basiscb.get_feature('delta_state', id_).value for id_ in
                                neighbors['state'].iterkeys()]
                self._neighbors = neighbors['state'].values()

            self._compute_weights()

        def _compute_weights(self):
            self._weights.clear()
            self._sum = 0.0

            i = 0
            total = 0

            # calculate successor states from the current state and solution delta state
            for (d, succ), delta in zip(self._neighbors, self._deltas):
                w = self._kernelfn(d)
                if self._sum == 0.0 or w / self._sum >= self._approximator._minfraction:
                    sequence = [np.asarray(self._state.features), np.asarray(self._state.features + delta)]

                    proba = np.exp(self._approximator._hmm.score(sequence))
                    self._weights[MDPState.create(succ)] = (w, proba)       # proba
                    self._sum += w
                    total += proba
                    i += 1
                else:
                    break
            del self._neighbors[i:]
            del self._deltas[i:]

            for succ, (w, p) in self._weights.iteritems():
                self._weights[succ] = (w, p / total)

            # sequences = np.zeros((len(self._neighbors), 2, len(self._state)), dtype=float)
            #
            # for i, delta in enumerate(self._deltas):
            #     sequences[i, 0] = np.array(self._state.features)
            #     sequences[i, 1] = np.asarray(self._state.features + delta)
            #
            # # use HMM to calculate probability for observing sequence <current_state, next_state>
            # # noinspection PyTypeChecker
            # weights = np.exp(self._approximator._hmm.score(sequences))
            # for (_, succ), w in zip(self._neighbors, weights):
            #     self._weights[MDPState.create(succ)] = w
            #
            # sum_ = weights.sum()
            # for (_, succ), w in zip(self._neighbors, weights):
            #     if len(weights) <= 1:
            #         w *= 0.9
            #     self._weights[MDPState.create(succ)] = w / sum_

    # -----------------------------
    # CasmlApproximator
    # -----------------------------
    def __init__(self, feature_metadata, minfraction, scale, kernelfn, tau=None, sigma=None, ncomponents=1, n_iter=1):
        super(CasmlApproximator, self).__init__()

        self._minfraction = minfraction
        self._scale = scale
        self._kernelfn = kernelfn
        self._new_sequence = True

        #: Contains all the existing CasmlApproximations created by
        #: this CasmlApproximator. The keys serve as both queries and
        #: bases (queries are a superset of bases), so a datum may be
        #: None if the associated key is just a basis, not a query.
        self._queries = weakref.WeakValueDictionary()
        """:type: dict[tuple[MDPState, MDPAction], Approximation]"""
        #: The subset of keys of queries that are also bases.
        #: The order in which the bases have been received is preserved
        self._bases = set()
        """:type: set[tuple[MDPState, Hashable]"""
        self._fit_X = []
        """:type: list[ndarray]"""

        #: The case base maintaining the observations in the form
        #:     c = <s, a, ds>, where ds = s_{i+1} - s_i
        #: to identify possible successor states.
        self._basiscb = CaseBase(feature_metadata,
                                 retention_method=self._RetentionMethod,
                                 retention_method_params=(tau, sigma), name='basiscb')
        """:type: CaseBase"""
        del feature_metadata['delta_state']
        #: Invariant: contains all the keys in queries
        self._querycb = CaseBase(feature_metadata, name='querycb')
        """:type: CaseBase"""
        #: The hidden Markov model maintaining the observations in the form
        #:     seq = <s_{i}, s_{i+1}>
        #: to reason on the transition probabilities of successor states.
        self._hmm = GaussianHMM(ncomponents, n_iter=n_iter)  # , covariance_type='full'
        # self._hmm = GaussianHMM(ncomponents)
        """:type: GaussianHMM"""

        self._not_add_bases = 0
        self._not_add_count = 0

    def initialize(self):
        """Prepare for a new episode."""
        self._new_sequence = True

    def add_basis(self, state, act, succ=None):
        """Adds a state to the set of bases used to approximate query
        states.

        Parameters
        ----------
        state : MDPState
            The state to add
        act : MDPAction
            The action performed in that state
        succ : MDPState
            The successor state.

        Returns
        -------
        MDPState :
            The approximated state.

        """
        # update the hmm with the new sequence
        self._fit_hmm(state, succ)

        # retain the case in the query case base
        features = [('state', state.features), ('act', act.features)]
        self._querycb.retain(features)

        a = Hashable(act.features)
        if (state, a) in self._bases:
            self._not_add_bases += 1
            return state

        self._bases.add((state, a))

        # retain the case in the basis case base
        if succ is None:
            succ = state
        delta = succ - state
        features.append(('delta_state', delta))
        basis_id = self._basiscb.run(features)

        if basis_id <= -1:
            self._not_add_count += 1

        if basis_id >= 0:
            if self._querycb.similarity_uses_knn:
                for c in self._querycb.itervalues():
                    try:
                        approx = self._queries[(MDPState.create(c['state'].value), Hashable(c['act'].value))]
                    except KeyError:
                        pass
                    else:
                        approx.update(c['state'].value, c['act'].value)
            else:
                neighbors = dict(self._querycb.retrieve([('state', state.features), ('act', act.features)]))
                for id_, (d, s) in neighbors['state'].iteritems():
                    try:
                        approx = self._queries[(MDPState.create(s), Hashable(neighbors['act'][id_][1]))]
                    except KeyError:
                        pass
                    else:
                        approx.include(d, state.features, delta)

        return state

    def approximate(self, state, act):
        """Approximates a given state using an Approximation.

        Parameters
        ----------
        state : MDPState
            The state to approximate.
        act : MDPAction
            The action performed in that state

        Returns
        -------
        Approximation :
            The Approximation approximating state.

        """
        self._querycb.retain([('state', state.features), ('act', act.features)])

        a = Hashable(act.features)
        try:
            approx = self._queries[(state, a)]
        except KeyError:
            approx = CasmlApproximator.Approximation(self, state, act, self._kernelfn)
            self._queries[(state, a)] = approx
        return approx

    def _fit_hmm(self, state, succ):
        # try:
        #     x = self._hmm._fit_X.copy()
        # except AttributeError:
        #     x = np.zeros(1, dtype=np.object)
        # else:
        #     if self._new_sequence:
        #         x = self._hmm._fit_X.tolist()
        #         x.append(np.zeros(1))
        #         x = np.array(x)
        #
        # if self._new_sequence:
        #     self._new_sequence = False
        #     x[-1] = np.hstack([np.reshape(state.features, (-1, state._nfeatures)).T])
        #
        # x[-1] = np.hstack([x[-1].tolist(), np.reshape(succ.features, (-1, succ._nfeatures)).T])
        # self._hmm.fit(x, n_init=1)

        if self._new_sequence:
            self._new_sequence = False
            self._fit_X.append([])
            self._fit_X[-1].append(state.features)

        if succ is not None:
            self._fit_X[-1].append(succ.features)
            self._hmm.fit(np.concatenate(self._fit_X), lengths=[len(x) for x in self._fit_X])
# Take diff of close value. Note that this makes
# ``len(diff) = len(close_t) - 1``, therefore, other quantities also
# need to be shifted by 1.
diff = np.diff(close_v)
dates = dates[1:]
close_v = close_v[1:]

# Pack diff and volume for training.
X = np.column_stack([diff, volume])

###############################################################################
# Run Gaussian HMM
print("fitting to HMM and decoding ...", end="")

# Make an HMM instance and execute fit
model = GaussianHMM(n_components=4, covariance_type="diag", n_iter=1000).fit(X)

# Predict the optimal sequence of internal hidden state
hidden_states = model.predict(X)

print("done")

###############################################################################
# Print trained parameters and plot
print("Transition matrix")
print(model.transmat_)
print()

print("Means and vars of each hidden state")
for i in range(model.n_components):
    print("{0}th hidden state".format(i))