Example #1
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Model selection based on DIC scores:
        # DIC = logL(this word) - average logL(all other words)
        record = float("-inf")

        # Cap the number of states at the shortest sequence length
        min_seq = min(len(seq) for seq in self.sequences)
        self.max_n_components = min(self.max_n_components, min_seq)

        hmm_model = self.base_model(self.n_constant)
        for num in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=num, n_iter=1000).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                tmp = 0
                for word in self.hwords:
                    X, lengths = self.hwords[word]
                    tmp += model.score(X, lengths)
                # tmp includes this word's own score, so remove logL before
                # averaging over the remaining M - 1 words
                DIC = logL - (tmp - logL) / (len(self.hwords) - 1)
                if DIC > record:
                    record = DIC
                    hmm_model = model
            except:
                continue
        return hmm_model
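The DIC criterion above rewards models that fit the target word well while fitting competing words poorly. Below is a minimal, self-contained sketch of the same idea on toy data; the word names, shapes, and seed are made up for illustration, and this is not the project's ModelSelector API.

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
# toy data: one 2-D feature sequence per (hypothetical) word
word_data = {w: rng.randn(60, 2) + i for i, w in enumerate(["this", "other1", "other2"])}

model = GaussianHMM(n_components=3, n_iter=100, random_state=0).fit(word_data["this"])
logL = model.score(word_data["this"])
others = [model.score(X) for w, X in word_data.items() if w != "this"]
dic = logL - sum(others) / len(others)  # DIC = logL(this) - mean logL(others)
print("DIC:", dic)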
Example #2
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float("-inf")
        best_model = None
        for n in range(self.min_n_components, self.max_n_components+1):
            try:
                other_words_score = 0.0
                quantity = 0.0
                model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                this_word_score = model.score(self.X, self.lengths)
                for word in self.hwords:
                    if word != self.this_word:
                        quantity += 1
                        X, lengths = self.hwords[word]
                        other_words_score += model.score(X, lengths)
                # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                score = this_word_score - other_words_score / quantity
                if score > best_score:
                    best_score = score
                    best_model = model
            except:
                continue
        return best_model
Example #3
def get_best_hmm_model(X, max_states, max_iter=10000):
    best_score = float("-inf")
    best_state = 1

    for state in range(1, max_states + 1):
        hmm_model = GaussianHMM(n_components=state, random_state=100,
                                covariance_type="diag", n_iter=max_iter).fit(X)
        score = hmm_model.score(X)
        if score > best_score:
            best_score = score
            best_state = state

    best_model = GaussianHMM(n_components=best_state, random_state=100,
                             covariance_type="diag", n_iter=max_iter).fit(X)
    return best_model
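Note that selecting on raw training log likelihood, as above, tends to favor larger state counts, which is why the BIC/DIC/CV selectors elsewhere in this collection add a penalty term or a hold-out set. A quick illustration of the effect on toy data (shapes and seed are arbitrary):

import numpy as np
from hmmlearn.hmm import GaussianHMM

X = np.random.RandomState(3).randn(100, 2)
for n in (2, 4, 8):
    m = GaussianHMM(n_components=n, n_iter=100, random_state=3).fit(X)
    print(n, m.score(X))  # training logL typically rises as n grows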
Example #4
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        n_components = range(self.min_n_components, self.max_n_components + 1)
        best_n = self.min_n_components
        best_DIC = float('-inf')
        best_model = None

        for n in n_components:
            try:
                w_count = 0
                model = GaussianHMM(n_components=n, n_iter=1000).fit(self.X, self.lengths)
                original_prob = model.score(self.X, self.lengths)

                sum_prob_others = 0.0

                for word in self.words:
                    if word == self.this_word:
                        continue

                    X_other, lengths_other = self.hwords[word]
                    logL = model.score(X_other, lengths_other)
                    sum_prob_others += logL
                    w_count += 1

                avg_prob_others = sum_prob_others / w_count
                DIC = original_prob - avg_prob_others

                if DIC > best_DIC:
                    best_DIC = DIC
                    best_n = n
            except:
                pass

        try:
            best_model = GaussianHMM(n_components=best_n,
                                     n_iter=1000).fit(self.X, self.lengths)
        except ValueError:
            best_model = None
        return best_model
Example #5
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score, best_n_components = None, None

        for n_components in range(self.min_n_components,
                                  self.max_n_components + 1):
            scores, n_splits = [], 3
            if (len(self.sequences) < 3):
                try:
                    model = GaussianHMM(n_components=n_components,
                                        n_iter=1000).fit(self.X, self.lengths)
                    logL = model.score(self.X, self.lengths)
                    if (best_score is None or logL > best_score):
                        best_score, best_n_components = logL, n_components
                except Exception as e:
                    # Skip cross-validation for current n_components
                    continue
            else:
                # shuffle is required when random_state is set (recent sklearn)
                split_method = KFold(n_splits=n_splits, shuffle=True,
                                     random_state=self.random_state)

                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    X_train, lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    try:
                        model = GaussianHMM(n_components=n_components,
                                            n_iter=1000).fit(
                                                X_train, lengths_train)
                        logL = model.score(X_test, lengths_test)
                        scores.append(logL)
                    except Exception as e:
                        break

                training_successful = len(scores) == n_splits

                if (not training_successful): continue

                avg = np.average(scores)
                if (best_score is None or avg > best_score):
                    best_score, best_n_components = avg, n_components

        if best_score is None:
            best_n_components = 3

        return self.base_model(best_n_components)
Example #6
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        if len(self.sequences) == 1:
            isSplit = False
        else:
            isSplit = True
            nFolds = min(3, len(self.sequences))
            split_method = KFold(n_splits=nFolds)
        for nStates in range(self.min_n_components,
                             self.max_n_components + 1):
            try:
                cv_score = 0
                if isSplit:
                    for train_idx, test_idx in split_method.split(
                            self.sequences):
                        train_X, train_lengths = combine_sequences(
                            train_idx, self.sequences)
                        test_X, test_lengths = combine_sequences(
                            test_idx, self.sequences)
                        cv_model = GaussianHMM(n_components=nStates,
                                               covariance_type="diag",
                                               n_iter=1000,
                                               random_state=self.random_state,
                                               verbose=False).fit(
                                                   train_X, train_lengths)
                        cv_score += cv_model.score(test_X, test_lengths)
                    avg_score = cv_score / nFolds
                else:
                    cv_model = GaussianHMM(n_components=nStates,
                                           covariance_type="diag",
                                           n_iter=1000,
                                           random_state=self.random_state,
                                           verbose=False).fit(
                                               self.X, self.lengths)
                    avg_score = cv_model.score(self.X, self.lengths)

                if avg_score > best_score:
                    best_score = avg_score
                    best_nStates = nStates
                    logging.debug(
                        "CV better score for {} with {} states".format(
                            self.this_word, nStates))
            except ValueError:
                logging.debug("CV ValueError on {} with {} states".format(
                    self.this_word, nStates))
                return self.base_model(nStates)
        return self.base_model(best_nStates)
Example #7
def get_best_hmm_model(X, max_iter=10000, max_states=6):
    best_score = float("-inf")
    best_state = 1
    for state in range(1, max_states + 1):
        try:
            # fit must also be inside the try block; it can fail just like score
            hmm_model = GaussianHMM(n_components=state, random_state=100, covariance_type='diag',
                                    n_iter=max_iter).fit(X)
            score = hmm_model.score(X)
            if score > best_score:
                best_score = score
                best_state = state
        except:
            continue
    best_model = GaussianHMM(n_components=best_state, random_state=100, covariance_type='diag',
                             n_iter=max_iter).fit(X)
    return best_model
Example #8
    def select(self):

        # Use these variables to store best model
        bestDIC = None
        bestModel = None

        # Iterate over all possible models
        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):

            try:
                # Create new Gaussian HMM
                hmm_model = GaussianHMM(n_components=num_states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=self.verbose)

                if self.verbose:
                    print("model created for {} with {} states".format(
                        self.this_word, num_states))

                # Fit model with current data
                hmm_model.fit(self.X, self.lengths)

                # Calculate logL
                logL = hmm_model.score(self.X, self.lengths)

                otherScores = 0

                # Calculate likelihood SUM for all other words
                for otherWord in self.hwords:
                    if otherWord != self.this_word:
                        otherScores += hmm_model.score(*self.hwords[otherWord])

                # Calculate DIC using the formula: DIC = log(P(X(i))) - 1/(M-1) * SUM(log(P(X(j))) for j != i)
                dic = logL - (float(1) / (len(self.hwords) - 1)) * otherScores

                # Find model with highest DIC
                if bestDIC is None or dic > bestDIC:
                    bestModel = hmm_model
                    bestDIC = dic
            except:
                if self.verbose:
                    print("failure on {} with {} states".format(
                        self.this_word, num_states))

        return bestModel
Example #9
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        this_word = self.this_word
        sequences = self.sequences

        hmm_models = {}
        best_score = None
        best_i = None

        for i in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=i,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                hmm_models[i] = model
                logL = model.score(self.X, self.lengths)

                sum_logL = 0
                num_of_logs = 0

                for word in self.hwords.keys():
                    if word == this_word:
                        continue

                    try:
                        X2, lengths2 = self.hwords[word]
                        # use a separate name so this word's logL isn't overwritten
                        other_logL = model.score(X2, lengths2) / len(lengths2)
                        sum_logL += other_logL
                        num_of_logs += 1
                    except:
                        pass

                dic = logL
                if num_of_logs:
                    dic -= sum_logL / num_of_logs

                if best_score is None or best_score < dic:
                    best_score = dic
                    best_i = i
            except ValueError:
                hmm_models[i] = None

        if best_i is None:
            return None
        return hmm_models[best_i]
Example #10
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Model selection based on BIC scores:
        # BIC = -2 * logL + p * log(N), lower is better
        record = float("inf")

        # Cap the number of states at the shortest sequence length
        min_seq = min(len(seq) for seq in self.sequences)
        self.max_n_components = min(self.max_n_components, min_seq)

        hmm_model = self.base_model(self.n_constant)
        for num in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=num, n_iter=1000).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                # p is the number of free parameters, N is the number of data points
                p = num * num + 2 * num * len(self.X[0]) - 1
                BIC = -2 * logL + p * np.log(len(self.X))
                if BIC < record:
                    record = BIC
                    hmm_model = model
            except:
                continue
        return hmm_model
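For reference, the free-parameter count p = n^2 + 2*n*d - 1 used here breaks down as n(n-1) transition probabilities, n-1 initial-state probabilities, and n*d means plus n*d diagonal covariances. A minimal sketch of the same BIC computation on toy data (shapes and seed are arbitrary):

import numpy as np
from hmmlearn.hmm import GaussianHMM

X = np.random.RandomState(1).randn(80, 3)  # 80 frames, d = 3 features
n = 4
model = GaussianHMM(n_components=n, n_iter=100, random_state=1).fit(X)
logL = model.score(X)
p = n ** 2 + 2 * n * X.shape[1] - 1   # n(n-1) + (n-1) + 2*n*d
bic = -2 * logL + p * np.log(len(X))  # lower is better
print("BIC:", bic)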
Example #11
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('inf')
        best_model = None
        for n in range(self.min_n_components, self.max_n_components+1):
            try:
                model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                p = n ** 2 + 2 * n * len(self.X[0]) - 1
                N = len(self.X)
                score = -2 * logL + p * np.log(N)
                if score < best_score:
                    best_score = score
                    best_model = model
            except:
                continue
        return best_model
Example #12
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Model selection using CV

        best_score = float('-inf')
        best_n = self.max_n_components

        split_method = KFold()

        for n in range(self.min_n_components, self.max_n_components + 1):
            scores = []  # reset per candidate n; carrying scores across n skews the average
            try:
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    train_X, train_lengths = combine_sequences(
                        cv_train_idx, self.sequences)
                    test_X, test_lengths = combine_sequences(
                        cv_test_idx, self.sequences)

                    model = GaussianHMM(n_components=n).fit(
                        train_X, train_lengths)

                    score = model.score(test_X, test_lengths)
                    scores.append(score)
                avg_score = np.mean(scores)
                if avg_score > best_score:
                    best_score = avg_score
                    best_n = n

            except:
                pass

        return self.base_model(best_n)
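The CV selectors in this collection all follow the same pattern: split the list of sequences with KFold, rebuild (X, lengths) for each side, fit on the training folds, and score on the held-out fold. Below is a self-contained sketch with a minimal stand-in for the project's combine_sequences helper; its assumed behavior is to concatenate the chosen sequences and return hmmlearn-style (X, lengths).

import numpy as np
from sklearn.model_selection import KFold
from hmmlearn.hmm import GaussianHMM

def combine(indices, sequences):
    # stand-in for combine_sequences: concatenate selected sequences
    chosen = [np.asarray(sequences[i]) for i in indices]
    return np.concatenate(chosen), [len(s) for s in chosen]

rng = np.random.RandomState(2)
sequences = [rng.randn(rng.randint(10, 20), 2) for _ in range(6)]  # toy data

scores = []
for train_idx, test_idx in KFold(n_splits=3).split(sequences):
    X_tr, len_tr = combine(train_idx, sequences)
    X_te, len_te = combine(test_idx, sequences)
    model = GaussianHMM(n_components=3, n_iter=100, random_state=2).fit(X_tr, len_tr)
    scores.append(model.score(X_te, len_te))
print("mean CV logL:", np.mean(scores))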
Example #13
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components
        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        # initialize variables
        hmm_model = None
        best_hmm_model = None
        feature_cnt = self.X.shape[1]
        best_b_i_c__score = float("inf")

        for num_states in range(self.min_n_components, self.max_n_components + 1):

            try:

                # train a model based on current number of components = num_states
                hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                # calculate likelihood log  for the model
                log_l = hmm_model.score(self.X, self.lengths)
                # number of parameter
                p = num_states * (num_states + feature_cnt * 2 - 1)
                log_n = np.log(len(self.X))
                # Calculate BIC score using the model parameters
                b_i_c__score = -2 * log_l + p * log_n
            except:
                # skip this num_states without discarding the best model found so far
                continue
            # choose the best model
            if best_b_i_c__score > b_i_c__score:
                best_hmm_model = hmm_model
                best_b_i_c__score = b_i_c__score

        return best_hmm_model
Example #14
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        best_n = self.min_n_components
        best_bic = float("inf")
        # calculate the BIC for each number of n_components
        for i in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=i,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                log_likelihood = model.score(self.X, self.lengths)
                # BIC = -2 * logL + p * log(N), where N is the number of data points
                p = i * i + 2 * i * len(self.X[0]) - 1
                BIC = -2 * log_likelihood + p * math.log(len(self.X))
                # keep the lowest BIC, which indicates the best model
                if BIC < best_bic:
                    best_bic = BIC
                    best_n = i
            except:
                pass
        #return the best model..
        return GaussianHMM(n_components=best_n,
                           covariance_type="diag",
                           n_iter=1000,
                           random_state=self.random_state,
                           verbose=False).fit(self.X, self.lengths)
Example #15
    def base_model(self,
                   num_states,
                   X=None,
                   lens=None,
                   testX=None,
                   testlens=None):
        # with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        # warnings.filterwarnings("ignore", category=RuntimeWarning)
        if X is None:
            X = self.X
        if lens is None:
            lens = self.lengths
        try:
            hmm_model = GaussianHMM(n_components=num_states,
                                    covariance_type="diag",
                                    n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(X, lens)
            if self.verbose:
                print("model created for {} with {} states".format(
                    self.this_word, num_states))

            if testX is not None:
                return hmm_model, hmm_model.score(testX, testlens)
            return hmm_model
        except:
            if self.verbose:
                print("failure on {} with {} states".format(
                    self.this_word, num_states))
            return None
Example #16
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Model selection based on BIC scores
        best_model = None
        best_num_components = self.min_n_components
        best_bic = float('+inf')

        for num_states in range(self.min_n_components, self.max_n_components + 1):

            try:
                # train model with training set
                hmm_model = GaussianHMM(n_components=num_states,
                                        n_iter=2000).fit(self.X, self.lengths)
                likelihood = hmm_model.score(self.X, self.lengths)

                # ** (not ^, which is bitwise XOR in Python)
                p = num_states ** 2 + 2 * num_states * hmm_model.n_features - 1

                # BIC = -2 * logL + p * log(N), where N is the number of data points
                bic = -2 * likelihood + p * np.log(len(self.X))

                if bic < best_bic:
                    # new set of best numbers
                    best_num_components, best_bic, best_model = num_states, bic, hmm_model

            except Exception:
                # if it fails, it will try again with the next set of elements, or simply return an empty model
                pass

        return best_model
Example #17
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        if len(self.sequences) < 2:
            return self.base_model(3)
        if len(self.sequences) == 2:
            n_splits = 2
        else:
            n_splits = 3

        split_method = KFold(n_splits)

        logL = np.zeros([n_splits, self.max_n_components + 1 - self.min_n_components])

        for pair_index, pairs in enumerate(split_method.split(self.sequences)):
            train, test = pairs

            train_X, train_length = combine_sequences(train, self.sequences)
            test_X, test_length = combine_sequences(test, self.sequences)

            for state_index, num_states in enumerate(range(self.min_n_components, self.max_n_components + 1)):
                logL[pair_index][state_index] = float('-inf')

                try:
                    model = GaussianHMM(n_components=num_states, covariance_type='diag', n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(train_X, train_length)

                    logL[pair_index][state_index] = model.score(test_X, test_length)
                except:
                    continue

        best_num_states = self.min_n_components + np.argmax(logL.sum(axis=0))
        return self.base_model(best_num_states)
Example #18
def fit_hmm_learn(seqs, n_states, axis):
    """
    seqs is a list of numpy vectors; returns per-sample hidden-state labels.
    """
    samples = np.concatenate(seqs)
    lengths = np.array([len(s) for s in seqs])
    if len(samples) < n_states:
        # not enough samples to fit this many states
        return None
    hmm = GaussianHMM(n_components=n_states)
    hmm.fit(samples, lengths)

    ll = hmm.score(samples, lengths)
    _, labels = hmm.decode(samples, lengths)

    axis.set_title("HMM Learn (ll=%0.2f)" % ll)
    # ax2.plot(means[:, 0], means[:, 1], 'ro')
    # ax2.plot(X[:, :, 0], X[:, :, 1], 'bo')

    possible_colors = ['orange', 'blue', 'green', 'red']
    colors = [possible_colors[e] for e in labels]
    # NOTE: the scatter calls below assume seqs is a 3-D array of
    # equal-length 2-D sequences, shape (n_seqs, seq_len, 2)
    axis.scatter(seqs[:100, :, 0],
                 seqs[:100, :, 1],
                 color=colors[:100],
                 marker='^')
    axis.scatter(seqs[100:200, :, 0],
                 seqs[100:200, :, 1],
                 color=colors[100:200],
                 marker='o')
    axis.scatter(seqs[200:, :, 0],
                 seqs[200:, :, 1],
                 color=colors[200:],
                 marker='s')
    return labels
Example #19
 def select(self):
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     split_method = KFold()
     best_n = self.min_n_components
     best_score = float("-inf")
     #Iterate for all possible number of states...
     for i in range(self.min_n_components, self.max_n_components + 1):
         try:
             count = 0
             total = 0
             #for each combination of folds which result due to split method, get the train and test samples..
             for cv_train_idx, cv_test_idx in split_method.split(
                     self.sequences):
                 #now subsets must be combined based on indices given for the folds..
                 train_set, train_length = combine_sequences(
                     cv_train_idx, self.sequences)
                 test_set, test_length = combine_sequences(
                     cv_test_idx, self.sequences)
                 #now create a model using the training samples selected just now..
                 new_model = GaussianHMM(i, n_iter=1000).fit(
                     train_set, train_length)
                 #now calculate the score and test how well this newly created model is performing..
                 new_score = new_model.score(test_set, test_length)
                 total = total + new_score
                 count += 1
             avg_score = total / count
             #this average score corresponds to the performance of the model using i number of n_components..
             if (avg_score > best_score):
                 best_score = avg_score
                 best_n = i
         except:
             pass
     return GaussianHMM(best_n, n_iter=1000).fit(self.X, self.lengths)
Example #20
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # TODO implement model selection using CV
        max_logL_model = None
        max_logL_mean = -math.inf
        for n_component in range(self.min_n_components,
                                 self.max_n_components + 1):
            try:
                split_method = KFold(n_splits=min(3, len(self.sequences)))
                curr_logL = []
                curr_model = []
                for cv_train, cv_test in split_method.split(self.sequences):
                    X_train, lengths_train = combine_sequences(
                        cv_train, self.sequences)
                    X_test, lengths_test = combine_sequences(
                        cv_test, self.sequences)

                    curr_model = GaussianHMM(n_components=n_component,
                                             covariance_type="diag",
                                             n_iter=1000,
                                             random_state=self.random_state,
                                             verbose=False).fit(
                                                 X_train, lengths_train)
                    curr_logL.append(curr_model.score(X_test, lengths_test))

                curr_logL_mean = np.mean(curr_logL)
                if curr_logL_mean > max_logL_mean:
                    max_logL_model = curr_model
                    max_logL_mean = curr_logL_mean
            except:
                pass

        return max_logL_model
Example #21
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_model = None
        best_score = float("-inf")
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            if len(self.sequences) == 1:
                continue
            split_method = KFold(
                n_splits=len(self.sequences) if len(self.sequences) < 3 else 3)
            iter_scores = []
            for cv_train_idx, cv_test_idx in split_method.split(
                    self.sequences):
                try:
                    X, L = combine_sequences(cv_train_idx, self.sequences)
                    hmmmodel = GaussianHMM(n_components=num_states,
                                           covariance_type="diag",
                                           n_iter=1000,
                                           random_state=self.random_state,
                                           verbose=False).fit(X, L)
                    X_test, L_test = combine_sequences(cv_test_idx,
                                                       self.sequences)
                    logL = hmmmodel.score(X_test, L_test)
                    iter_scores.append(logL)
                except:
                    continue
            # average across folds only after all folds for this num_states are done
            if iter_scores:
                avg_iter_score = np.average(iter_scores)
                if avg_iter_score > best_score:
                    best_model = hmmmodel
                    best_score = avg_iter_score
        return best_model
Example #22
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # TODO implement model selection based on BIC scores
        best_model = None
        best_bic = float("inf")
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                hmm_model = GaussianHMM(n_components=n,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            self.X, self.lengths)
                logL = hmm_model.score(self.X, self.lengths)
                p = (n * n) + (2 * n * self.X.shape[1]) - 1
                bic = -2 * logL + p * np.log(len(self.X))

                if bic < best_bic:
                    best_bic = bic
                    best_model = hmm_model
            except:
                pass

        return best_model
Example #23
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # implement model selection using CV
        best_score = float('-inf')
        best_model = None
        word_sequences = self.sequences
        split_method = KFold(n_splits=max(2, min(3, len(word_sequences))))
        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):
            cv_fold_scores = []
            try:
                for cv_train_idx, cv_test_idx in split_method.split(
                        word_sequences):
                    train_x, train_length = combine_sequences(
                        cv_train_idx, word_sequences)
                    test_x, test_length = combine_sequences(
                        cv_test_idx, word_sequences)
                    hmm_model = GaussianHMM(n_components=num_states,
                                            covariance_type="diag",
                                            n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(
                                                train_x, train_length)
                    cv_fold_scores.append(hmm_model.score(test_x, test_length))
                cv_fold_mean = np.mean(cv_fold_scores)
                if cv_fold_mean > best_score:
                    best_score = cv_fold_mean
                    best_model = hmm_model
            except:
                continue

        return best_model
Example #24
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # implement model selection based on BIC scores
        best_score = float('inf')
        best_model = None

        for num_states in range(self.min_n_components,
                                self.max_n_components + 1):
            try:
                hmm_model = GaussianHMM(n_components=num_states,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            self.X, self.lengths)
                logL = hmm_model.score(self.X, self.lengths)
                logN = math.log(len(self.X))
                num_features = len(self.X[0])
                p = num_states**2 + 2 * num_states * num_features - 1
                bic = -2 * logL + p * logN

                if bic < best_score:
                    best_score = bic
                    best_model = hmm_model
            except:
                continue

        return best_model
Example #25
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        # TODO implement model selection using CV
        best_score = float("-inf")
        best_model = self.base_model(self.n_constant)

        if len(self.sequences)<3:
            return best_model

        for n_components in range(self.min_n_components, self.max_n_components + 1):
            splits = KFold(min(3, len(self.sequences))).split(self.sequences)
            scores = []
            for train, test in splits:
                train_X, train_lengths = combine_sequences(train, self.sequences)
                try:
                    hmm_model = GaussianHMM(n_components=n_components, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
                    test_X, test_lengths = combine_sequences(test, self.sequences)
                    scores.append(hmm_model.score(test_X, test_lengths))
                except:
                    pass

            if scores:
                avg = np.average(scores)
                if avg > best_score:
                    best_score = avg
                    best_model = self.base_model(n_components)

        return best_model
Example #26
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        # implement model selection based on BIC scores
        if self.verbose: print("="*10, "BIC", "="*50)
        scores = {}
        for n in range(self.min_n_components, self.max_n_components + 1):
            if self.verbose: print("=== n = %d" % n)
            try:
                hmm_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                if self.verbose:
                    print("model created for {} with {} states".format(self.this_word, n))
                # BIC score; the number of features comes from the data
                # (e.g. ground, normalized, delta, and polar coordinate sets)
                n_features = len(self.X[0])  # was hard-coded to 4
                p = n**2 + 2*n*n_features - 1
                # N is the number of data points, i.e. len(self.X)
                s = -2 * hmm_model.score(self.X, self.lengths) + p*np.log(len(self.X))
                if self.verbose:
                    print("Size X,lengths is %d,%d" % (len(self.X), len(self.lengths)))
                    print("Score is: %f" % s)
                scores[s] = hmm_model
            except:
                if self.verbose:
                    print("failure on {} with {} states".format(self.this_word, n))
        return scores[min(scores.keys())] if len(scores) > 0 else None
Example #27
 def select(self):
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     if len(self.sequences) < 3:
         return self.base_model(self.n_constant)
     logL = float("-inf")
     best_model = None
     for state in range(self.min_n_components, self.max_n_components + 1):
         split_data = KFold(n_splits=min(3, len(self.sequences)))
         score = 0
         count = 0
         for train_index, test_index in split_data.split(self.sequences):
             try:
                 X, lens = combine_sequences(train_index, self.sequences)
                 model = GaussianHMM(n_components=state,
                                     covariance_type="diag",
                                     n_iter=1000,
                                     random_state=self.random_state,
                                     verbose=False).fit(X, lens)
                 X, lens = combine_sequences(test_index, self.sequences)
                 score += model.score(X, lens)
                 count += 1
             except:
                 continue
         if count:
             mean = score / count
         else:
             mean = float("-inf")
         if mean > logL:
             logL = mean
             best_model = model
     return best_model
Example #28
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # TODO implement model selection using CV
        bestScore = float("-inf")
        bestModel = None
        for nComponents in range(self.min_n_components, self.max_n_components + 1):
            scores = [] 
            nSplits = 3
            model, logL = None, None
            if(len(self.sequences) < nSplits):
                break
            splitMethod = KFold(n_splits=nSplits, shuffle=True, random_state=self.random_state)
            for cv_train_ids, cv_test_ids in splitMethod.split(self.sequences):
                X_train, lengths_train = combine_sequences(cv_train_ids, self.sequences)
                X_test,  lengths_test  = combine_sequences(cv_test_ids, self.sequences)
                try:
                    model = GaussianHMM(n_components=nComponents, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(X_train, lengths_train)
                    logL = model.score(X_test, lengths_test)
                    scores.append(logL)
                except Exception as e:
                    break
            
            av = np.average(scores) if len(scores) > 0 else float("-inf")
            
            if av > bestScore:
                bestScore, bestModel = av, model
        
        return bestModel if bestModel is not None else self.base_model(self.n_constant)
Example #29
    def select(self):
        """ select the best model for self.this_word based on
                BIC score for n between self.min_n_components and self.max_n_components

                :return: GaussianHMM object
                """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        best_model = None
        best_score = float("inf")
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                hmmmodel = GaussianHMM(n_components=num_states,
                                       covariance_type="diag",
                                       n_iter=1000,
                                       random_state=self.random_state,
                                       verbose=False).fit(
                                           self.X, self.lengths)
                logL = hmmmodel.score(self.X, self.lengths)
                initialStateProbs = num_states
                transitionProbs = num_states * (num_states - 1)
                # means and diagonal covariances: n states x d features each
                emissionProbs = 2 * num_states * hmmmodel.n_features
                p = initialStateProbs + transitionProbs + emissionProbs
                BIC_Score = -2 * logL + p * np.log(len(self.X))
                if BIC_Score < best_score:
                    best_score = BIC_Score
                    best_model = hmmmodel
            except:
                continue
        return best_model
Example #30
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        best_model = self.base_model(self.n_constant)
        best_LogL = float('-inf')
        if len(self.sequences) < 3:
            return best_model
        split_method = KFold()
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                folds = 0
                total_LogL = 0
                for cv_train_idx, cv_test_idx in split_method.split(
                        self.sequences):
                    folds += 1
                    X_train, X_lengths_train = combine_sequences(
                        cv_train_idx, self.sequences)
                    X_test, X_lengths_test = combine_sequences(
                        cv_test_idx, self.sequences)
                    model = GaussianHMM(n_components=n,
                                        covariance_type="diag",
                                        n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(
                                            X_train, X_lengths_train)
                    total_LogL += model.score(X_test, X_lengths_test)
                average_LogL = total_LogL / folds
                if average_LogL > best_LogL:
                    best_model = model
                    best_LogL = average_LogL
            except:
                continue

        return best_model
Example #31
 def cv_loop(num_components):
     """ CV loop helper (nested inside select, hence the use of self) """
     logLs = []
     # I thought I needed to do something like this (as it was failing for FISH) but I confirmed it using the forums: https://discussions.udacity.com/t/selectorcv-fails-to-train-fish/338796
     split_method = KFold(n_splits=min(3, len(self.sequences)))
     # for each fold
     for cv_train_idx, cv_test_idx in split_method.split(
             self.sequences):
         try:
             # we get X and lengths for both train and test set
             X_train, lengths_train = combine_sequences(
                 cv_train_idx, self.sequences)
             X_test, lengths_test = combine_sequences(
                 cv_test_idx, self.sequences)
             # we train the model
             current_model = GaussianHMM(n_components=num_components,
                                         covariance_type="diag",
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(
                                             X_train, lengths_train)
             # and we append the logL to our list
             logLs.append(current_model.score(X_test, lengths_test))
         except:
             # copied from the function above (base_model)
             if self.verbose:
                 print(
                     "failure on {} with {} states, continuing".format(
                         self.this_word, num_components))
             continue
     # if we found at least one logL we return the average
     if len(logLs) > 0:
         return (sum(logLs) / len(logLs))
     else:
         return float('-Inf')
Example #32
    def select(self):
        """ select the best model for self.this_word based on
        BIC score for n between self.min_n_components and self.max_n_components

        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        
        best = (None, float('inf')) # Tuple (model, BIC score)

        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                # Train HMM
                model = GaussianHMM(n_components=n, n_iter=1000,
                                    random_state=self.random_state).fit(self.X, self.lengths)
    
                logL = model.score(self.X, self.lengths)
                logN = np.log(len(self.X))  # N is the number of data points
                p = n ** 2 + 2 * n * model.n_features - 1  # p is the number of parameters
    
                # Calculate BIC (Bayesian Information Criteria) score      
                score = -2 * logL + p * logN
                # If BIC score is better than previous best, store model and the score
                if score < best[1]:
                    best = model, score
            except:
                pass

        return best[0]
Example #33
 def select(self):
     warnings.filterwarnings("ignore", category=DeprecationWarning)
     word_sequences = self.sequences
     split_method = KFold(n_splits=max(2, min(5, len(word_sequences))))
     max_score = None
     best_model = None
     for num_states in range(self.min_n_components,
                             self.max_n_components + 1):
         try:
             scores_list = []
             for cv_train_idx, cv_test_idx in split_method.split(
                     word_sequences):
                 train_data, train_length = combine_sequences(
                     cv_train_idx, word_sequences)
                 test_data, test_length = combine_sequences(
                     cv_test_idx, word_sequences)
                 hmm_model = GaussianHMM(n_components=num_states,
                                         covariance_type="diag",
                                         n_iter=1000,
                                         random_state=self.random_state,
                                         verbose=False).fit(
                                             train_data, train_length)
                 score = hmm_model.score(test_data, test_length)
                 scores_list.append(score)
             tmp_score = np.mean(scores_list)
             if max_score is None or tmp_score > max_score:
                 max_score = tmp_score
                 best_model = hmm_model
         except:
             continue
     return best_model
Example #34
 def run_model(self, n_components, X_train, lengths_train, X_test,
               lengths_test):
     model = GaussianHMM(n_components=n_components, covariance_type="diag",
                         n_iter=1000, random_state=self.random_state,
                         verbose=False).fit(X_train, lengths_train)
     logL = model.score(X_test, lengths_test)
     return logL
Example #35
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        best_score = float('-inf')
        best_model = None
        if len(self.sequences) >= 3:
            n_splits = min(3, len(self.sequences))
            splits = KFold(n_splits)
            for n in range(self.min_n_components, self.max_n_components+1):
                # reset the fold totals for each candidate n, otherwise the
                # running average mixes scores across different state counts
                sum_score = 0.0
                counter = 0.0
                try:
                    for train_index, test_index in splits.split(self.sequences):
                        # used forum code to get train/test X,Lengths respectively: https://discussions.udacity.com/t/selectorcv-crashes/400125
                        train_X, train_lengths = combine_sequences(train_index, self.sequences)
                        test_X, test_lengths = combine_sequences(test_index, self.sequences)
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                                random_state=self.random_state, verbose=False).fit(train_X, train_lengths)
                        score = model.score(test_X, test_lengths)
                        sum_score += score
                        counter += 1
                    # used average score from udacity forum: https://discussions.udacity.com/t/my-selectorcv-class/349110
                    average_score = sum_score / counter
                    if average_score > best_score:
                        best_score = average_score
                        best_model = model
                except:
                    continue
        # for models with length less than 3
        else:
            best_score_1 = float('inf')
            best_model = None
            for n in range(self.min_n_components, self.max_n_components+1):
                try:
                    model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    logL = model.score(self.X, self.lengths)
                    # equation from udacity forum: https://discussions.udacity.com/t/how-to-start-coding-the-selectors/476905/10
                    p = n ** 2 + 2 * n * len(self.X[0])  - 1
                    N = len(self.X)
                    score_1 = -2 * logL + p * np.log(N)
                    if score_1 < best_score_1:
                        best_score_1 = score_1
                        best_model = model
                except:
                    continue
        return best_model
Example #36
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        max_score = None
        max_model = None

        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                all_score = 0.0
                qty = 0
                final_model = None
                if (len(self.sequences) >= 2):
                    # Generate K folds
                    folds = min(len(self.sequences),3)
                    split_method = KFold(shuffle=True, n_splits=folds)
                    parts = split_method.split(self.sequences)
                    for cv_train_idx, cv_test_idx in parts:
                        # KFold indices combined into train data
                        X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                        # KFold indices combined into test data
                        X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                        # Fit model with train data
                        model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(X_train, lengths_train)
                        # Get score using test data
                        all_score = all_score+model.score(X_test,lengths_test)
                        qty = qty+1
                    # Calculate score
                    score = all_score / qty
                else:
                    # cant be fold
                    final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    score = final_model.score(self.X, self.lengths)
                # Keep model with best score
                if max_score is None or max_score < score:
                    max_score = score
                    if final_model is None:
                        final_model = GaussianHMM(n_components=n, covariance_type="diag", n_iter=1000,
                                                  random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
                    max_model = final_model

            except:
                pass

        return max_model
Example #37
File: hmm.py Project: mkdmkk/infaas
class HMM:
    __slots__ = [
        "model",
        "data",
        "hidden_states"
    ]

    def __init__(self):
        pass


    def draw(self, data):
        figure()
        plot(range(len(data)), data, alpha=0.8, color='red')
        show()


    def train(self, data, n_components):
        print("Training Data: %s" % data)
        self.data = data
        self.model = GaussianHMM(n_components, algorithm='viterbi', covariance_type='diag')
        X = np.reshape(data, (len(data), 1))
        # modern hmmlearn expects a single 2-D array, not a list of arrays
        self.model = self.model.fit(X)

        self.hidden_states = self.model.predict(X)
        print("Sequence of States: %s" % self.hidden_states)


    def eval(self, obs):
        print("Testing Data: %s" % obs)
        X = np.reshape(obs, (len(obs),1))
        print("Eval: %s" % str(self.model.score(X)))


    def plot(self):
        fig = figure(facecolor="white")
        ax = fig.add_subplot(111)

        for i in range(self.model.n_components):
            # use fancy indexing to plot data in each state
            idx = (self.hidden_states == i)
            ax.plot(np.array(range(len(self.data)))[idx], np.array(self.data)[idx], '.', label="State %d" % (i+1))

        ax.legend()
        show()
Example #38
class HmmClassifier():
    def __init__(self, referenceSeqs, inputSeq):
        self.referenceSeqs = referenceSeqs
        self.inputSeq = inputSeq

        # feel free to change this model
        self.model = GaussianHMM(n_components=2, covariance_type="full", n_iter=2000)

    def predict(self):
        probs = []
        for referenceSeq in self.referenceSeqs:
            # fit to each reference sequence, then score the input against it
            self.model.fit(referenceSeq)
            prob = self.model.score(self.inputSeq)
            probs.append(prob)

        # return the index of the max prob
        return probs.index(max(probs))
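A caveat for the two classes above: they use the old hmmlearn API, where fit() accepted a list of sequences. Modern hmmlearn (>= 0.2) expects a single 2-D array plus a lengths vector. A sketch of the current calling convention, on made-up data:

import numpy as np
from hmmlearn.hmm import GaussianHMM

seq_a = np.random.randn(30, 2)  # toy sequences for illustration
seq_b = np.random.randn(25, 2)
X = np.concatenate([seq_a, seq_b])  # stack sequences into one 2-D array
model = GaussianHMM(n_components=2, covariance_type="full", n_iter=200)
model.fit(X, lengths=[len(seq_a), len(seq_b)])  # lengths marks the boundaries
print(model.score(X, lengths=[len(seq_a), len(seq_b)]))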
Example #39
    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Model selection using CV
        record = float("-inf")

        # Cap the number of states at the shortest sequence length
        min_seq = min(len(seq) for seq in self.sequences)
        self.max_n_components = min(self.max_n_components, min_seq)
        hmm_model = self.base_model(self.n_constant)
        if len(self.sequences) == 1:
            return hmm_model
        elif len(self.sequences) == 2:
            split_method = KFold(n_splits=2)
        else:
            split_method = KFold(n_splits=3, shuffle=True,
                                 random_state=self.random_state)

        for num in range(self.min_n_components, self.max_n_components + 1):
            logL = 0
            cnt = 0

            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                X, lengths = combine_sequences(cv_train_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=num, n_iter=1000).fit(X, lengths)
                    X, lengths = combine_sequences(cv_test_idx, self.sequences)
                    logL += model.score(X, lengths)
                    cnt += 1  # count only the folds that trained and scored
                except:
                    continue
            # compare the average log likelihood across successful folds
            if cnt > 0 and logL / cnt > record:
                record = logL / cnt
                hmm_model = model
        return hmm_model
Example #40
class HMM(object):

    def __init__(self):

        def setup():

            def load_patterns(file):
                patterns = None

                f = open(file, 'rb')
                data = f.readlines()

                stack = []
                for i in range(np.shape(data)[0]):
                    # list(...) is needed on Python 3, where map returns an iterator
                    data2 = np.reshape(list(map(float, data[i].split())), (1, -1))
                    if i == 0:
                        stack = data2
                    else:
                        stack = np.vstack((stack, data2))

                f.close()

                if patterns is None:
                    patterns = stack
                else:
                    patterns = np.vstack((patterns, stack))

                return patterns

            hidden = 1

            self.go_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('go.bin'))

            self.back_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('back.bin'))

            self.right_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('right.bin'))

            self.left_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('left.bin'))

            self.stop_model = GaussianHMM(n_components=hidden, covariance_type="diag", n_iter=10000).fit(
                load_patterns('stop.bin'))

        setup()
        self.number_of_components = 5

    def match(self, pattern):

        probabilities = np.zeros(5)
        probabilities[0] = self.go_model.score(np.reshape(pattern, (1, -1)))
        probabilities[1] = self.back_model.score(np.reshape(pattern, (1, -1)))
        probabilities[2] = self.right_model.score(np.reshape(pattern, (1, -1)))
        probabilities[3] = self.left_model.score(np.reshape(pattern, (1, -1)))
        probabilities[4] = self.stop_model.score(np.reshape(pattern, (1, -1)))

        probabilities = abs(probabilities)

        index, error = min(enumerate(probabilities), key=lambda x: x[1])

        if error < 9500:
            return index
        return -1
Example #41
tot_words = len(correct_answers)
right = 0.0
threshold = 1.5

for i in xrange(tot_words):
	try:
		(rate,sig) = wav.read('Test/'+test_folder+"/word" + str(i) + ".wav")
		features = get_features(sig)
		word_len = len(features)
		ans = -1
		j = -1
		max_ans = -1e9
		for model in models:
			j = j+1
			if math.fabs(word_len - means[j]) <= threshold * std_devs[j]:
				temp = model.score(features)
				if temp>max_ans:
					max_ans = temp
					ans = j
	
		#print max_ans
		print str(i+1)+". Detected word: "+spoken[ans]
		if spoken[ans] == correct_answers[i][0]:
			right = right + 1
	except:
		break
	    
cur_accuracy = (right/tot_words)*100
print "Accuracy = "+str(cur_accuracy)+"%"

if cur_accuracy > accuracy:
Example #42
spx_price = spx_price['Close']
spx_price.name = 'SPX Index'
spx_ret = spx_price.pct_change()  # daily simple returns: price_t / price_{t-1} - 1
spx_ret.dropna(inplace=True)
#spx_ret = spx_ret * 1000.0
rets = np.column_stack([spx_ret])

# Create the Gaussian Hidden markov Model and fit it
# to the SPY returns data, outputting a score
hmm_model = GaussianHMM(
    n_components=3,                     # number of states
    covariance_type="full",             # full covariance matrix vs diagonal
    n_iter=1000                         # number of iterations
).fit(rets)

print("Model Score:", hmm_model.score(rets))

# Plot the in sample hidden states closing values
# Predict the hidden states array
hidden_states = hmm_model.predict(rets)

print('Fraction of time spent in hidden state 1 = %f' % np.mean(hidden_states == 1))

print("Transition matrix")
print(hmm_model.transmat_)

print("Means and vars of each hidden state")
for i in range(hmm_model.n_components):   # inspect each state's moments to label regimes
    print("{0}th hidden state".format(i))
    print("mean = ", hmm_model.means_[i])
    print("var = ", np.diag(hmm_model.covars_[i]))
Example #43
    for grp, files in filesorter.groupby('group'):
        list_of_datasets = []
        lengths = []
        for fn in files.index:
            try:
                fbf = pd.read_pickle(files.loc[fn, 'filepath'])  # .ix is removed in modern pandas
                x_ = np.column_stack([fbf[i + '_smoothed'] for i in parameters])
                list_of_datasets.append(x_)
                lengths.append(len(x_))
            except:
                print('failed to complete:', grp, fn)
        X = np.concatenate(list_of_datasets)
        np.save(DATADIR + 'HMM_JAR/' + grp + '_X.npy', X)
        np.save(DATADIR + 'HMM_JAR/' + grp + '_lengths.npy', lengths)
        model = GaussianHMM(n_components=n_components, covariance_type="diag", n_iter=1000).fit(X, lengths)
        likelihoods.append(model.score(X, lengths))  # pass lengths so X isn't scored as one long sequence
        joblib.dump(model, DATADIR + 'HMM_JAR/' + grp + '_model.pkl')
        groups.append(grp)

    #  MAKE ONE MODEL PER TREATMENT:

    for grp in treatments:
        
        list_of_datasets = [] 
        for data in glob.glob( DATADIR + 'HMM_JAR/*'+ grp +'_X.npy'):
            list_of_datasets.append(np.load(data))
        X = np.concatenate(list_of_datasets)
        lengths = []
        for l in glob.glob( DATADIR + 'HMM_JAR/*'+ grp +'_lengths.npy'):
            lengths.append(np.load(l))
        lengths = np.concatenate(lengths)