def select(self):
    """Select the best number of HMM states by K-fold cross-validation.

    For each candidate state count, fits a model on the training folds
    only and averages its log-likelihood over the held-out test folds.
    Returns a model refit on all data with the best-scoring state count,
    falling back to ``self.n_constant`` when no candidate scores.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    NB_SPLITS = 3
    mean_scores = []
    scored_components = []  # candidate n values that produced a mean score
    # Not enough sequences to fold: fall back to the constant model.
    if len(self.sequences) < NB_SPLITS:
        return self.base_model(self.n_constant)
    # Fix: random_state with shuffle=False is rejected by modern sklearn.
    split_method = KFold(n_splits=NB_SPLITS)
    for n_component in range(self.min_n_components, self.max_n_components + 1):
        # Fix: per-candidate try, so one failing n does not abort the rest.
        try:
            kfold_scores = []
            for train_idx, test_idx in split_method.split(self.sequences):
                # Fix: fit on the TRAINING folds only. The original fit via
                # base_model on ALL data, leaking the test folds into training.
                train_X, train_length = combine_sequences(train_idx, self.sequences)
                model = GaussianHMM(n_components=n_component, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False).fit(train_X, train_length)
                test_X, test_length = combine_sequences(test_idx, self.sequences)
                kfold_scores.append(model.score(test_X, test_length))
            mean_scores.append(np.mean(kfold_scores))
            scored_components.append(n_component)
        except Exception:
            continue
    if mean_scores:
        states = scored_components[int(np.argmax(mean_scores))]
    else:
        states = self.n_constant
    return self.base_model(states)
def select(self):
    """Select a model by CV mean log-likelihood, with a BIC fallback.

    With >= 3 sequences: K-fold CV; the candidate whose mean held-out
    log-likelihood is highest wins. With fewer sequences: BIC on the
    full data (lower BIC is better).
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_model = None
    if len(self.sequences) >= 3:
        n_splits = min(3, len(self.sequences))
        splits = KFold(n_splits)
        for n in range(self.min_n_components, self.max_n_components + 1):
            # Fix: reset the running sum/count for EACH candidate. The
            # original accumulated across all n values, so every "average"
            # was contaminated by earlier candidates' scores.
            sum_score = 0.0
            counter = 0.0
            try:
                for train_index, test_index in splits.split(self.sequences):
                    train_X, train_lengths = combine_sequences(train_index, self.sequences)
                    test_X, test_lengths = combine_sequences(test_index, self.sequences)
                    model = GaussianHMM(n_components=n, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(train_X, train_lengths)
                    sum_score += model.score(test_X, test_lengths)
                    counter += 1
                average_score = sum_score / counter
                if average_score > best_score:
                    best_score = average_score
                    best_model = model
            except Exception:
                continue
    else:
        # Too few sequences to fold: rank by BIC = -2*logL + p*log(N).
        best_score_1 = float('inf')
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                model = GaussianHMM(n_components=n, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False).fit(self.X, self.lengths)
                logL = model.score(self.X, self.lengths)
                p = n ** 2 + 2 * n * len(self.X[0]) - 1  # free parameter count
                N = len(self.X)
                score_1 = -2 * logL + p * np.log(N)
                if score_1 < best_score_1:
                    best_score_1 = score_1
                    best_model = model
            except Exception:
                continue
    return best_model
def select(self):
    """Select the state count with the best CV score; refit on all data.

    With >= 2 sequences, scores each candidate by shuffled K-fold CV;
    otherwise fits and scores directly on the full data.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    max_score = None
    max_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            all_score = 0.0
            qty = 0
            final_model = None
            if len(self.sequences) >= 2:
                folds = min(len(self.sequences), 3)
                split_method = KFold(shuffle=True, n_splits=folds)
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    # Fix: combine_sequences returns a (X, lengths) tuple;
                    # the original wrapped it in np.asarray for no reason.
                    X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                    X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                    model = GaussianHMM(n_components=n, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(X_train, lengths_train)
                    all_score = all_score + model.score(X_test, lengths_test)
                    qty = qty + 1
                score = all_score / qty
            else:
                final_model = GaussianHMM(n_components=n, covariance_type="diag",
                                          n_iter=1000, random_state=self.random_state,
                                          verbose=False).fit(self.X, self.lengths)
                # Fix: score the model just fit; the original referenced an
                # undefined name `model` here and always raised NameError.
                score = final_model.score(self.X, self.lengths)
            if max_score is None or max_score < score:
                max_score = score
                if final_model is None:
                    # Refit the winning candidate on the full data set.
                    final_model = GaussianHMM(n_components=n, covariance_type="diag",
                                              n_iter=1000, random_state=self.random_state,
                                              verbose=False).fit(self.X, self.lengths)
                max_model = final_model
        except Exception:
            pass
    return max_model
def select(self):
    """Select the state count with the best average CV log-likelihood.

    Caps max_n_components at the shortest sequence length, folds the
    sequences (2 or 3 splits), and keeps the candidate with the highest
    mean held-out score. Falls back to ``n_constant`` on failure.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    record = float("-inf")
    min_seq = min(len(seq) for seq in self.sequences)
    self.max_n_components = min(self.max_n_components, min_seq)
    hmm_model = self.base_model(self.n_constant)
    if len(self.sequences) == 1:
        return hmm_model
    elif len(self.sequences) == 2:
        split_method = KFold(n_splits=2)
    else:
        # Fix: random_state with shuffle=False is rejected by modern sklearn.
        split_method = KFold(n_splits=3)
    for num in range(self.min_n_components, self.max_n_components + 1):
        logL = 0
        cnt = 0
        for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
            X, lengths = combine_sequences(cv_train_idx, self.sequences)
            try:
                model = GaussianHMM(n_components=num, n_iter=1000).fit(X, lengths)
                X, lengths = combine_sequences(cv_test_idx, self.sequences)
                logL += model.score(X, lengths)
                # Fix: the original never incremented cnt, so `cnt > 0` was
                # never true and no CV model was ever selected.
                cnt += 1
            except Exception:
                continue
        if cnt > 0 and logL / cnt > record:
            # Fix: store the AVERAGE that was compared, not the raw sum.
            record = logL / cnt
            hmm_model = model
    return hmm_model
def select(self):
    """Choose the state count whose mean held-out log-likelihood is best.

    Fits each candidate on the training folds, averages its scores on the
    test folds, and returns the winning fitted model (or the constant-n
    fallback when every candidate fails).
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    folds = KFold()
    top_score, top_model = float("-inf"), None
    for size in range(self.min_n_components, self.max_n_components + 1):
        fold_scores = []
        candidate = None
        try:
            for train_idx, test_idx in folds.split(self.sequences):
                train_X, train_len = combine_sequences(train_idx, self.sequences)
                eval_X, eval_len = combine_sequences(test_idx, self.sequences)
                candidate = GaussianHMM(n_components=size, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(train_X, train_len)
                fold_scores.append(candidate.score(eval_X, eval_len))
            mean_logL = np.mean(fold_scores)
            if mean_logL > top_score:
                top_score, top_model = mean_logL, candidate
        except:
            pass
    if top_model is None:
        return self.base_model(self.n_constant)
    return top_model
def select(self):
    """Select the best state count by CV; refit it on the full data set.

    Averages held-out log-likelihoods per candidate and returns
    ``base_model(best_n)`` trained on all data, or None if nothing scored.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_n = None
    best_avg_logL = float("-inf")
    # Fix: the split count should come from the amount of data, not from
    # min_n_components; clamp to the usual [2, 3] range.
    n_splits = min(3, len(self.lengths))
    if n_splits < 2:
        n_splits = 2
    split_method = KFold(n_splits=n_splits)
    # Fix: include max_n_components (the original range excluded it).
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            folds = total_logL = 0
            for train_idx, test_idx in split_method.split(self.sequences):
                train_X, train_lengths = combine_sequences(train_idx, self.sequences)
                test_X, test_lengths = combine_sequences(test_idx, self.sequences)
                try:
                    model = GaussianHMM(n, n_iter=1000,
                                        random_state=self.random_state).fit(
                                            train_X, train_lengths)
                    total_logL += model.score(test_X, test_lengths)
                    # Fix: count only folds that actually produced a score,
                    # so failed folds do not dilute the average.
                    folds += 1
                except Exception:
                    pass
            if folds == 0:
                # Fix: a candidate with no scored folds must not win with an
                # average of 0 (log-likelihoods are normally negative).
                continue
            avg_logL = total_logL / folds
            if avg_logL > best_avg_logL:
                best_avg_logL = avg_logL
                best_n = n
        except Exception:
            pass
    # Retrain the winner on the FULL data set, not a fold.
    if best_n is not None:
        return self.base_model(best_n)
    return None
def select(self):
    """Select the number of states by K-fold cross-validation.

    For each candidate n, fits on the training folds, averages the
    held-out log-likelihoods, and finally refits the best n on all data.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_n = self.n_constant
    # Fix: KFold rejects random_state when shuffle=False (modern sklearn
    # raises ValueError); shuffle explicitly so the seed is meaningful.
    # Also clamp n_splits so few-sequence words do not fail every split.
    n_splits = min(3, len(self.sequences))
    if n_splits < 2:
        return self.base_model(best_n)
    split_method = KFold(n_splits=n_splits, shuffle=True,
                         random_state=self.random_state)
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            scores = []
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                train_X, train_lengths = combine_sequences(cv_train_idx, self.sequences)
                test_X, test_lengths = combine_sequences(cv_test_idx, self.sequences)
                model = GaussianHMM(n_components=n, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False).fit(train_X, train_lengths)
                scores.append(model.score(test_X, test_lengths))
            mean_score = np.mean(scores)
            if mean_score > best_score:
                best_score = mean_score
                best_n = n
        except Exception:
            if self.verbose:
                print("CV: failure on {} with {} states".format(self.this_word, n))
    return self.base_model(best_n)
def select(self):
    """Pick the model with the highest MEAN held-out log-likelihood.

    2-fold shuffled CV per candidate; on any failure, returns the best
    model found so far (None if nothing succeeded).
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    split_method = KFold(n_splits=2, shuffle=True)
    highest_logL = float('-inf')
    best_model = None
    try:
        for n_components in range(self.min_n_components, self.max_n_components + 1):
            fold_logLs = []
            model = None
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                x_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                model = GaussianHMM(n_components=n_components, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False).fit(x_train, lengths_train)
                x_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                fold_logLs.append(model.score(x_test, lengths_test))
            # Fix: compare candidates by their mean across folds. The
            # original compared individual fold scores, so a single lucky
            # fold could win regardless of the candidate's overall quality.
            mean_logL = sum(fold_logLs) / len(fold_logLs)
            if mean_logL > highest_logL:
                highest_logL = mean_logL
                best_model = model
        return best_model
    except Exception:
        # Best-effort: return whatever has been found so far.
        return best_model
def select(self):
    """Select the model with the best mean CV log-likelihood."""
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    score_cv = float('-inf')
    model_cv = None
    # Fix: guard the KFold construction — with a single sequence the
    # original crashed uncaught at KFold(n_splits=1).
    if len(self.lengths) < 2:
        return None
    kf = KFold(n_splits=min(3, len(self.lengths)))
    for component_n in range(self.min_n_components, self.max_n_components + 1):
        # Fix: restart the score list for EACH candidate. The original
        # accumulated across all candidates, contaminating every mean.
        logL_array = []
        model = None
        for train_n, test_n in kf.split(self.sequences):
            try:
                x_train, len_train = combine_sequences(train_n, self.sequences)
                x_test, len_test = combine_sequences(test_n, self.sequences)
                model = GaussianHMM(n_components=component_n,
                                    n_iter=1000).fit(x_train, len_train)
                logL_array.append(model.score(x_test, len_test))
            except Exception:
                pass
        if not logL_array or model is None:
            # Fix: np.mean([]) is NaN, and `model` could be a stale object
            # from an earlier candidate.
            continue
        mean_score = np.mean(logL_array)
        if mean_score > score_cv:
            score_cv = mean_score
            model_cv = model
    return model_cv
def select(self):
    """Cross-validated selection: the best mean held-out score wins.

    With more than two sequences, averages test-fold log-likelihoods per
    candidate; otherwise scores a base model directly on the full data.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    top_score, top_model = float('-inf'), None
    kfold = KFold()
    for n_states in range(self.min_n_components, self.max_n_components + 1):
        running, n_folds = 0, 0
        try:
            if len(self.sequences) > 2:
                for tr_idx, te_idx in kfold.split(self.sequences):
                    tr_X, tr_len = combine_sequences(tr_idx, self.sequences)
                    te_X, te_len = combine_sequences(te_idx, self.sequences)
                    candidate = GaussianHMM(n_components=n_states,
                                            covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(tr_X, tr_len)
                    running += candidate.score(te_X, te_len)
                    n_folds += 1
                mean_logL = running / n_folds
            else:
                candidate = self.base_model(n_states)
                mean_logL = candidate.score(self.X, self.lengths)
            if mean_logL > top_score:
                top_score, top_model = mean_logL, candidate
        except:
            pass
    return top_model
def select(self):
    """Cross-validate each candidate, then return a full-data refit.

    With > 2 sequences: 3-fold CV per candidate, refit the candidate on
    all data, and keep the best mean score. Otherwise fit a constant-n
    model directly. Returns None when every attempt fails.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    split_method = KFold(n_splits=3)
    model_and_scores = []
    # Fix: initialize — the original raised NameError on return when the
    # small-data fallback fit failed.
    best_model = None
    if len(self.sequences) > 2:
        for current_n_components in range(self.min_n_components, self.max_n_components + 1):
            scores = []
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                train_X, train_lenghts = combine_sequences(cv_train_idx, self.sequences)
                test_X, test_lenghts = combine_sequences(cv_test_idx, self.sequences)
                try:
                    hmm_model = GaussianHMM(n_components=current_n_components,
                                            covariance_type="diag", n_iter=1000,
                                            random_state=self.random_state,
                                            verbose=False).fit(train_X, train_lenghts)
                    scores.append(hmm_model.score(test_X, test_lenghts))
                    if self.verbose:
                        print("model created for {} with {} states".format(
                            self.this_word, current_n_components))
                except Exception:
                    if self.verbose:
                        print("failure on {} with {} states".format(
                            self.this_word, current_n_components))
            if not scores:
                # Fix: np.mean([]) is NaN and would poison max() below.
                continue
            # Before ranking, retrain the candidate on the FULL data set.
            try:
                hmm_model = GaussianHMM(n_components=current_n_components,
                                        covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(self.X, self.lengths)
            except Exception:
                continue
            model_and_scores.append([hmm_model, np.mean(scores)])
        if model_and_scores:
            best_model, _ = max(model_and_scores, key=lambda item: item[1])
    else:
        try:
            best_model = GaussianHMM(n_components=self.n_constant,
                                     covariance_type="diag", n_iter=1000,
                                     random_state=self.random_state,
                                     verbose=False).fit(self.X, self.lengths)
        except Exception:
            if self.verbose:
                print("failure on {} altogether".format(self.this_word))
    return best_model
def select(self):
    """Select the state count with the best 2-fold CV log-likelihood.

    Returns the winning fitted model, or None when every candidate fails.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    winner, winning_score = None, float('-inf')
    for n_states in range(self.min_n_components, self.max_n_components + 1):
        try:
            folds = KFold(n_splits=2)
            per_fold = []
            candidate = None
            for idx_train, idx_test in folds.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_train, self.sequences)
                X_te, len_te = combine_sequences(idx_test, self.sequences)
                candidate = GaussianHMM(n_components=n_states,
                                        covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(X_tr, len_tr)
                if candidate:
                    per_fold.append(candidate.score(X_te, len_te))
            mean_logL = np.mean(per_fold)
            if mean_logL > winning_score:
                winning_score, winner = mean_logL, candidate
        except:
            pass
    return winner
def select(self):
    """Select the best model by CV, without corrupting instance state.

    base_model() trains on self.X/self.lengths, so the training fold is
    injected there per split; the originals are restored afterwards.
    Returns the best-scoring fitted model or None.
    """
    #warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    kf = KFold(n_splits=3, shuffle=False, random_state=None)
    cv_scores = []
    # Keep the full data so instance state can be restored (the original
    # left self.X/self.lengths pointing at the last training fold forever).
    full_X, full_lengths = self.X, self.lengths
    try:
        for states in range(self.min_n_components, self.max_n_components + 1):
            # Fix: per-candidate score list — the original accumulated
            # across ALL candidates and only appended the LAST fold's score.
            likelihoods = []
            try:
                if len(self.sequences) > 2:
                    for train_index, test_index in kf.split(self.sequences):
                        self.X, self.lengths = combine_sequences(
                            train_index, self.sequences)
                        X_test, lengths_test = combine_sequences(
                            test_index, self.sequences)
                        hmm_model = self.base_model(states)
                        # Fix: record EVERY fold's score, not just the last.
                        likelihoods.append(hmm_model.score(X_test, lengths_test))
                else:
                    hmm_model = self.base_model(states)
                    likelihoods.append(hmm_model.score(self.X, self.lengths))
                score_cvs_avg = np.mean(likelihoods)
                cv_scores.append(tuple([score_cvs_avg, hmm_model]))
            except Exception:
                pass
    finally:
        # Fix: restore the full data set for any later use of the instance.
        self.X, self.lengths = full_X, full_lengths
    return max(cv_scores, key=lambda x: x[0])[1] if cv_scores else None
def select(self):
    """CV selection: best mean held-out log-likelihood across folds."""
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = -float("Inf")
    best_model = None
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        # Fix: per-candidate score list. The original kept appending to one
        # shared list across all candidates, so every mean mixed in the
        # scores of previously evaluated state counts.
        logs_array = []
        try:
            split_method = KFold(n_splits=min(3, len(self.sequences)))
            # Cross-validate: fit on training folds, score on test folds.
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                X_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                X_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                model = GaussianHMM(n_components=num_states,
                                    n_iter=1000).fit(X_train, lengths_train)
                logs_array.append(model.score(X_test, lengths_test))
            mean_score = np.mean(logs_array)
            if mean_score > best_score:
                best_score = mean_score
                best_model = model
        except Exception:
            pass
    return best_model
def select(self):
    """2-fold CV; return the model with the highest mean test likelihood.

    One estimator instance is created per candidate and refit per fold;
    candidates that fail anywhere are skipped. Returns None when the
    ranking list ends up empty.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    ranked = []
    kf = KFold(n_splits=2)
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            candidate = GaussianHMM(n_components=n, covariance_type="diag",
                                    n_iter=1000, random_state=self.random_state,
                                    verbose=False)
            fold_logls = []
            for idx_train, idx_test in kf.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_train, self.sequences)
                candidate.fit(X_tr, len_tr)
                X_te, len_te = combine_sequences(idx_test, self.sequences)
                fold_logls.append(candidate.score(X_te, len_te))
            ranked.append((np.mean(fold_logls), candidate))
        except:
            continue
    if not ranked:
        return None
    _, chosen = max(ranked, key=lambda pair: pair[0])
    return chosen
def select(self):
    """CV model selection; falls back to the constant model for tiny data.

    For each candidate, refits a base model per training fold and averages
    the test-fold log-likelihoods; the best-averaging fitted model wins.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    champion, champion_score = None, float('-inf')
    # Single sequence cannot be folded at all.
    if len(self.sequences) < 2:
        return self.base_model(self.n_constant)
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            fold_scores = []
            candidate = self.base_model(n)
            kf = KFold(n_splits=min(3, len(self.lengths)))
            for idx_tr, idx_te in kf.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_tr, self.sequences)
                X_te, len_te = combine_sequences(idx_te, self.sequences)
                candidate.fit(X_tr, len_tr)
                fold_scores.append(candidate.score(X_te, len_te))
            mean_logL = np.mean(fold_scores)
            if mean_logL > champion_score:
                champion_score, champion = mean_logL, candidate
        except:
            pass
    return champion
def select(self):
    """CV over candidate state counts; refit the winner on the full data.

    Candidates are ranked by the summed held-out log-likelihood over the
    same folds (equivalent ordering to the mean). Returns None when no
    candidate scores or the final refit fails.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    chosen_n = None
    best_total = float('-inf')
    try:
        folds = KFold(n_splits=min(3, len(self.sequences)))
    except:
        return None
    for candidate_n in range(self.min_n_components, self.max_n_components + 1):
        try:
            total = 0
            for idx_tr, idx_te in folds.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_tr, self.sequences)
                X_te, len_te = combine_sequences(idx_te, self.sequences)
                candidate = GaussianHMM(n_components=candidate_n,
                                        covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(X_tr, len_tr)
                total += candidate.score(X_te, len_te)
            if total > best_total:
                best_total = total
                chosen_n = candidate_n
        except:
            pass
    if self.verbose and chosen_n is None:
        print('Error MIHMRNLR: this_word={}'.format(self.this_word))
        return None
    try:
        # Final model is trained on the FULL data with the winning n.
        return GaussianHMM(n_components=chosen_n, covariance_type="diag",
                           n_iter=1000, random_state=self.random_state,
                           verbose=False).fit(self.X, self.lengths)
    except:
        return None
def select(self):
    """K-fold CV selection; returns the best-scoring fitted model.

    Splits into min(len(lengths), 3) folds; a single sequence short-circuits
    to the constant-n base model.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    top_score = -float('inf')
    top_n = self.min_n_components
    top_model = None
    fold_count = min(len(self.lengths), 3)
    if fold_count == 1:
        return self.base_model(self.n_constant)
    splitter = KFold(fold_count)
    for n in range(self.min_n_components, self.max_n_components + 1):
        fold_logls = []
        fitted_folds = 0
        try:
            for idx_tr, idx_te in splitter.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_tr, self.sequences)
                X_te, len_te = combine_sequences(idx_te, self.sequences)
                candidate = GaussianHMM(n_components=n, covariance_type='diag',
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(X_tr, len_tr)
                fitted_folds += 1
                fold_logls.append(candidate.score(X_te, len_te))
            avg_logl = np.average(fold_logls)
            if avg_logl > top_score:
                top_score = avg_logl
                top_n = n
                top_model = candidate
        except:
            pass
    return top_model
def select(self):
    """CV selection: best mean held-out log-likelihood across folds."""
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float("-inf")
    best_model = None
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        # Fix: per-candidate score list. The original appended to a single
        # list across all candidates, so every mean was contaminated by the
        # scores of previously evaluated state counts.
        scores_array = []
        try:
            split_method = KFold(n_splits=min(3, len(self.lengths)))
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                sequences_train, lengths_train = combine_sequences(
                    cv_train_idx, self.sequences)
                sequences_test, lengths_test = combine_sequences(
                    cv_test_idx, self.sequences)
                model = GaussianHMM(n_components=num_states, n_iter=1000,
                                    random_state=self.random_state,
                                    verbose=False).fit(sequences_train, lengths_train)
                scores_array.append(model.score(sequences_test, lengths_test))
            mean_score = np.mean(scores_array)
            if mean_score > best_score:
                best_score = mean_score
                best_model = model
        except Exception:
            pass
    return best_model
def select(self):
    """Select the best state count by K-fold cross-validation.

    Refits a base model per training fold, averages the test-fold
    log-likelihoods, and keeps the best-averaging model.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    max_score = float("-inf")
    best_model = None
    # Fix: KFold requires n_splits >= 2; with a single sequence the
    # original crashed uncaught at construction. Fall back instead.
    if len(self.sequences) < 2:
        return self.base_model(self.n_constant)
    split_method = KFold(min(len(self.sequences), 3))  # api default = 3
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            model = self.base_model(n)
            logLs = []
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                try:
                    train_X, train_lengths = combine_sequences(
                        cv_train_idx, self.sequences)
                    test_X, test_lengths = combine_sequences(
                        cv_test_idx, self.sequences)
                    model.fit(train_X, train_lengths)
                    logLs.append(model.score(test_X, test_lengths))
                except Exception:
                    pass
            if not logLs:
                # Fix: np.mean([]) is NaN; a candidate with no scored folds
                # must simply be skipped.
                continue
            mean_score = np.mean(logLs)
            if mean_score > max_score:
                max_score = mean_score
                best_model = model
        except Exception:
            pass
    return best_model
def select(self):
    """CV selection; degenerates to a direct score for one sequence.

    Multi-sequence words are K-folded and scored with base_model_CV on
    each training fold; the best average picks the state count, which is
    finally refit on the full data via base_model().
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    single = len(self.sequences) == 1
    splitter = [] if single else KFold(n_splits=min(3, len(self.sequences)))
    best_logP = float("-inf")
    best_n = self.n_constant
    for n in range(self.min_n_components, self.max_n_components + 1):
        fold_logPs = []
        try:
            if single:
                # No folds possible: score the base model on the full data.
                candidate = self.base_model(n)
                logP = candidate.score(self.X, self.lengths)
                if logP > best_logP:
                    best_logP = logP
                    best_n = n
            else:
                for idx_tr, idx_te in splitter.split(self.sequences):
                    X_tr, len_tr = combine_sequences(idx_tr, self.sequences)
                    X_te, len_te = combine_sequences(idx_te, self.sequences)
                    candidate = self.base_model_CV(n, X_tr, len_tr)
                    fold_logPs.append(candidate.score(X_te, len_te))
                if np.average(fold_logPs) > best_logP:
                    best_logP = np.average(fold_logPs)
                    best_n = n
        except:
            pass
    return self.base_model(best_n)
def select(self):
    """Select the state count with the best CV average log-likelihood.

    Fits on the training folds and scores the test folds; a single-split
    word is fit and scored directly on the full data.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float('-inf')
    best_model = None
    for n in range(self.min_n_components, self.max_n_components + 1):
        sum_of_logL = 0.0
        number_of_iterations = 0
        number_of_splits = min(3, len(self.sequences))
        hmm_model = None
        if number_of_splits > 1:
            kfold = KFold(n_splits=number_of_splits)
            for cv_training_set, cv_testing_set in kfold.split(self.sequences):
                training, length_of_training = combine_sequences(
                    cv_training_set, self.sequences)
                testing, length_of_testing = combine_sequences(
                    cv_testing_set, self.sequences)
                try:
                    # Fix: fit on the TRAINING fold. The original fit via
                    # base_model on ALL data (leaking the test fold) and
                    # computed the training split without using it.
                    hmm_model = GaussianHMM(n_components=n, covariance_type="diag",
                                            n_iter=1000, random_state=self.random_state,
                                            verbose=False).fit(training, length_of_training)
                    sum_of_logL += hmm_model.score(testing, length_of_testing)
                    number_of_iterations += 1
                except Exception:
                    # Fix: skip failed folds. The original counted them as
                    # score 0, which FLATTERS a candidate because real
                    # log-likelihoods are negative.
                    continue
            if number_of_iterations == 0:
                continue
            score = sum_of_logL / number_of_iterations
        else:
            try:
                hmm_model = self.base_model(n)
                score = hmm_model.score(self.X, self.lengths)
            except Exception:
                continue
        if score > best_score:
            best_score = score
            best_model = hmm_model
    return best_model
def select(self):
    """CV selection; returns a model refit on all data with the best n."""
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_logL = float('-inf')
    best_num_components = None
    for num_states in range(self.min_n_components, self.max_n_components + 1):
        logL_arr = []
        try:
            split_method = KFold(n_splits=min(len(self.lengths), 3))
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                train_X, train_Xlengths = combine_sequences(cv_train_idx, self.sequences)
                test_X, test_Xlengths = combine_sequences(cv_test_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=num_states, covariance_type="diag",
                                        n_iter=1000, random_state=self.random_state,
                                        verbose=False).fit(train_X, train_Xlengths)
                    logL_arr.append(model.score(test_X, test_Xlengths))
                except Exception:
                    pass
            # A candidate with no scored folds keeps -inf and cannot win.
            mean_logL = np.mean(logL_arr) if logL_arr else float('-inf')
            if mean_logL > best_logL:
                best_logL = mean_logL
                best_num_components = num_states
        except Exception:
            pass
    # Fix: when nothing scored, the original passed None into base_model,
    # which cannot build a None-state HMM; fall back to n_constant instead.
    if best_num_components is None:
        best_num_components = self.n_constant
    return self.base_model(best_num_components)
def select(self):
    """Cross-validate each candidate state count and refit the winner.

    Splits the sequences into up to 3 folds, fits each candidate on the
    training folds, averages its test-fold log-likelihoods, and returns
    a model trained on ALL the data with the best-averaging component
    count. Candidates that fail anywhere are skipped.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    winning_avg = float('-inf')
    winning_n = self.min_n_components
    for n in range(self.min_n_components, self.max_n_components + 1):
        try:
            folds = KFold(n_splits=min(len(self.lengths), 3))
            logls = []
            for idx_train, idx_test in folds.split(self.sequences):
                X_te, len_te = combine_sequences(idx_test, self.sequences)
                X_tr, len_tr = combine_sequences(idx_train, self.sequences)
                fitted = GaussianHMM(n_components=n,
                                     n_iter=1000).fit(X_tr, len_tr)
                logls.append(fitted.score(X_te, len_te))
            avg = sum(logls) / len(logls)
            if avg > winning_avg:
                winning_avg = avg
                winning_n = n
        except:
            continue
    return self.base_model(winning_n)
def select(self):
    """Select the best model by 3-fold CV on the held-out log-likelihood.

    Words with fewer than 3 sequences fall back to the constant model.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_score = float("-inf")  # KEEP HIGHEST
    best_model = None
    n_splits = 3
    # Check data amount once, up front (the original `break`ed out of the
    # candidate loop on the first iteration anyway).
    if len(self.sequences) >= n_splits:
        # Fix: modern scikit-learn raises ValueError when random_state is
        # set with shuffle=False; shuffle explicitly so the seed applies.
        split_method = KFold(n_splits=n_splits, shuffle=True,
                             random_state=self.random_state)
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            scores = []
            model = None
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                x_train, lengths_train = combine_sequences(cv_train_idx, self.sequences)
                x_test, lengths_test = combine_sequences(cv_test_idx, self.sequences)
                try:
                    model = GaussianHMM(n_components=num_states,
                                        n_iter=1000).fit(x_train, lengths_train)
                    scores.append(model.score(x_test, lengths_test))
                except Exception:
                    break  # abandon the remaining folds for this candidate
            avg = np.average(scores) if scores else float("-inf")
            if avg > best_score:
                best_score, best_model = avg, model
    if best_model is None:
        return self.base_model(self.n_constant)
    return best_model
def select(self):
    """CV selection via base_model, restoring instance state afterwards.

    base_model() trains on self.X/self.lengths, so the training fold is
    injected there per split; the full data is restored before the final
    refit so the returned model trains on ALL the data.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    all_n_components = []
    split_method = KFold()
    all_scores = []  # store each candidate's CV value
    full_X, full_lengths = self.X, self.lengths  # keep the full data safe
    try:
        for n_components in range(self.min_n_components, self.max_n_components + 1):
            try:
                if len(self.sequences) > 2:  # enough data to split
                    scores = []
                    for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                        # Inject the training fold for base_model to fit on.
                        self.X, self.lengths = combine_sequences(
                            cv_train_idx, self.sequences)
                        X_test, lengths_test = combine_sequences(
                            cv_test_idx, self.sequences)
                        model = self.base_model(n_components)
                        scores.append(model.score(X_test, lengths_test))
                    all_scores.append(np.mean(scores))
                else:
                    self.X, self.lengths = full_X, full_lengths
                    model = self.base_model(n_components)
                    all_scores.append(model.score(self.X, self.lengths))
                all_n_components.append(n_components)
            except Exception:  # eliminate non-viable models from consideration
                pass
    finally:
        # Fix: the original never restored self.X/self.lengths, so the
        # final base_model() call below trained on the LAST TRAINING FOLD
        # instead of the full data — and the instance stayed corrupted.
        self.X, self.lengths = full_X, full_lengths
    best_num_components = all_n_components[int(np.argmax(
        all_scores))] if all_scores else self.n_constant
    return self.base_model(best_num_components)
def select(self):
    """Select the best model by CV average log-likelihood.

    base_model() trains on self.X/self.lengths, so the training fold is
    injected there per split and the full data restored afterwards.
    Returns the best fitted model, or None if nothing succeeded.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_cv = float('-Inf')
    best_model = None
    full_X, full_lengths = self.X, self.lengths  # preserve instance state
    try:
        for p in range(self.min_n_components, self.max_n_components + 1):
            splits = min(3, len(self.sequences))
            if splits < 2:
                continue
            split_method = KFold(splits)
            accum_score = 0
            scored_folds = 0
            model = None
            for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                # make the model automatically fit to training
                self.X, self.lengths = combine_sequences(cv_train_idx, self.sequences)
                test_x, test_length = combine_sequences(cv_test_idx, self.sequences)
                try:
                    model = self.base_model(num_states=p)
                    accum_score += model.score(test_x, test_length)
                    scored_folds += 1
                except Exception:
                    continue
            if scored_folds == 0:
                continue
            # Fix: average once per candidate AFTER all folds. The original
            # compared partial running averages inside the fold loop and
            # divided by the total split count even when folds failed.
            cv = accum_score / float(scored_folds)
            if cv > best_cv:
                best_cv = cv
                best_model = model
    finally:
        # Fix: restore the full data (the original left self.X/self.lengths
        # pointing at the last training fold).
        self.X, self.lengths = full_X, full_lengths
    # Fix: the original had NO return statement and always yielded None.
    return best_model
def select(self):
    """CV selection via base_model (trains on self.X, folds injected).

    Restores the instance's full data afterwards so the constant-n
    fallback (and later callers) trains on all sequences.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    if len(self.lengths) < 3:
        split_method = KFold(2)
    else:
        split_method = KFold()
    best_model = None
    highest_score = float('-inf')
    full_X, full_lengths = self.X, self.lengths  # preserve instance data
    try:
        for n_component in range(self.min_n_components, self.max_n_components + 1):
            try:
                test_scores = []
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    # Inject the training fold for base_model to fit on.
                    self.X, self.lengths = combine_sequences(
                        cv_train_idx, self.sequences)
                    model = self.base_model(n_component)
                    test_X, test_lengths = combine_sequences(
                        cv_test_idx, self.sequences)
                    test_scores.append(model.score(test_X, test_lengths))
                score = np.mean(test_scores)
                if score > highest_score:
                    best_model = model
                    highest_score = score
            except Exception:
                continue
    finally:
        # Fix: the original never restored self.X/self.lengths, so the
        # n_constant fallback below fit on the last training fold and the
        # instance stayed corrupted for any later use.
        self.X, self.lengths = full_X, full_lengths
    if best_model:
        return best_model
    else:
        return self.base_model(self.n_constant)
def select(self):
    """Select the state count with the best mean CV log-likelihood.

    base_model() trains on self.X/self.lengths, so the training fold is
    injected there per split; the full data is restored before the final
    refit. Falls back to ``n_constant`` when no candidate scores.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # Fix: start at -inf. Log-likelihoods are normally negative, so the
    # original 0.0 threshold meant no candidate could EVER be selected.
    maxLogL = float("-inf")
    split_method = KFold(n_splits=3, shuffle=False, random_state=None)
    best_num_components = self.n_constant
    full_X, full_lengths = self.X, self.lengths  # preserve instance data
    try:
        for n_comp in range(self.min_n_components, self.max_n_components + 1):
            try:
                totalLogL = 0.0
                folds = 0
                for cv_train_idx, cv_test_idx in split_method.split(self.sequences):
                    Xn, LengthsN = combine_sequences(cv_test_idx, self.sequences)
                    # Inject the training fold for base_model to fit on.
                    self.X, self.lengths = combine_sequences(
                        cv_train_idx, self.sequences)
                    tr_model = self.base_model(n_comp)
                    totalLogL += tr_model.score(Xn, LengthsN)
                    folds += 1
                # Fix: the mean held-out score is total/folds. The original
                # computed logL/totalLogL — a meaningless ratio — and then
                # compared the RAW last-fold logL against the record anyway.
                avgLogL = totalLogL / folds
                if avgLogL > maxLogL:
                    best_num_components = n_comp
                    maxLogL = avgLogL
            except Exception:
                pass
    finally:
        # Fix: restore the full data so the final model trains on all of it
        # (the original left the last training fold in self.X).
        self.X, self.lengths = full_X, full_lengths
    selected = self.base_model(best_num_components)
    return selected
def select(self):
    """3-fold CV selection; refit the winning state count on all data.

    Words with too few sequences fall back to the constant model.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    scores = []
    n_splits = 3
    # Not enough sequences to fold: fall back immediately.
    if len(self.sequences) < n_splits:
        return self.base_model(self.n_constant)
    # Fix: modern scikit-learn raises ValueError when random_state is set
    # with shuffle=False; shuffle explicitly so the seed is meaningful.
    split_method = KFold(n_splits=n_splits, shuffle=True,
                         random_state=self.random_state)
    for n_components in range(self.min_n_components, self.max_n_components + 1):
        try:
            cv_scores = []
            for train_index, test_index in split_method.split(self.sequences):
                X_train, lengths_train = combine_sequences(train_index, self.sequences)
                X_test, lengths_test = combine_sequences(test_index, self.sequences)
                hmm_model = GaussianHMM(n_components=n_components,
                                        covariance_type="diag", n_iter=1000,
                                        random_state=self.random_state,
                                        verbose=False).fit(X_train, lengths_train)
                cv_scores.append(hmm_model.score(X_test, lengths_test))
            scores.append((np.mean(cv_scores), n_components))
        except Exception:
            pass
    if not scores:
        # Fix: the original seeded the list with (−inf, 0) and could hand
        # base_model 0 components, which cannot be fit; use the constant.
        return self.base_model(self.n_constant)
    _, best_num_components = max(scores)
    return self.base_model(best_num_components)
def select(self):
    """K-fold CV; the candidate with the best average score wins.

    Each candidate starts from a base model, is refit per training fold,
    and is ranked by its mean test-fold log-likelihood.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_avg = -100000000
    champion = None
    fold_count = min(3, len(self.sequences))
    for n in range(self.min_n_components, self.max_n_components + 1):
        scored = 0
        accumulated = 0
        try:
            splitter = KFold(fold_count)
            candidate = self.base_model(n)
            for idx_tr, idx_te in splitter.split(self.sequences):
                X_tr, len_tr = combine_sequences(idx_tr, self.sequences)
                X_te, len_te = combine_sequences(idx_te, self.sequences)
                candidate = candidate.fit(X_tr, len_tr)
                accumulated += candidate.score(X_te, len_te)
                scored += 1
            avg = accumulated / scored
            if avg > best_avg:
                best_avg = avg
                champion = candidate
        except:
            pass
    return champion
def select(self):
    """Select the best state count by K-fold CV over the sequences."""
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_model = None
    max_score = float('-inf')
    # Fix: fold the LIST of sequences; the original folded the frames of
    # the first sequence (self.sequences[0]), which is meaningless here.
    n_splits = min(3, len(self.sequences))
    if n_splits < 2:
        return self.base_model(self.n_constant)
    split_method = KFold(n_splits)
    # Fix: include max_n_components (the original range excluded it).
    for number in range(self.min_n_components, self.max_n_components + 1):
        set_score = []
        model = None
        for cv_train, cv_test in split_method.split(self.sequences):
            try:
                X_train, lengths_train = combine_sequences(cv_train, self.sequences)
                # Fix: evaluate on the TEST indices; the original rebuilt
                # the training fold and scored it as if it were held out.
                X_test, lengths_test = combine_sequences(cv_test, self.sequences)
                model = self.base_model(number)
                model.fit(X_train, lengths_train)
                set_score.append(model.score(X_test, lengths_test))
            except Exception:
                pass
        if set_score:
            mean_logL = np.mean(set_score)
            # Fix: update max_score. The original never updated it, so the
            # comparison was always true and the LAST candidate always won.
            if mean_logL > max_score:
                max_score = mean_logL
                best_model = model
    return best_model
def select(self):
    """2-fold CV selection with a constant-n fallback.

    Refits a base model on each training fold, averages the test-fold
    scores, and keeps the best candidate.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    # Fix: best-so-far must start at -inf. The original started at +inf
    # with a `>` comparison, so no candidate could ever be selected and
    # the constant model was always returned.
    best_score = float("-inf")
    best_model = self.base_model(self.n_constant)
    for n in range(self.min_n_components, self.max_n_components + 1):
        scores = []
        n_splits = 2
        if len(self.sequences) < n_splits:
            break
        # Fix: modern scikit-learn rejects random_state with shuffle=False;
        # shuffle explicitly so the seed applies.
        split_method = KFold(n_splits=n_splits, shuffle=True,
                             random_state=self.random_state)
        try:
            for train_idx, test_idx in split_method.split(self.sequences):
                X_train, len_train = combine_sequences(train_idx, self.sequences)
                X_test, len_test = combine_sequences(test_idx, self.sequences)
                # Fix: the original called `self_base_model` (a typo raising
                # NameError on every fold) and never used the training fold.
                model = self.base_model(n)
                model.fit(X_train, len_train)
                scores.append(model.score(X_test, len_test))
            score = np.mean(scores)
            if score > best_score:
                best_score = score
                best_model = model
        except Exception:
            continue
    return best_model
def select(self):
    """Select the best model by CV mean held-out log-likelihood.

    base_model() trains on self.X/self.lengths, so the training fold is
    injected there per split and the full data restored afterwards.
    Falls back to the constant model when no candidate succeeds.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    best_cv = float('-inf')
    best_model = None
    split_method = KFold()
    full_X, full_lengths = self.X, self.lengths  # preserve instance data
    try:
        for n in range(self.min_n_components, self.max_n_components + 1):
            try:
                scores = []
                train_model = None
                # Fix: call KFold.split(); the original called the KFold
                # OBJECT (split_method(...)), a TypeError on every
                # iteration, so every candidate silently failed.
                for train_index, test_index in split_method.split(self.sequences):
                    self.X, self.lengths = combine_sequences(
                        train_index, self.sequences)
                    train_model = self.base_model(n)
                    X_test, lengths_test = combine_sequences(
                        test_index, self.sequences)
                    scores.append(train_model.score(X_test, lengths_test))
                cv = np.mean(scores)
                if cv > best_cv:
                    best_cv = cv
                    # Fix: the original assigned an undefined name `model`
                    # here; keep the fitted candidate instead.
                    best_model = train_model
            except Exception:
                continue
    finally:
        # Fix: restore the full data so the fallback base_model below (and
        # any later use of the instance) trains on all sequences.
        self.X, self.lengths = full_X, full_lengths
    return best_model if best_model else self.base_model(self.n_constant)