def find_best_feature(selected_features, X_train, C_param, y_train, feature):
    # Decide on which features we are using (selected_features + feature)
    features_in_use = np.append(selected_features, feature).astype(int)  # astype(int): np.append on an empty array yields floats, which cannot index
    X_train_filt = X_train[:, features_in_use]
    X_train_filt_ranked = bin_rank(X_train_filt)
    # Fit a Categorical Naive Bayes model
    lr = CategoricalNB(alpha=1.0, fit_prior=True, class_prior=None,
                       min_categories=5).fit(X_train_filt_ranked, y_train)
    # lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
    # Get the accuracy on the training set
    accu_train = lr.score(X_train_filt_ranked, y_train)
    # Tuple format = (feature number, training accuracy found)
    return (feature, accu_train)
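# A minimal sketch of calling find_best_feature directly for one candidate
# (bin_rank, X_train and y_train are assumed from the surrounding pipeline;
# feature index 3 is arbitrary). It returns a (feature, training accuracy) tuple.
feat, acc = find_best_feature(np.array([], dtype=int), X_train, 1.0, y_train, 3)
print(f"feature {feat}: training accuracy {acc:.4f}")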
def selected_feature_check(data, X_train, y_train, selected_features,
                           selected_features_by_name, C_param, best_feature):
    # Save the previously selected features so we can check later whether we made a change
    prev_selected_features = selected_features.copy()
    prev_selected_features_by_name = selected_features_by_name.copy()
    # Add the best feature to the list
    selected_features = np.append(selected_features, best_feature).astype(int)
    selected_features_by_name.append(data.columns[best_feature])
    feature_removal_score = {}
    # Iterate through the list, removing one feature at a time
    for feature in selected_features:
        temp_features = np.setdiff1d(selected_features, np.array([feature]))
        X_train_filt = X_train[:, temp_features]
        X_train_filt_ranked = bin_rank(X_train_filt)
        # Fit a Categorical Naive Bayes model
        lr = CategoricalNB(alpha=1.0, fit_prior=True, class_prior=None,
                           min_categories=5).fit(X_train_filt_ranked, y_train)
        # lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
        # Get the accuracy on the training set
        accu_train = lr.score(X_train_filt_ranked, y_train)
        feature_removal_score[feature] = accu_train
    # Find the feature whose removal yields the highest accuracy and drop it
    max_key = max(feature_removal_score, key=lambda k: feature_removal_score[k])
    selected_features = np.setdiff1d(selected_features, np.array([max_key]))
    max_key_name = data.columns[max_key]
    selected_features_by_name = list(
        set(selected_features_by_name) - set([max_key_name]))
    # Check whether we made any change; if not, tell the caller this function is no longer needed
    if np.array_equal(selected_features, prev_selected_features):
        print("-----------------------------------------------------------------------------------------------")
        print("We have found the unchanged set, so from now on we are just adding to our selected features.\n")
        print("-----------------------------------------------------------------------------------------------")
        return max_key, selected_features, selected_features_by_name, True
    else:
        return max_key, selected_features, selected_features_by_name, False
NB.fit(X_train_new, y_train)
prediction['Naive Bayes'] = NB.predict(X_test_new)

# Accuracy, precision, recall, confusion matrix
print("Accuracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classification report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))

# Scoring with train data
print('train score:', NB.score(X_train_new, y_train))
# Scoring with test data
print('test score:', NB.score(X_test_new, y_test))

# Class probabilities for the test set
NB.predict_proba(X_test_new)

"""# Random Forest"""

# Reassemble the training dataset:
# for numerical features, use the ones selected in Logistic Regression;
# for categorical features, use the datasets after applying label encoding.

# Training dataset
x = X_sm_num.drop(columns=['hour', 'N1', 'N2', 'N5', 'N6', 'N7'])
train1 = x.join(X_sm_c1).join(X_sm['newlabel'])
train1
#[1. 1. 1. 1. 1.]

mlp1.fit(x_train, y_train)
mlpscores = cross_val_score(mlp1, x_train, y_train, cv=5)
print("MLPClassifier Cross Validation Attempt 1: " + str(mlpscores))
#[0.98242531 0.99032542 0.98504837 0.9876869  0.98416887]

mlp2 = MLPClassifier(max_iter=8)
mlp2.fit(x_train, y_train)
mlpscores = cross_val_score(mlp2, x_train, y_train, cv=5)
print("MLPClassifier Cross Validation Attempt 2: " + str(mlpscores))
# Increasing max_iter to 8 improves the cross-validation scores over the previous attempt.
#[0.99912127 0.99824099 0.99912049 0.99912049 0.99736148]

cnb1.fit(x_train, y_train)
cnb1.score(x_test, y_test)
cnbscores = cross_val_score(cnb1, x_train, y_train, cv=5)
print("CategoricalNB Cross Validation Attempt 1: " + str(cnbscores))
#[0.94112478 0.94547054 0.93755497 0.93491645 0.92524186]

cnb2 = CategoricalNB(alpha=0.2)
cnb2.fit(x_train, y_train)
cnbscores = cross_val_score(cnb2, x_train, y_train, cv=5)
print("CategoricalNB Cross Validation Attempt 2: " + str(cnbscores))
# Lowering alpha to 0.2 increases the cross-validation scores.
#[0.95342707 0.96042216 0.95602463 0.95074758 0.93667546]

rfc1.fit(x_train, y_train)
rfcscores = cross_val_score(rfc1, x_train, y_train, cv=5)
print("RandomForestClassifier Cross Validation Attempt 1: " + str(rfcscores))
#[0.99472759 0.99560246 0.99472296 1.         0.99472296]
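# The two manual "attempts" above tune alpha by hand; a minimal sketch of the
# same search automated with GridSearchCV (assuming the same x_train/y_train;
# the alpha grid below is an arbitrary choice, not from the original notebook).
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB

param_grid = {'alpha': [0.1, 0.2, 0.5, 1.0]}
grid = GridSearchCV(CategoricalNB(), param_grid, cv=5)
grid.fit(x_train, y_train)
print("Best alpha:", grid.best_params_['alpha'])
print("Best CV score:", grid.best_score_)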
def get_solid_20(data, X_train, y_train, X_val, y_val, thread_Pool,
                 selected_features=[], selected_features_by_name=[], C_param=1):
    # Get all the features, remembering to exclude the isLumA column
    features = [i for i in range(data.shape[1] - 1)]
    # Remove from the available features what is already in the selected features
    features = np.setdiff1d(features, selected_features)
    highest_accu = 0
    best_feature = None
    selected_features_dict = []
    # Boolean flag indicating whether we should keep trying to swap out the 20 original features
    isFinishedSwapping = False
    isExit = False
    i = 0
    # Iterate for the number of rounds we want to perform
    # (note: the number of rounds != the number of features we end up with)
    while True:
        # Keep track of time to monitor how long each iteration takes
        start = time.time()
        best_feature = None
        highest_accu = 0
        result = []
        # Create a pool of thread_Pool worker processes
        pool = multiprocessing.Pool(thread_Pool)
        # Use partial to bind the fixed arguments of the worker function
        func = partial(find_best_feature, selected_features, X_train, C_param, y_train)
        # Gather the results
        result = pool.map(func, features)
        pool.close()
        pool.join()
        # Iterate through the results and find the best feature
        for res in result:
            if res[1] > highest_accu:
                highest_accu = res[1]
                best_feature = res[0]
        # Add the best feature to the list
        if not isFinishedSwapping:
            # We still haven't converged on our starting selected features,
            # so run the check to update the selected set
            best_feature, selected_features, selected_features_by_name, isFinishedSwapping = selected_feature_check(
                data, X_train, y_train, selected_features,
                selected_features_by_name, C_param, best_feature)
            if isFinishedSwapping:
                isExit = True
        else:
            isExit = True
            # We have converged; now just add the best feature
            selected_features = np.append(selected_features, best_feature).astype(int)
            selected_features_by_name.append(data.columns[best_feature])
        # Train the model again with these selected features
        X_train_filt = X_train[:, selected_features]
        X_val_filt = X_val[:, selected_features]
        # Rank the data
        X_train_filt_ranked = bin_rank(X_train_filt)
        X_val_filt_ranked = bin_rank(X_val_filt)
        # Train the model
        lr = CategoricalNB(alpha=1.0, fit_prior=True, class_prior=None,
                           min_categories=5).fit(X_train_filt_ranked, y_train)
        # lr = LogisticRegression(C=C_param, random_state=0).fit(X_train_filt_ranked, y_train)
        # Measure validation accuracy with these features
        accu_val = lr.score(X_val_filt_ranked, y_val)
        # Remove the chosen feature from the list of candidates
        features = np.setdiff1d(features, np.array([best_feature]))
        # Measure time
        times = time.time() - start
        # Populate a dictionary of info, add it to our list, and continue
        selected_features_dict.append({
            "Feature": selected_features_by_name.copy(),
            "Iteration": i + 1,
            "Training accuracy": highest_accu,
            "Validation Accuracy": accu_val,
            "Time": times
        })
        print("Round:", i + 1, "complete")
        i = i + 1
        if isExit:
            break
    print("Finished: per-iteration times can be found in the returned results.")
    # Return the list of per-round results and the final training accuracy
    return (selected_features_dict, highest_accu)
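# A minimal usage sketch (hypothetical setup: `data` is the source DataFrame,
# bin_rank() is the ranking helper used above, and pandas is imported as pd;
# all are assumed to exist). The __main__ guard matters because get_solid_20
# spawns worker processes via multiprocessing.
if __name__ == '__main__':
    results, final_accu = get_solid_20(data, X_train, y_train, X_val, y_val,
                                       thread_Pool=4)
    print(pd.DataFrame(results)[["Iteration", "Training accuracy", "Validation Accuracy"]])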
def naive_bayes(x_train, y_train):
    # Fit a Categorical Naive Bayes model and return its training accuracy
    model = CategoricalNB()
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
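# A minimal sketch exercising naive_bayes() on toy ordinal-encoded data
# (the random integer features below are an illustration, not the project's data;
# CategoricalNB is assumed to be imported as above).
import numpy as np
rng = np.random.RandomState(0)
x_toy = rng.randint(0, 4, size=(100, 3))   # 3 categorical features with 4 levels each
y_toy = rng.randint(0, 2, size=100)        # binary labels
print("training accuracy:", naive_bayes(x_toy, y_toy))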
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import brier_score_loss

X, y = make_blobs(n_samples=[500, 500],
                  centers=[[0.0, 0.0], [2.0, 2.0]],
                  cluster_std=[0.5, 0.5],
                  random_state=0,
                  shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=420)

# Bin the data to convert it into categorical form
from sklearn.preprocessing import KBinsDiscretizer
kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
kbd.fit(X_train)
X_train_ = kbd.transform(X_train)
X_test_ = kbd.transform(X_test)

print('Modeling on the binned categorical values')
cnb = CategoricalNB()
cnb.fit(X_train_, y_train)
print('test accuracy: {}'.format(cnb.score(X_test_, y_test)))
print('test brier_score_loss: {}'.format(
    brier_score_loss(y_test, cnb.predict_proba(X_test_)[:, 1], pos_label=1)))
def scoring_comparison(base_path, datasets, verbose=1, test_size=0.3, seed=None, n_iterations=30):
    column_names = ["dataset",
                    "custom_training_score",
                    "custom_test_score",
                    "categorical_training_score",
                    "categorical_test_score"]
    data = []
    clf_no_encoding = NaiveBayes(encode_data=True)
    clf_categorical_sklearn = CategoricalNB()
    datasets_iter = tqdm(datasets, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
    c = CustomOrdinalFeatureEncoder()
    l = CustomLabelEncoder()
    for dataset in datasets_iter:
        dataset_name, label = dataset
        data_filename = f"{dataset_name}.data.csv"
        test_filename = f"{dataset_name}.test.csv"
        X, y = get_X_y_from_database(base_path=base_path,
                                     name=dataset_name,
                                     data=data_filename,
                                     test=test_filename,
                                     label=label)
        custom_train = []
        custom_test = []
        sklearn_train = []
        sklearn_test = []
        X = c.fit_transform(X)
        y = l.fit_transform(y)
        for iteration in range(n_iterations):
            if verbose:
                datasets_iter.set_postfix({"Dataset": dataset_name, "seed": iteration})
                datasets_iter.refresh()
            try:
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True,
                                                                    stratify=y)
            except ValueError:
                # Not enough values per class to stratify y
                X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                                    test_size=test_size,
                                                                    random_state=seed + iteration,
                                                                    shuffle=True)
            # Fit
            clf_no_encoding.fit(X_train, y_train)
            clf_categorical_sklearn.min_categories = [
                1 + np.max(np.concatenate([X_train[:, j], X_test[:, j]]))
                for j in range(X_train.shape[1])]
            clf_categorical_sklearn.fit(X_train, y_train)
            # Score
            custom_train.append(clf_no_encoding.score(X_train, y_train))
            custom_test.append(clf_no_encoding.score(X_test, y_test))
            sklearn_train.append(clf_categorical_sklearn.score(X_train, y_train))
            sklearn_test.append(clf_categorical_sklearn.score(X_test, y_test))
        data.append([dataset_name,
                     np.mean(custom_train),
                     np.mean(custom_test),
                     np.mean(sklearn_train),
                     np.mean(sklearn_test)])
    return pd.DataFrame(data, columns=column_names)
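# A minimal sketch of calling scoring_comparison (the dataset names, label
# columns, and ./data/ layout are hypothetical; get_X_y_from_database must be
# able to resolve them, and seed must be an int since it is added to the
# iteration counter).
results = scoring_comparison(base_path="./data/",
                             datasets=[("iris", "class"), ("car", "acceptability")],
                             seed=0, n_iterations=5)
print(results)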
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/grbruns/cst383/master/heart.csv')
df['output'] = df['output'] - 1

predictors = ['chestpain', 'exercise']
X = df[predictors].values
y = df['output'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# With 0/1 labels, the median of y_train is the majority class, so this is a
# majority-class ("blind") baseline; the mean would give the class proportion,
# not a usable class prediction.
blind_prediction = np.median(y_train)
print((y_test == blind_prediction).mean())

clf = CategoricalNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# predict_proba returns one column per class: P(class | x) for each test row
y_prob = clf.predict_proba(X_test)
clf.score(X_test, y_test)

df.describe()
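# A quick illustration (not in the original notebook) of how predict() relates
# to predict_proba(): the predicted class is the one with the highest
# per-class probability.
print(y_prob[:5])  # first few rows of class probabilities
print(np.array_equal(y_pred, clf.classes_[y_prob.argmax(axis=1)]))  # predict == argmax of proba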
X['Sex'] = X['Sex'].apply(lambda x: 1 if x == 'female' else 0)
X.head()
X.columns[X.isna().any()]
X.describe()
X.fillna(X.mean(), inplace=True)
X.describe()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = CategoricalNB()
model.fit(X_train, y_train)
model.score(X_test, y_test)
model.score(X_train, y_train)

data = pd.read_csv('spam.csv.xlsx')
data.head()

from sklearn.feature_extraction.text import CountVectorizer

data.describe()
data.groupby('Category').describe()
data['spam'] = data['Category'].apply(lambda x: 1 if x == 'spam' else 0)
data.head()

X_train, X_test, y_train, y_test = train_test_split(data.Message, data.spam)
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:2]
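# A sketch of the natural next step for the word-count features (not part of
# the original notebook): MultinomialNB suits counts better than CategoricalNB.
from sklearn.naive_bayes import MultinomialNB
spam_model = MultinomialNB()
spam_model.fit(X_train_count, y_train)
X_test_count = v.transform(X_test.values)  # reuse the fitted vectorizer on the test messages
print("spam test accuracy:", spam_model.score(X_test_count, y_test))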