def test_cnb():
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)
    assert_raises(ValueError, clf.fit, -X, Y)

    clf.fit(X, Y)

    # Check that counts are correct.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, feature_count)
    class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, class_count)
    feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, feature_all)

    # Check that weights are correct. See steps 4-6 in Table 4 of
    # Rennie et al. (2003). With the default norm=False, feature_log_prob_
    # is the negated log of the complement estimates.
    theta = np.array([
        [
            (0 + 1) / (3 + 6), (1 + 1) / (3 + 6), (1 + 1) / (3 + 6),
            (0 + 1) / (3 + 6), (0 + 1) / (3 + 6), (1 + 1) / (3 + 6)
        ],
        [
            (1 + 1) / (6 + 6), (3 + 1) / (6 + 6), (0 + 1) / (6 + 6),
            (1 + 1) / (6 + 6), (1 + 1) / (6 + 6), (0 + 1) / (6 + 6)
        ]])

    weights = np.zeros(theta.shape)
    for i in range(2):
        weights[i] = -np.log(theta[i])
    assert_array_almost_equal(clf.feature_log_prob_, weights)
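The hard-coded theta entries in the test can be reproduced from the per-class counts. A minimal sketch of that complement-count arithmetic (steps 4-6 in Table 4 of Rennie et al., 2003), assuming only the toy data above; it is an illustration, not a copy of scikit-learn's source:

# For each class, take the feature counts of all *other* classes, add the
# smoothing term alpha, and normalize per class.
import numpy as np

alpha = 1.0
feature_count = np.array([[1, 3, 0, 1, 1, 0],    # China
                          [0, 1, 1, 0, 0, 1]])   # Japan
feature_all = feature_count.sum(axis=0)          # counts pooled over all classes

comp_count = feature_all + alpha - feature_count            # complement counts + smoothing
theta = comp_count / comp_count.sum(axis=1, keepdims=True)  # rows match the theta array in the test
weights = -np.log(theta)                                    # feature_log_prob_ when norm=False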
test_docs = preprocess(test_docs) # create a vectorizer object tfidf_vectorizer = TfidfVectorizer( analyzer = "word", stop_words = stopwords.words('english'), max_df = 0.7, max_features = 10000) # create sparse matrix representation of documents vect_train_docs = tfidf_vectorizer.fit_transform(train_docs) vect_test_docs = tfidf_vectorizer.transform(test_docs) # classifier classifier = OneVsRestClassifier(ComplementNB()) classifier.fit(vect_train_docs, train_labels) # get predictions using trained classifier predictions = classifier.predict(vect_test_docs) # metrics precision = precision_score(test_labels, predictions, average='micro') recall = recall_score(test_labels, predictions, average='micro') f1 = f1_score(test_labels, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1)) precision = precision_score(test_labels, predictions, average='macro') recall = recall_score(test_labels, predictions, average='macro')
y_pred_DTC_gini = clf_gini.fit(X_train, y_train).predict(X_test) y_pred_DTC_entropy = clf_entropy.fit(X_train, y_train).predict(X_test) print("Accuracy of DTC (gini): ", metrics.accuracy_score(y_test, y_pred_DTC_gini) * 100) print("Accuracy of DTC (entropy):", metrics.accuracy_score(y_test, y_pred_DTC_entropy) * 100) #KNN knn = KNeighborsClassifier(n_neighbors=25) y_pred_knn = knn.fit(X_train, y_train).predict(X_test) print("Accuracy of KNN:", metrics.accuracy_score(y_test, y_pred_knn) * 100) #NAIVE BAYES gnb = GaussianNB() mnb = MultinomialNB() cnb = ComplementNB() y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test) y_pred_mnb = mnb.fit(X_train, y_train).predict(X_test) y_pred_cnb = cnb.fit(X_train, y_train).predict(X_test) print("Accuracy of GNB:", metrics.accuracy_score(y_test, y_pred_gnb) * 100) print("Accuracy of MNB:", metrics.accuracy_score(y_test, y_pred_mnb) * 100) print("Accuracy of CNB:", metrics.accuracy_score(y_test, y_pred_cnb) * 100) #Logistic Regression lr = LogisticRegression(random_state=0) y_pred_lc = lr.fit(X_train, y_train).predict(X_test) print("Accuracy of Logistic Regression:", metrics.accuracy_score(y_test, y_pred_lc) * 100)
print('=' * 80) print("Elastic-Net penalty") results.append( benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"))) # Train NearestCentroid without threshold print('=' * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) # Train sparse Naive Bayes classifiers print('=' * 80) print("Naive Bayes") results.append(benchmark(MultinomialNB(alpha=.01))) results.append(benchmark(BernoulliNB(alpha=.01))) results.append(benchmark(ComplementNB(alpha=.1))) print('=' * 80) print("LinearSVC with L1-based feature selection") # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. results.append( benchmark( Pipeline([ ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))), ('classification', LinearSVC(penalty="l2")) ]))) # make some plots
###################### mnb ############################################--Code from ASTD classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None) classifier.fit(X_train, Y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(Y_test, y_pred) total_correct_predictions = cm[0, 0] + cm[1, 1] + cm[2, 2] total_predictions_made = np.sum(cm) accuracy = total_correct_predictions / total_predictions_made * 100 ##################### CNB ###################### from sklearn.naive_bayes import ComplementNB classifier = ComplementNB() #ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False) classifier.fit(X_train, Y_train) y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(Y_test, y_pred) total_correct_predictions = cm[0, 0] + cm[1, 1] + cm[2, 2] total_predictions_made = np.sum(cm) accuracy = total_correct_predictions / total_predictions_made * 100 #print(clf.predict(X[2:3])) ################# sgd ###############################################--Code from ASTD classifier = SGDClassifier(loss="hinge", penalty="l2") classifier.fit(X_train, Y_train) # Predicting the Test set results
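The hand-rolled accuracy above sums the first three diagonal cells of the confusion matrix, which only works verbatim for exactly three classes. An equivalent form that is independent of the number of classes (a small illustrative addition reusing the snippet's Y_test / y_pred):

from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

cm = confusion_matrix(Y_test, y_pred)
accuracy = np.trace(cm) / np.sum(cm) * 100    # sum of the diagonal over all predictions
assert np.isclose(accuracy, accuracy_score(Y_test, y_pred) * 100)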
from sklearn.naive_bayes import ( BernoulliNB, ComplementNB, MultinomialNB, ) from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.linear_model import LogisticRegression from sklearn.neural_network import MLPClassifier from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis classifiers = { "BernoulliNB": BernoulliNB(), "ComplementNB": ComplementNB(), "MultinomialNB": MultinomialNB(), "KNeighborsClassifier": KNeighborsClassifier(), "DecisionTreeClassifier": DecisionTreeClassifier(), "RandomForestClassifier": RandomForestClassifier(), "LogisticRegression": LogisticRegression(), "MLPClassifier": MLPClassifier(max_iter=1000), "AdaBoostClassifier": AdaBoostClassifier(), } def is_positive(tweet: str) -> bool: """True if tweet has positive compound sentiment, False otherwise""" sia = SentimentIntensityAnalyzer() return sia.polarity_scores(tweet)['compound'] > 0
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB, ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.metrics import brier_score_loss, roc_auc_score, recall_score
from time import time
import datetime

# Heavily imbalanced two-blob dataset: 50000 vs. 500 samples.
X, y = make_blobs(n_samples=[50000, 500],
                  centers=[[0.0, 0.0], [5.0, 5.0]],
                  cluster_std=[3, 1],
                  random_state=0,
                  shuffle=False)

names = ["Multinomial", "Gaussian", "Bernoulli", "Complement"]
models = [MultinomialNB(), GaussianNB(), BernoulliNB(), ComplementNB()]

for name, clf in zip(names, models):
    times = time()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
    # Preprocessing: discretize into one-hot bins so the count-based models
    # (Multinomial/Bernoulli/Complement) receive nonnegative inputs.
    if name != "Gaussian":
        kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
        Xtrain = kbs.transform(Xtrain)
        Xtest = kbs.transform(Xtest)
    clf.fit(Xtrain, Ytrain)
    y_pred = clf.predict(Xtest)
    proba = clf.predict_proba(Xtest)[:, 1]
    score = clf.score(Xtest, Ytest)
    print(name)
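The KBinsDiscretizer branch exists because the multinomial, Bernoulli and complement models require nonnegative inputs. A sketch of a lighter-weight alternative using the MinMaxScaler that is already imported above (an illustration, not a claim about the original author's preference):

Xtr, Xte, Ytr, Yte = train_test_split(X, y, test_size=0.3, random_state=420)
mms = MinMaxScaler().fit(Xtr)                 # maps every feature into [0, 1]
cnb = ComplementNB().fit(mms.transform(Xtr), Ytr)
print("Complement, min-max scaled:", cnb.score(mms.transform(Xte), Yte))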
from sklearn import tree import numpy import pandas import matplotlib.pyplot as plt import graphviz from sklearn.model_selection import cross_val_score from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB data = pandas.read_csv('iris.csv', header=None) Y = numpy.asarray(data[data.columns[-1]]) X = numpy.asarray(data[data.columns[0:-1]]) clf = tree.DecisionTreeClassifier(max_depth=4) GNB = GaussianNB() MNB = MultinomialNB() CNB = ComplementNB() print('clf') scores = cross_val_score(clf, X, Y, cv=5) print(scores) clf.fit(X, Y) print(clf.score(X, Y)) print('GNB') scores = cross_val_score(GNB, X, Y, cv=5) print(scores) GNB.fit(X, Y) print(GNB.score(X, Y)) print('MNB') scores = cross_val_score(MNB, X, Y, cv=5) print(scores)
def fit(): # TODO: test content type and send 400 if not JSON # Construct the model fit request data = request.get_json() params = data.get("model", {}) dataset = data.get("dataset", []) grid = data.get("grid", []) model = { 'gaussiannb': GaussianNB(), 'multinomialnb': MultinomialNB(), 'bernoullinb': BernoulliNB(), 'complementnb': ComplementNB(), 'svm': SVC(), 'logit': LogisticRegression(), }.get(params.pop("model", None), None) # Validate the request is correct and sane if model is None or len(dataset) == 0: return "invalid fit request: please specify model and data", 400 # Parse the JSON hyperparameters or leave as string for type detection for key in params.keys(): try: params[key] = json.loads(params[key]) except json.decoder.JSONDecodeError: continue # Set the hyperparameters on the model try: model.set_params(**params) except ValueError as e: return str(e), 400 # Construct the dataset X, y = [], [] for point in dataset: X.append([point["x"], point["y"]]) y.append(point["c"]) X, y = asarray(X), asarray(y) # Fit the model to the dataset and get the training score model.fit(X, y) yhat = model.predict(X) metrics = prfs(y, yhat, average="macro") # Make probability predictions on the grid to implement contours # The returned value is the class index + the probability # To get the selected class in JavaScript, use Math.floor(p) # Where p is the probability returned by the grid. Note that this # method guarantees that no P(c) == 1 to prevent class misidentification Xp = asarray([ [point["x"], point["y"]] for point in grid ]) preds = [] for proba in model.predict_proba(Xp): c = np.argmax(proba) preds.append(float(c+proba[c])-0.000001) return jsonify({ "metrics": dict(zip(["precision", "recall", "f1", "support"], metrics)), "grid": preds, })
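The comment above packs each grid prediction into a single float: the class index in the integer part and (slightly less than) the class probability in the fractional part. A small sketch of the decoding side, shown in Python for illustration; the original client does the same with Math.floor in JavaScript:

import math

def decode(p):
    c = math.floor(p)    # predicted class index
    proba = p - c        # P(class c), kept just below 1.0 by the encoder's -0.000001 offset
    return c, proba

decode(1.729999)         # -> (1, ~0.73)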
gaussian_predictions = model.predict(X2_test)
print("Gaussian results:")
print(confusion_matrix(y_test, gaussian_predictions))
print("\n")

# Performs much worse than Multinomial NB - leave this here.
model = MultinomialNB()
model.fit(X1_train, y_train)
nb_prediction = model.predict(X1_test)
print("Naive Bayes TFIDF results:")
print(confusion_matrix(y_test, nb_prediction))
print("\n")

# Using TFIDF reduces accuracy - as expected. Continue with normal data.
model = ComplementNB()
model.fit(X2_train, y_train)
nb_prediction = model.predict(X2_test)
print("Complement NB results:")
print(confusion_matrix(y_test, nb_prediction))
print("\n")

# Complement NB works as well as multinomial and the distribution seems more equal.
# This makes sense as its design is for imbalanced datasets.
model = svm.SVC(decision_function_shape='ovo')
model.fit(X2_train, y_train)
nb_prediction = model.predict(X2_test)
print("SVM results:")
print(confusion_matrix(y_test, nb_prediction))
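The comment above attributes ComplementNB's more balanced confusion matrix to its design for imbalanced data: it estimates each class's weights from the *other* classes' counts, which reduces the bias toward the majority class. A small self-contained illustration with synthetic word counts (invented data, not the author's):

import numpy as np
from sklearn.naive_bayes import MultinomialNB, ComplementNB

rng = np.random.RandomState(0)
n_maj, n_min, n_feat = 950, 50, 20
X_maj = rng.poisson(2.0, size=(n_maj, n_feat))
X_min = rng.poisson(2.0, size=(n_min, n_feat))
X_min[:, :5] += rng.poisson(3.0, size=(n_min, 5))   # minority class uses a few "words" more often
X = np.vstack([X_maj, X_min])
y = np.array([0] * n_maj + [1] * n_min)

for clf in (MultinomialNB(), ComplementNB()):
    clf.fit(X, y)
    minority_recall = (clf.predict(X_min) == 1).mean()
    print(type(clf).__name__, "recall on the minority class:", minority_recall)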
# ******************* Decision Tree Classifier (Gini) ************************ model = tree.DecisionTreeClassifier() run_model(model, 'Decision Tree') # dot_data = tree.export_graphviz(model, out_file=None) # graph = graphviz.Source(dot_data) # graph.render("yelp_decision_tree") # ******************* Gaussian Naive Bayes ************************* run_model(GaussianNB(), 'Gaussian Naive Bayes') # ******************* Multinomial Naive Bayes *********************** run_model(MultinomialNB(), 'Multinomial Naive Bayes') # ******************* Complement Naive Bayes ************************ run_model(ComplementNB(), 'Complement Naive Bayes') # ******************* Bernoulli Naive Bayes ************************* run_model(BernoulliNB(), 'Bernoulli Naive Bayes') # ******************* KNN ************************ knn_range = range(3, 12) for k in knn_range: print('Iteration: ' + str(k)) knn_model = KNeighborsClassifier(n_neighbors=k) run_model(knn_model, ' KNN({} neighbors) '.format(k)) # ******************* SVM(linear) ************************ # svm_model = svm.SVC(kernel='linear') # run_model(svm_model, ' SVM(Linear) ') #
y_pred = model.fit(X_train, y_train).predict(X_test) print("accuracy dev GaussianNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] )) task = 'MultinomialNB' model = MultinomialNB() y_pred = model.fit(X_train, y_train).predict(X_test) print("accuracy dev MultinomialNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] )) task = 'BernoulliNB' model = BernoulliNB() y_pred = model.fit(X_train, y_train).predict(X_test) print("accuracy dev BernoulliNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] )) """ task = 'ComplementNB' model = ComplementNB() y_pred = model.fit(X_train, y_train).predict(X_test) print( "accuracy dev ComplementNB {0}".format(1 - (y_test != y_pred).sum() / y_test.shape[0])) elif (sys.argv[1] == 'D'): from sklearn import tree task = 'DecisionTree' model = tree.DecisionTreeClassifier(criterion='gini', max_depth=80, min_samples_split=2, min_samples_leaf=2, min_weight_fraction_leaf=0.0, random_state=None, max_leaf_nodes=200, class_weight=None)
def test_complement_nb(self): self.check_model(ComplementNB(), abs=True) model_name = 'complement-nb.json' self.check_model_json(ComplementNB(), model_name, abs=True)
from sklearn.neighbors import NearestCentroid from sklearn.neighbors import RadiusNeighborsClassifier from sklearn.semi_supervised import LabelPropagation from sklearn.semi_supervised import LabelSpreading from sklearn.discriminant_analysis import LinearDiscriminantAnalysis now = datetime.datetime.now() time_stamp = now.strftime("%Y_%b_%d_%H_%M") print('Training Stamp:' + time_stamp) mnb = MultinomialNB(alpha=0.01) bnb = BernoulliNB() gnb = GaussianNB() cnb = ComplementNB() svc = SGDClassifier(max_iter=1000, tol=1e-3, fit_intercept=True) lda = LinearDiscriminantAnalysis(solver='svd') knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1) nc = NearestCentroid() rnc = RadiusNeighborsClassifier(n_jobs=-1) lpg = LabelPropagation(n_jobs=-1) lps = LabelSpreading(n_jobs=-1) dct = DecisionTreeClassifier(class_weight='balanced', criterion='entropy', random_state=9)
name="NearestCentroid (aka Rocchio classifier)" ) # Train sparse Naive Bayes classifiers benchmark( MultinomialNB(alpha=.01), name="Naive Bayes MultinomialNB" ) benchmark( BernoulliNB(alpha=.01), name="Naive Bayes BernoulliNB" ) benchmark( ComplementNB(alpha=.1), name="Naive Bayes ComplementNB" ) # The smaller C, the stronger the regularization. # The more regularization, the more sparsity. benchmark( Pipeline([ ('feature_selection', SelectFromModel( LinearSVC( penalty="l1", dual=False, tol=1e-3 ) )),
def plot_ml_model(X, y, fold):
    pyplot.close('all')
    #print ("Enter")
    #algos = ["SVM-linear","SVM-Kernel","GaussianNB","BernoulliNB","ComplementNB","DTree-gini","DTree-entropy","RF-50","RF-100","RF-150", "KNN-2", "KNN-6"]
    algos = [
        "SVM-linear", "SVM-Kernel", "GaussianNB", "ComplementNB", "DTree-gini",
        "DTree-entropy", "RF-50", "RF-100", "KNN-2", "KNN-6"
    ]
    clfs = [
        SVC(kernel='linear'),
        SVC(kernel='rbf'),
        GaussianNB(),
        #BernoulliNB(),
        ComplementNB(),
        # criteria match the "DTree-gini" / "DTree-entropy" labels above
        DecisionTreeClassifier(criterion="gini", max_depth=24, min_samples_split=2),
        DecisionTreeClassifier(criterion="entropy", max_depth=24, min_samples_split=2),
        RandomForestClassifier(n_estimators=50),
        RandomForestClassifier(n_estimators=100),
        #RandomForestClassifier(n_estimators = 150),
        KNeighborsClassifier(n_neighbors=2),
        KNeighborsClassifier(n_neighbors=6)
    ]
    cv_results = []
    scoring = 'accuracy'
    #scoring = 'roc_auc'
    for classifier in clfs:
        cv_score = cross_val_score(classifier, X, y, cv=fold, scoring=scoring)
        cv_results.append(cv_score.mean())
    cv_mean = pd.DataFrame(cv_results, index=algos)
    cv_mean.columns = ["Accuracy"]
    print(cv_mean.sort_values(by="Accuracy", ascending=False))
    cv_mean.plot.bar(figsize=(10, 5))

    # scatter plot
    scores = cv_mean["Accuracy"]
    # create traces
    trace1 = go.Scatter(x=algos, y=scores, name='Algorithm Name',
                        marker=dict(color='rgba(0,255,0,0.5)',
                                    line=dict(color='rgb(0,0,0)', width=2)),
                        text=algos)
    data = [trace1]
    layout = go.Layout(barmode="group",
                       xaxis=dict(title='ML Algorithms', ticklen=5, zeroline=False),
                       yaxis=dict(title='Prediction Scores', ticklen=5, zeroline=False))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    pyplot.show()
def __init__(self, **hyperparams): self._hyperparams = hyperparams self._wrapped_model = Op(**self._hyperparams)
vocab = vectorizer.get_feature_names() print('trained and transformed w/ vectorizer') dump(vectorizer, 'vectorizer.joblib') # model training log_reg = LogisticRegression() log_reg.fit(train_data_features, y_train) lr_preds = log_reg.predict(test_data_features) # keep the knn, it's the best knn = KNeighborsClassifier() knn.fit(train_data_features, y_train) knn_preds = knn.predict(test_data_features) dump(knn, 'knn.joblib') cnb = ComplementNB() cnb.fit(train_data_features, y_train) cnb_preds = cnb.predict(test_data_features) # make df with all preds df = pd.DataFrame( list(zip(cnb_preds, lr_preds, knn_preds, y_test, x_test)), columns=['cnb_preds', 'lr_preds', 'knn_preds', 'category', 'document']) # save incorrect predictions in a df to look at lr_incorrect = df[df['lr_preds'] != df['category']].copy() knn_incorrect = df[df['knn_preds'] != df['category']].copy() cnb_incorrect = df[df['cnb_preds'] != df['category']].copy() # combine lr and knn incorrects two_incorrect = knn_incorrect[
"""###**Multinomial Naive Bayes**""" kfold = KFold(n_splits=10) MNB = MultinomialNB() results = cross_val_score(MNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy') print("Training Accuracy: %.3f" % (results.mean()*100.0)) MNB.fit(X_train_smote.todense(), y_train_smote) predictions = MNB.predict(X_test) print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0)) print('Confusion Matrix:\n',confusion_matrix(y_test, predictions)) print('Classification report\n', classification_report(y_test, predictions)) """###**Complement Naive Bayes**""" kfold = KFold(n_splits=10) CNB = ComplementNB() results = cross_val_score(CNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy') print("Training Accuracy: %.3f" % (results.mean()*100.0)) CNB.fit(X_train_smote.todense(), y_train_smote) predictions = CNB.predict(X_test) print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0)) print('Confusion Matrix:\n',confusion_matrix(y_test, predictions)) print('Classification report\n', classification_report(y_test, predictions)) """###**Bernoulli Naive Bayes**""" kfold = KFold(n_splits=10) BNB = BernoulliNB() results = cross_val_score(BNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy') print("Training Accuracy: %.3f" % (results.mean()*100.0)) BNB.fit(X_train_smote.todense(), y_train_smote)
pickle.dump(mnb_model, open("mnb_model_tfidf.sav", 'wb')) pred_mnb = mnb_model.predict(q_test) #evaluate the model, for abstracts use multilabel_evaluation_multilabelbinarizer() for citations #use multilabel_evaluation() mnb_evaluation_scores, mnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer( d_test, label_encoder.inverse_transform(pred_mnb), "Multinomial Naive Bayes") #mnb_evaluation_scores, mnb_cm = evaluation.multilabel_evaluation( # d_test, label_encoder.inverse_transform(pred_mnb), "Multinomial Naive Bayes") documentation_file_modelopt.write(mnb_evaluation_scores) # Complement Naive Bayes: optimizing parameters with grid search print("Complement Naive Bayes model evaluation") cnb_dict = dict(estimator__alpha=[1, 2, 5, 10, 50]) classifier_cnb = RandomizedSearchCV(estimator=OneVsRestClassifier( ComplementNB()), param_distributions=cnb_dict, n_iter=5, n_jobs=1) classifier_cnb.fit(q_train, d_train_encoded) documentation_file_parameteropt.write( "Complement Naive Bayes: Best parameters {}, reached score: {} \n".format( classifier_cnb.best_params_, classifier_cnb.best_score_)) cnb_model = classifier_cnb.best_estimator_ pickle.dump(cnb_model, open("cnb_model_tfidf.sav", 'wb')) pred_cnb = cnb_model.predict(q_test) #evaluate the model, for abstracts use multilabel_evaluation_multilabelbinarizer() for citations #use multilabel_evaluation() cnb_evaluation_scores, cnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer( d_test, label_encoder.inverse_transform(pred_cnb), "Complement Naive Bayes")
def iterate_by_randomsearch(train_x, train_y): classifiers = [ # thsis is for anomaly detection (IsolationForest(),{"n_estimators":50, # "contamination":np_uniform(0., 0.5), # "behaviour":["old", "new"], # "bootstrap": [True, False], # "max_features": sp.stats.randint(1, 7), # "min_samples_split": sp.stats.randint(2, 11)}), # this is for outlier detection (RadiusNeighborsClassifier(), {"radius": sp.stats.uniform(0.5, 5), # "algorithm": ["ball_tree", "kd_tree", "brute"], # "leaf_size": sp.stats.randint(20, 100), # "p": [1, 2]}) (AdaBoostClassifier(), { "n_estimators": sp.stats.randint(25, 100) }), (BaggingClassifier(), { "n_estimators": sp.stats.randint(25, 100), "max_features": sp.stats.randint(1, 7), "bootstrap": [True, False], "bootstrap_features": [True, False], }), (ExtraTreesClassifier(), { "n_estimators": sp.stats.randint(25, 100), "max_depth": [3, None], "max_features": sp.stats.randint(1, 7), "min_samples_split": sp.stats.randint(2, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"] }), (GradientBoostingClassifier(), { "n_estimators": sp.stats.randint(25, 100), "loss": ["deviance", "exponential"], "max_features": sp.stats.randint(1, 7), "min_samples_split": sp.stats.randint(2, 11), "criterion": ["friedman_mse", "mse", "mae"], "max_depth": [3, None] }), (RandomForestClassifier(), { "n_estimators": sp.stats.randint(25, 100), "max_depth": [3, None], "max_features": sp.stats.randint(1, 7), "min_samples_split": sp.stats.randint(2, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"] }), (GaussianProcessClassifier(), {}), (LogisticRegression(), { "max_iter": sp.stats.randint(0, 100), "solver": ["lbfgs", "sag", "saga"] }), (PassiveAggressiveClassifier(), { "max_iter": sp.stats.randint(0, 1230), "tol": sp.stats.uniform(0.0001, 0.05) }), (RidgeClassifier(), { "max_iter": sp.stats.randint(0, 2000), "tol": sp.stats.uniform(0.0001, 0.05), "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"] }), (SGDClassifier(), { "max_iter": sp.stats.randint(0, 2000), "tol": sp.stats.uniform(0.0001, 0.05), "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"], "penalty": ["none", "l2", "l1", "elasticnet"] }), (BernoulliNB(), {}), (MultinomialNB(), {}), (GaussianNB(), {}), (ComplementNB(), {}), (KNeighborsClassifier(), { "n_neighbors": sp.stats.randint(1, 50), "algorithm": ["ball_tree", "kd_tree", "brute"], "leaf_size": sp.stats.randint(20, 100), "p": [1, 2] }), (NearestCentroid(), {}), (MLPClassifier(), { "hidden_layer_sizes": (random.randint(10, 1000), ), "activation": ["identity", "logistic", "tanh", "relu"], "solver": ["lbfgs", "sgd", "adam"], "alpha": sp.stats.uniform(0.00001, 0.001), "learning_rate": ["constant", "invscaling", "adaptive"], "max_iter": sp.stats.randint(0, 2000), "tol": sp.stats.uniform(0.0001, 0.05) }), (DecisionTreeClassifier(), { "max_depth": [3, None], "max_features": sp.stats.randint(1, 7), "min_samples_split": sp.stats.randint(2, 11), "criterion": ["gini", "entropy"] }), (LinearSVC(), { "penalty": ["l2"], "tol": sp.stats.uniform(1e-5, 1e-3), "C": sp.stats.uniform(0.1, 5), "max_iter": sp.stats.randint(0, 2000) }), (NuSVC(), { "gamma": sp.stats.uniform(1e-4, 1e-2), "kernel": ["linear", "poly", "rbf", "sigmoid"], "tol": sp.stats.uniform(1e-4, 1e-2), }), (SVC(), { "gamma": sp.stats.uniform(1e-4, 1e-2), "kernel": ["linear", "poly", "rbf", "sigmoid"], "tol": sp.stats.uniform(1e-4, 1e-2), }), (LinearDiscriminantAnalysis(), { "solver": ["svd", "lsqr", "eigen"], "n_components": random.randint(2, 4), "tol": sp.stats.uniform(1e-5, 
1e-2) }), (QuadraticDiscriminantAnalysis(), { "tol": sp.stats.uniform(1e-5, 1e-2) }) ] df = pd.DataFrame( columns=['alg', 'perf', 'est', 'rank', 'mean', 'std', 'parameters']) for clf in classifiers: print(type(clf[0]).__name__, "started at", datetime.now()) n_iter = 10 random_search = RandomizedSearchCV(clf[0], param_distributions=clf[1], n_iter=n_iter, cv=5) start = time() random_search.fit(train_x, train_y) #print("%s RandomizedSearchCV took %.2f seconds for %d candidates" # " parameter settings." % (type(clf[0]).__name__,(time() - start), n_iter)) df = report(df, type(clf[0]).__name__, time() - start, n_iter, random_search.cv_results_) print(df)
def container(train_path, test_path, test_original_path): """ :param train_path: training set path :param test_path: evaluation set path :param test_original_path: evaluation set original path :return: None """ train_features = pd.read_csv(train_path) dev_features = pd.read_csv(test_path) classifier_container = [ {"model": BernoulliNB(alpha=1.0e-10), "name": "BernoulliNB", "params": { "n_estimators": [102], "max_samples": [0.3], "max_features": [0.5], # "n_estimators": range(102, 106, 2), # "max_samples": linspace(0.2, 0.4, 3), # "max_features": linspace(0.3, 0.5, 3), "warm_start": [True] }}, {"model": ComplementNB(alpha=1.0e-10), "name": "ComplementNB", "params": { "n_estimators": [106], "max_samples": [0.2], "max_features": [0.5], # "n_estimators": range(102, 104, 2), # "max_samples": linspace(0.1, 0.3, 3), # "max_features": linspace(0.4, 0.6, 3), "warm_start": [True] }}, {"model": MultinomialNB(alpha=1.0e-10), "name": "MultinomialNB", "params": { "n_estimators": [102], "max_samples": [0.3], "max_features": [0.5], # "n_estimators": range(100, 110, 2), # "max_samples": linspace(0.1, 0.5, 5), # "max_features": linspace(0.2, 0.6, 5), "warm_start": [True] }}, {"model": RandomForestClassifier( n_estimators=106, criterion="entropy" ), "name": "RandomForestClassifier", "params": { "n_jobs": [-1] } }, {"model": DecisionTreeClassifier( criterion="entropy" ), "name": "DecisionTreeClassifier", "params": { }} ] results = [] for idx, clf in enumerate(classifier_container): logging.info("[*] ({1}/{2}) Training with {0} ...".format(clf["name"], idx + 1, len(classifier_container))) bag = BaggingClassifier(base_estimator=clf["model"]) # grid = RandomizedSearchCV(bag, param_dist, cv=42, n_iter=300, scoring='accuracy', n_jobs=-1, verbose=2, refit=True) # grid = GridSearchCV(bag, clf["params"], cv=42, n_jobs=-1, verbose=0, refit=True) grid = GridSearchCV(bag, clf["params"], cv=42, scoring='accuracy', n_jobs=-1, verbose=0, refit=True) grid.fit(train_features.iloc[:, 1:-1], train_features['class'].to_list()) res = grid.best_estimator_.predict_proba(dev_features.iloc[:, 1:-1]) results.append(res) # clf["model"].fit(train_features.iloc[:, 1:-1], train_features['class'].to_list()) # res = clf["model"].predict(dev_features.iloc[:, 1:-1]) # grid.best_estimator_ = clf["model"] # grid.best_params_ = None # grid.scoring = None # grid.cv = None # accuracy, scores = my_score(res, test_original_path, True) accuracy, scores = my_score(argmax(res, axis=1), test_original_path, True) save_model(grid, accuracy, scores) ensemble_res = results[0] for i in range(1, len(results)): ensemble_res += results[i] ensemble_res = argmax(ensemble_res, axis=1).tolist() accuracy, scores = my_score(ensemble_res, test_original_path, True) print("[*] Accuracy: %f" % accuracy) pprint(scores) # save_model(grid, accuracy, scores) print()
print("\nBernoulliNB working...\n") BNB = BernoulliNB() BNB.fit(x_train, y_train.values.ravel()) print("\nAccuracy Score:", accuracy_score(y_test, BNB.predict(x_test))) print("Confusion Matrix:\n") print(confusion_matrix(y_test, BNB.predict(x_test))) if (sel == 3): #MultinomialNB Classifier print("\nMultinomialNB working...\n") MNB = MultinomialNB() MNB.fit(x_train, y_train.values.ravel()) print("\nAccuracy Score:", accuracy_score(y_test, MNB.predict(x_test))) print("Confusion Matrix:\n") print(confusion_matrix(y_test, MNB.predict(x_test))) if (sel == 4): #ComplementNB Classifier print("\nComplementNB working...\n") CNB = ComplementNB() CNB.fit(x_train, y_train.values.ravel()) print("\nAccuracy Score:", accuracy_score(y_test, CNB.predict(x_test))) print("Confusion Matrix:\n") print(confusion_matrix(y_test, CNB.predict(x_test))) if (opt == 4): break
# %% ############################# # Complement Naive Bayes ############################# from sklearn.naive_bayes import ComplementNB # %% tmp = np.unique(np.where(X_train > 0)[0]) X_train_NB = X_train[tmp, :] Y_train_NB = Y_train[tmp] # %% tmp = np.unique(np.where(X_test > 0)[0]) X_test_NB = X_test[tmp, :] Y_test_NB = Y_test[tmp] # %% clf = ComplementNB() clf.fit(X_train_NB, Y_train_NB) # %% pred = clf.predict(X_test) # %% result_NB = np.array( [[np.sum(pred * Y_test), np.sum(pred * (1 - Y_test))], [np.sum((1 - pred) * Y_test), np.sum((1 - pred) * (1 - Y_test))]]) # %% POD_NB = (result_NB[0, 0]) / (result_NB[0, 0] + result_NB[1, 0]) FAR_NB = 1 - (result_NB[0, 0]) / (result_NB[0, 0] + result_NB[0, 1])
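For reference, the 2x2 array assembled above is the standard forecast-verification contingency table, and the POD / FAR lines implement its usual definitions. A short restatement of the same arithmetic, added only for clarity and reusing the snippet's result_NB, POD_NB and FAR_NB:

# %%
#                     observed = 1     observed = 0
#   predicted = 1       hits (a)       false alarms (b)
#   predicted = 0       misses (c)     correct negatives (d)
#
# POD (probability of detection, i.e. recall) = a / (a + c)
# FAR (false alarm ratio)                      = b / (a + b) == 1 - a / (a + b)
a, b = result_NB[0, 0], result_NB[0, 1]
c, d = result_NB[1, 0], result_NB[1, 1]
assert np.isclose(POD_NB, a / (a + c))
assert np.isclose(FAR_NB, b / (a + b))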
def test_cnb(): # Tests ComplementNB when alpha=1.0 for the toy example in Manning, # Raghavan, and Schuetze's "Introduction to Information Retrieval" book: # https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html # Training data points are: # Chinese Beijing Chinese (class: China) # Chinese Chinese Shanghai (class: China) # Chinese Macao (class: China) # Tokyo Japan Chinese (class: Japan) # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo. X = np.array([[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]]) # Classes are China (0), Japan (1). Y = np.array([0, 0, 0, 1]) # Check that weights are correct. See steps 4-6 in Table 4 of # Rennie et al. (2003). theta = np.array([ [ (0 + 1) / (3 + 6), (1 + 1) / (3 + 6), (1 + 1) / (3 + 6), (0 + 1) / (3 + 6), (0 + 1) / (3 + 6), (1 + 1) / (3 + 6) ], [ (1 + 1) / (6 + 6), (3 + 1) / (6 + 6), (0 + 1) / (6 + 6), (1 + 1) / (6 + 6), (1 + 1) / (6 + 6), (0 + 1) / (6 + 6) ]]) weights = np.zeros(theta.shape) normed_weights = np.zeros(theta.shape) for i in range(2): weights[i] = -np.log(theta[i]) normed_weights[i] = weights[i] / weights[i].sum() # Verify inputs are nonnegative. clf = ComplementNB(alpha=1.0) assert_raises(ValueError, clf.fit, -X, Y) clf.fit(X, Y) # Check that counts/weights are correct. feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]]) assert_array_equal(clf.feature_count_, feature_count) class_count = np.array([3, 1]) assert_array_equal(clf.class_count_, class_count) feature_all = np.array([1, 4, 1, 1, 1, 1]) assert_array_equal(clf.feature_all_, feature_all) assert_array_almost_equal(clf.feature_log_prob_, weights) clf = ComplementNB(alpha=1.0, norm=True) clf.fit(X, Y) assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
class ProbabilisticValidator(): """ # The probabilistic validator is a quick to train model used for validating the predictions of our main model # It is fit to the results our model gets on the validation set """ _smoothing_factor = 0.5 # TODO: Autodetermine smotthing factor depending on the info we know about the dataset _probabilistic_model = None _X_buff = None _Y_buff = None def __init__(self, col_stats, data_type=None): """ Chose the algorithm to use for the rest of the model As of right now we go with ComplementNB """ self._X_buff = [] self._Y_buff = [] self._predicted_buckets_buff = [] self._real_buckets_buff = [] self.col_stats = col_stats if 'percentage_buckets' in col_stats: self._probabilistic_model = MultinomialNB( alpha=self._smoothing_factor) self.buckets = col_stats['percentage_buckets'] self.bucket_keys = [i for i in range(len(self.buckets))] if len(self.buckets) < 3: self._probabilistic_model = ComplementNB( alpha=self._smoothing_factor) else: self._probabilistic_model = ComplementNB( alpha=self._smoothing_factor) self.buckets = None self.data_type = col_stats['data_type'] self.bucket_accuracy = {} def register_observation(self, features_existence, real_value, predicted_value, hmd=None): """ # Register an observation in the validator's internal buffers :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists) :param real_value: The real value/label for this prediction :param predicted_value: The predicted value/label :param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value` """ try: predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float( predicted_value) except: predicted_value = None try: real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float( str(real_value).replace(',', '.')) except: real_value = None if self.buckets is not None: predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd) real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd) X = [False] * (len(self.buckets) + 1) X[predicted_value_b] = True X = X + features_existence self._X_buff.append(X) self._Y_buff.append(real_value_b) self._real_buckets_buff = self._Y_buff self._predicted_buckets_buff.append(predicted_value_b) # If no column is ignored, compute the accuracy for this bucket nr_missing_features = len( [x for x in features_existence if x is False or x is 0]) if nr_missing_features == 0: if real_value_b not in self.bucket_accuracy: self.bucket_accuracy[real_value_b] = [] self.bucket_accuracy[real_value_b].append( int(real_value_b == predicted_value_b)) else: predicted_value_b = predicted_value real_value_b = real_value self._X_buff.append(features_existence) self._Y_buff.append(real_value_b == predicted_value_b) self._real_buckets_buff.append(real_value_b) self._predicted_buckets_buff.append(predicted_value_b) def get_accuracy_histogram(self): x = [] y = [] total_correct = 0 total_vals = 0 buckets_with_no_observations = [] for bucket in range(len(self.buckets)): try: total_correct += sum(self.bucket_accuracy[bucket]) total_vals += len(self.bucket_accuracy[bucket]) y.append( sum(self.bucket_accuracy[bucket]) / len(self.bucket_accuracy[bucket])) except: # If no observations were made for this bucket buckets_with_no_observations.append(bucket) y.append(None) x.append(bucket) validation_set_accuracy = total_correct / total_vals for bucket in buckets_with_no_observations: 
y[x.index(bucket)] = validation_set_accuracy return {'buckets': x, 'accuracies': y}, validation_set_accuracy def partial_fit(self): """ # Fit the probabilistic validator on all observations recorder that haven't been taken into account yet """ log_types = np.seterr() np.seterr(divide='ignore') if self.buckets is not None: self._probabilistic_model.partial_fit(self._X_buff, self._Y_buff, classes=self.bucket_keys) else: self._probabilistic_model.partial_fit(self._X_buff, self._Y_buff, classes=[True, False]) np.seterr(divide=log_types['divide']) self._X_buff = [] self._Y_buff = [] def fit(self): """ # Fit the probabilistic validator on all observations recorder that haven't been taken into account yet """ log_types = np.seterr() np.seterr(divide='ignore') self._probabilistic_model.fit(self._X_buff, self._Y_buff) np.seterr(divide=log_types['divide']) self._X_buff = [] self._Y_buff = [] def get_confusion_matrix(self): matrix = confusion_matrix(self._real_buckets_buff, self._predicted_buckets_buff) return matrix def evaluate_prediction_accuracy(self, features_existence, predicted_value, always_use_model_prediction): """ # Fit the probabilistic validator on an observation def evaluate_prediction_accuracy(self, features_existence, predicted_value): :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists) :param predicted_value: The predicted value/label :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value) """ if self.buckets is not None: predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats) X = [False] * (len(self.buckets) + 1) X[predicted_value_b] = True X = [X + features_existence] else: X = [features_existence] distribution = self._probabilistic_model.predict_proba(np.array(X))[0] distribution = distribution.tolist() if len([x for x in distribution if x > 0.01]) > 4: # @HACK mean = np.mean(distribution) std = np.std(distribution) distribution = [x if x > (mean - std) else 0 for x in distribution] sum_dist = sum(distribution) distribution = [x / sum_dist for x in distribution] min_val = min([x for x in distribution if x > 0.001]) distribution = [ x - min_val if x > min_val else 0 for x in distribution ] sum_dist = sum(distribution) distribution = [x / sum_dist for x in distribution] # @HACK else: pass return ProbabilityEvaluation(self.buckets, distribution, predicted_value, always_use_model_prediction)
df = df.drop(columns=feat, axis=1)
dummy = None

# split data to train, heldout, and test datasets
print('INFO: Splitting data into train/heldout/test datasets.')
x_train = np.array(df[df['data'] == 'T'].drop(columns=['data', 'result']))
y_train = np.array(df[df['data'] == 'T']['result'].astype('bool'))
x_valid = np.array(df[df['data'] == 'V'].drop(columns=['data', 'result']))
y_valid = np.array(df[df['data'] == 'V']['result'].astype('bool'))
x_hold = np.array(df[df['data'] == 'H'].drop(columns=['data', 'result']))
y_hold = np.array(df[df['data'] == 'H']['result'].astype('bool'))

# machine learning classification models
classif = [('Gaussian Naive Bayes', GaussianNB()),
           ('Bernoulli Naive Bayes', BernoulliNB()),
           ('Complement Naive Bayes', ComplementNB()),
           ('Multinomial Naive Bayes', MultinomialNB()),
           ('LOGistic Regression', LogisticRegression(solver='liblinear',
                                                      multi_class='ovr',
                                                      penalty='l2',
                                                      random_state=24)),
           ('LOGistic Regression 2', LogisticRegression(solver='saga',
                                                        multi_class='ovr',
                                                        l1_ratio=0.3,
                                                        penalty='elasticnet',
                                                        max_iter=1000,
                                                        random_state=24)),
           ('LOGistic Regression 3', LogisticRegression(solver='saga',
X, y = digits.data, digits.target
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)

# fit a Gaussian naive Bayes model
gnb = GaussianNB().fit(Xtrain, Ytrain)
# check the accuracy score
acc_score = gnb.score(Xtest, Ytest)
# check the predictions
Y_pred = gnb.predict(Xtest)
# confusion matrix
cm = CM(Ytest, Y_pred)

# look at which kinds of datasets the naive Bayes variants suit
h = .02
names = ["Multinomial", "Gaussian", "Bernoulli", "Complement"]
classifiers = [MultinomialNB(), GaussianNB(), BernoulliNB(), ComplementNB()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
figure = plt.figure(figsize=(6, 9))
i = 1
for ds_index, ds in enumerate(datasets):
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
    x1_min, x1_max = X[:, 0].min() - .5, X[:, 0].max() + .5
class Classifier:

    def __init__(self, max_df=0.80, max_features=6500):
        self.count_vect = TfidfVectorizer(max_df=max_df,
                                          stop_words='english',
                                          max_features=max_features,
                                          use_idf=True)
        self.cnb = ComplementNB()
        np.random.seed(2222)

    def __fit(self):
        self.cnb.fit(self.x_train, self.train_set['category'])

    # This method must be called right after object creation in order to set up the data.
    # The test_size argument specifies the size of the test set.
    def set_data(self, dataset: pd.DataFrame, labels: list, test_size=0.25):
        self.train_set, self.test_set = train_test_split(dataset, test_size=test_size)
        self.x_train = self.count_vect.fit_transform(self.train_set['text'])
        self.labels = labels
        self.__fit()

    # This method returns the predicted label for the text provided
    def predict(self, text: str):
        txt = TextTools()
        text = txt.preprocess(text)
        feats = self.count_vect.transform([text])
        return self.cnb.predict(feats)

    # This method returns a matrix of probabilities computed by Complement Naive Bayes
    def get_predict_proba(self, text: str):
        feats = self.count_vect.transform([text])
        predictions = {
            'label': (self.cnb.predict(feats))[0],
            'features': self.cnb.predict_proba(feats)
        }
        return predictions

    # This method returns the f1-score
    def get_score(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        return f1_score(self.test_set['category'], y_test_pred,
                        average=None, labels=self.labels).mean()

    # This method plots the confusion matrix
    def get_cmatrix(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        disp = plot_confusion_matrix(self.cnb, x_test, self.test_set['category'],
                                     display_labels=self.labels,
                                     cmap=plt.cm.Blues, normalize='true')
        plt.show()

    # This method computes the cosine similarity between item1 and item2
    # item1 and item2 must be array-like
    def similarity(self, item1, item2):
        return cosine(item1, item2)
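A hypothetical usage sketch for the wrapper above. The tiny DataFrame, its 'text'/'category' columns and the label list are invented for illustration (the column names follow the class's own set_data code, and predict() still relies on the repository's TextTools helper):

import pandas as pd

df = pd.DataFrame({
    'text': ['cheap flights to rome', 'new gpu benchmark results',
             'pasta recipe with basil', 'quantum computing breakthrough'],
    'category': ['travel', 'tech', 'food', 'tech'],
})
clf = Classifier()
clf.set_data(df, labels=['travel', 'tech', 'food'], test_size=0.25)
print(clf.predict('best trattoria near the colosseum'))   # predicted label
print(clf.get_score())                                    # mean per-class f1 on the held-out split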
def ensemble_all_general(X, y, fold): models = [] num_trees = 150 seed = 7 est1 = SVC(kernel='linear', gamma='auto', C=1.0) est2 = SVC(kernel='rbf', gamma='auto', C=1.0) est3 = GaussianNB() est4 = BernoulliNB() est5 = ComplementNB() est6 = DecisionTreeClassifier() est7 = DecisionTreeClassifier(criterion="entropy") est8 = RandomForestClassifier(n_estimators=50) est9 = KNeighborsClassifier(n_neighbors=6) est10 = BaggingClassifier(base_estimator=est6, n_estimators=num_trees, random_state=seed) est11 = AdaBoostClassifier(n_estimators=50, random_state=seed) est12 = GradientBoostingClassifier(n_estimators=150, random_state=seed) models.append(('SVM-1', est1)) models.append(('SVM-2', est2)) models.append(('NB-1', est3)) models.append(('NB-2', est4)) models.append(('NB-3', est5)) models.append(('DT-1', est6)) models.append(('DT-2', est7)) #models.append(('RF-1', est8)) models.append(('RF-2', RandomForestClassifier())) #models.append(('KNN-1', est9)) models.append(('KNN-2', KNeighborsClassifier())) #models.append(('LDA', LinearDiscriminantAnalysis())) #models.append(('bagging', est10)) #models.append(('adaboost', est11)) #models.append(('gradboost', est12)) #plot_ml_model(models) # evaluate each model in turn #seed = 7 results = [] names = [] scoring = 'accuracy' for name, model in models: #ld = model_selection.KFold(n_splits=10, random_state=seed) cv_results = model_selection.cross_val_score(model, X, y, cv=fold, scoring=scoring) results.append(cv_results) names.append(name) msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(msg) # boxplot algorithm comparison fig = pyplot.figure(figsize=(16, 16)) fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) pyplot.boxplot(results) ax.set_xticklabels(names) pyplot.show()
tee = config.Tee('../Results/%s/CNB_Regroup_%s%s_model.txt' % (config.args.dataset, config.args.tf_idf, config.args.remove_non), 'w') from header_model_data import * from sklearn.naive_bayes import ComplementNB print("Regrouping the labels") for i in range(len(yDF)): if(yDF[i] != pr.y_conversion('new-idea')): yDF[i] = pr.y_conversion('Non') print("The Class Distribution is:") classDist = Counter(yDF) for k in classDist.keys(): print("\t"+str(pr.conversion_y(k))+":"+str(classDist[k])) print("Defining and doing a Complement Naive Bayes classifier for New-idea vs rest") NB = ComplementNB() scores = cross_validate(NB, xDF, yDF, cv=logo, scoring = scorer) #print(scores) #print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) compute_stats(originalclass, predictedclass, True) print("Confusion Matrix") labelList = list(range(2)) print_cm(confusion_matrix(originalclass, predictedclass, labels = labelList), [pr.conversion_y(x) for x in labelList]) tee.close()