acc = model.score(X_test, y_test)  # mean accuracy on the given data and labels
print('Logistic regression model score:', acc)
# Parameter notes:
# penalty: regularization term to use (default: l2)
# dual: use False when n_samples > n_features (default)
# C: inverse of regularization strength; smaller values mean stronger regularization
# n_jobs: number of parallel jobs
# random_state: random number generator seed
# fit_intercept: whether to fit an intercept term

# Naive Bayes
# "Bayesian classification" is the collective name for the family of classifiers built on Bayes' theorem;
# naive Bayes is the simplest and most common member of that family.
model = sk_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)  # multinomial naive Bayes
model = sk_bayes.BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)  # Bernoulli naive Bayes
model = sk_bayes.GaussianNB()  # Gaussian naive Bayes
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)  # mean accuracy on the given data and labels
print('Naive Bayes (Gaussian) model score:', acc)
# Parameter notes:
# alpha: smoothing parameter
# fit_prior: whether to learn class prior probabilities; False means use a uniform prior
# class_prior: user-specified class priors; if given, they are not adjusted from the data
# binarize: threshold for binarizing features; if None, the input is assumed to consist of binary vectors already

# Decision tree
model = sk_tree.DecisionTreeClassifier(criterion='entropy', max_depth=None, min_samples_leaf=1,
clf_RF, p = model_RF()
# model evaluation
acc_RF = model_evaluation(clf_RF, y_test)
print('{0:10s} {1:.1f}'.format('AUC Score', auc_score(y_test, p[:, 1]) * 100))

# Accuracy for all Models
accuracy_normal = [acc_LR, acc_NB, acc_RF, acc_DT]
accuracy_normal = ['{0:2f}'.format(i * 100) for i in accuracy_normal]

###############################################################
# 4.5 Cross Validation for all models
###############################################################
# Logistic Regression
clf1 = LogisticRegression(tol=1e-8, penalty='l2', C=2)
# Naive Bayes
clf2 = nb.BernoulliNB(alpha=1.0, binarize=0.0)
# Decision Tree
clf3 = DecisionTreeClassifier(max_depth=100)
# Random Forest
clf4 = RandomForestClassifier(n_estimators=100)

models = [clf1, clf2, clf3, clf4]
n_Folds = 10

# Accuracy after cross validation:
accuracy_cv = []
for clf in models:
    accuracy_common = 0
    for test_run in range(n_Folds):
        # (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=.2)
        # call classifier
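# The manual fold loop above is cut off. A minimal hedged sketch of the same idea using
# sklearn's cross_val_score instead of hand-rolled folds, assuming X and y hold the full
# feature matrix and labels (those names are not defined in the fragment above).
from sklearn.model_selection import cross_val_score

accuracy_cv_sketch = []
for clf in models:
    scores = cross_val_score(clf, X, y, cv=n_Folds, scoring='accuracy')  # one score per fold
    accuracy_cv_sketch.append('{0:2f}'.format(scores.mean() * 100))
print(accuracy_cv_sketch)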
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
def vocabvec(word, vocab):
    vector = [0] * len(vocab)
    for i in range(len(vocab)):
        if vocab[i] in word:
            vector[i] = 1
    return vector


X = [
    'my dog has flea problems help please',
    'maybe not take him to dog park stupid',
    'my dalmation is so cute I love him',
    'stop posting stupid worthless garbage',
    'mr licks ate my steak how to stop him',
    'quit buying worthless dog food stupid'
]
train_data_Y = [0, 1, 0, 1, 0, 1]

words = [get_word(x) for x in X]
vocab = get_vocab(words)
train_data_X = [vocabvec(word, vocab) for word in words]

from sklearn import naive_bayes as nb

clf = nb.BernoulliNB()
clf.fit(train_data_X, train_data_Y)

mail = 'my dog stupid'
word = get_word(mail)
vector = vocabvec(word, vocab)
print(clf.predict([vector]))
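# Hedged alternative sketch (not part of the original snippet): the same toy spam model built
# with sklearn's CountVectorizer(binary=True) instead of the hand-rolled get_word/get_vocab/vocabvec
# helpers, reusing the X sentences and train_data_Y labels defined above.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True)       # binary bag-of-words, like vocabvec
train_matrix = vectorizer.fit_transform(X)      # learn the vocabulary from the six sentences
clf2 = nb.BernoulliNB()
clf2.fit(train_matrix, train_data_Y)
print(clf2.predict(vectorizer.transform(['my dog stupid'])))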
def training(self, train_pct, output_path, output_scaler_path, model_name, properties):
    K.clear_session()
    self.graph = tf.get_default_graph()
    with self.graph.as_default():
        self.model_name = model_name
        df = pd.read_csv(self.fname)
        df = df.sample(frac=1)
        df_norm = df
        # df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
        # df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))
        new_df = df_norm.sample(frac=1, random_state=self.RSEED)
        x = new_df.drop('Class', axis=1)
        y = new_df['Class']
        x, tx, y, ty = train_test_split(x, y, train_size=train_pct, stratify=y)
        train_x = x.values
        train_y = y.values
        test_x = tx.values

        sc = StandardScaler()
        train_x = sc.fit_transform(train_x)
        pickle.dump(sc, open(output_scaler_path, "wb"))

        if model_name == ModelNames.RANDOM_FOREST:
            rf = RandomForestClassifier(**properties, random_state=self.RSEED)
            rf.fit(train_x, train_y)
            pickle.dump(rf, open(output_path, "wb"))
            self.model = rf
        elif model_name == ModelNames.LOGISTIC_REGRESSION:
            lr = LogisticRegression(**properties, random_state=self.RSEED)
            lr.fit(train_x, train_y)
            pickle.dump(lr, open(output_path, "wb"))
            self.model = lr
        elif model_name == ModelNames.ADAPTIVE_BOOST:
            ada = AdaBoostClassifier(**properties, random_state=self.RSEED)
            ada.fit(train_x, train_y)
            pickle.dump(ada, open(output_path, "wb"))
            self.model = ada
        elif model_name == ModelNames.NAIVE_BAYES:
            nb = naive_bayes.BernoulliNB(**properties)
            nb.fit(train_x, train_y)
            pickle.dump(nb, open(output_path, "wb"))
            self.model = nb
        elif model_name == ModelNames.SVM:
            clf = svm.SVC(random_state=self.RSEED, **properties)
            clf.fit(train_x, train_y)
            pickle.dump(clf, open(output_path, "wb"))
            self.model = clf
        elif model_name == ModelNames.OCSVM:
            ocsvm = svm.OneClassSVM(**properties)
            train_x = train_x[train_y == 0]
            ocsvm.fit(train_x)
            pickle.dump(ocsvm, open(output_path, "wb"))
            self.model = ocsvm
        elif model_name == ModelNames.AUTOENCODED_DEEP_LEARNING:
            input_dimension = train_x.shape[1]
            learning_rate = properties.get('learning_rate')
            encoding_dimension = properties.get('encoding_dimension')
            if learning_rate is None:
                learning_rate = 1e-7
            else:
                del properties['learning_rate']
            if encoding_dimension is None:
                encoding_dimension = input_dimension
            else:
                del properties['encoding_dimension']

            input_layer = Input(shape=(input_dimension,))
            Encoder1 = Dense(encoding_dimension, activation="tanh",
                             activity_regularizer=regularizers.l1(learning_rate))(input_layer)
            Encoder2 = Dense(int(encoding_dimension / 2), activation="relu")(Encoder1)
            Encoder3 = Dense(int(encoding_dimension / 4), activation="tanh")(Encoder2)
            Decoder1 = Dense(int(encoding_dimension / 4), activation="relu")(Encoder3)
            Decoder2 = Dense(int(encoding_dimension / 2), activation="tanh")(Decoder1)
            Decoder3 = Dense(input_dimension, activation="softmax")(Decoder2)

            AutoEncoderModel = Model(inputs=input_layer, outputs=Decoder3)
            AutoEncoderModel.compile(metrics=['accuracy'], loss='mean_squared_error', optimizer='adam')
            # AutoEncoderModel.compile(metrics=['accuracy'], loss=properties.get('loss'), optimizer=properties.get('optimizer'))
            cp = ModelCheckpoint(filepath=output_path, save_best_only=True)

            shuffle = True
            if self.RSEED is None:
                shuffle = False
            history = AutoEncoderModel.fit(train_x, train_x,
                                           epochs=properties.get('epochs'),
                                           batch_size=properties.get('batch_size'),
                                           shuffle=shuffle,
                                           verbose=1,
                                           callbacks=[cp],
                                           validation_data=(test_x, test_x)).history
            self.model = AutoEncoderModel
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, valid_y), confusion_matrix(valid_y, predictions)


# In[11]:

# Naive Bayes on Ngram Level TF IDF Vectors
NBAccuracy = train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", NBAccuracy)

# In[12]:

# SVM SVC on Ngram Level TF IDF Vectors
SVCAccuracy = train_model(SVC(kernel='linear'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVC, N-gram TF-IDF: ", SVCAccuracy)

# In[13]:

# SVM NuSVC on Ngram Level TF IDF Vectors
NuSVCAccuracy = train_model(NuSVC(kernel='linear', probability=True), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
import pickle
from sklearn import naive_bayes

with open('../DataSet-Release 1/ds2/ds2Train.csv', 'r') as myFile:
    ds2Train = [line.split(',') for line in myFile.read().split('\n')]  # [1:] # Remove header, but training set does not have header
ds2Train.pop()
# print(ds2Train[0])

featuresds2T = [d[:-1] for d in ds2Train]
featuresds2T = [[int(x) for x in row] for row in featuresds2T]  # Convert chars to int
labelsds2T = [d[-1] for d in ds2Train]
labelsds2T = [int(x) for x in labelsds2T]  # Convert chars to int

# Train using training set
nBclassifier = naive_bayes.BernoulliNB()
nBclassifier.fit(featuresds2T, labelsds2T)

# Export trained model
with open('naive_bayes_model_ds2.pkl', 'wb') as myFile:
    pickle.dump(nBclassifier, myFile)
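# A small hedged sketch of reusing the exported model (not in the original snippet):
# reload the pickle written above and run it on a few rows of the training features,
# assuming the feature layout matches what the classifier was fitted on.
with open('naive_bayes_model_ds2.pkl', 'rb') as myFile:
    loaded = pickle.load(myFile)
print(loaded.predict(featuresds2T[:5]))  # sanity check on the first few rows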
def create_naive_bayes():
    model = naive_bayes.BernoulliNB()
    return model
                    'occu_ Machine-op-inspct', 'occu_ Other-service', 'occu_ Priv-house-serv',
                    'occu_ Prof-specialty', 'occu_ Protective-serv', 'occu_ Sales',
                    'occu_ Tech-support', 'occu_ Transport-moving']
data = data.reindex(columns=reorder_colnames)
data = pd.get_dummies(data, columns=['race'])

features = ['age', 'fnlwgt', 'work_ Private', 'work_ Self-emp', 'work_ Government',
            'edunum', 'marital', 'relation', 'sex', 'gain', 'loss', 'hpw', 'country',
            'occu_ Adm-clerical', 'occu_ Armed-Forces', 'occu_ Craft-repair',
            'occu_ Exec-managerial', 'occu_ Farming-fishing', 'occu_ Handlers-cleaners',
            'occu_ Machine-op-inspct', 'occu_ Other-service', 'occu_ Priv-house-serv',
            'occu_ Prof-specialty', 'occu_ Protective-serv', 'occu_ Sales',
            'occu_ Tech-support', 'occu_ Transport-moving',
            'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black',
            'race_ Other', 'race_ White']

y = data['income']
X = data[features]
print('Using features: ' + str(features))

# Define the Naive Bayes models
gaussianModel = nb.GaussianNB()
bernoulliModel = nb.BernoulliNB()
multinomialModel = nb.MultinomialNB()
complementModel = nb.ComplementNB()

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Test options and evaluation metric
scoring = 'accuracy'

# Fit the training sets
gaussianModel.fit(X_train, y_train)
bernoulliModel.fit(X_train, y_train)
multinomialModel.fit(X_train, y_train)
complementModel.fit(X_train, y_train)
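# Hedged follow-up sketch (not in the original): score each fitted naive Bayes variant
# on the held-out split defined above, using the models and X_test/y_test already in scope.
for name, model in [('Gaussian', gaussianModel), ('Bernoulli', bernoulliModel),
                    ('Multinomial', multinomialModel), ('Complement', complementModel)]:
    print('{} NB test accuracy: {:.4f}'.format(name, model.score(X_test, y_test)))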
def document_everything(X_train, X_test, y_train, y_test):
    Reports = {}
    Accuracies = {}
    conf_matrices = {}

    clf = svm.LinearSVC(C=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['SVM'] = clf.score(X_test, y_test)
    Reports['SVM'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['SVM'] = metrics.confusion_matrix(y_test, y_pred)
    print('SVM Done')

    clf = ensemble.RandomForestClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['RF'] = clf.score(X_test, y_test)
    Reports['RF'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['RF'] = metrics.confusion_matrix(y_test, y_pred)
    print('RF Done')

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['DT'] = clf.score(X_test, y_test)
    Reports['DT'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['DT'] = metrics.confusion_matrix(y_test, y_pred)
    print('DT Done')

    clf = neighbors.KNeighborsClassifier(n_neighbors=1, leaf_size=10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['KNN'] = clf.score(X_test, y_test)
    Reports['KNN'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['KNN'] = metrics.confusion_matrix(y_test, y_pred)
    print('KNN Done')

    clf = naive_bayes.MultinomialNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MNB'] = clf.score(X_test, y_test)
    Reports['MNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MNB'] = metrics.confusion_matrix(y_test, y_pred)
    print('MNB Done')

    clf = naive_bayes.BernoulliNB(alpha=1e-10)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['BNB'] = clf.score(X_test, y_test)
    Reports['BNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['BNB'] = metrics.confusion_matrix(y_test, y_pred)
    print('BNB Done')

    clf = naive_bayes.ComplementNB()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['CNB'] = clf.score(X_test, y_test)
    Reports['CNB'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['CNB'] = metrics.confusion_matrix(y_test, y_pred)
    print('CNB Done')

    clf = neural_network.MLPClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Accuracies['MLP'] = clf.score(X_test, y_test)
    Reports['MLP'] = metrics.classification_report(y_test, y_pred)
    conf_matrices['MLP'] = metrics.confusion_matrix(y_test, y_pred)
    print('MLP Done')

    for clf in Reports:
        print(str(clf))
        print(str(Accuracies[clf]))
        print(str(Reports[clf]))
        print(str(conf_matrices[clf]))
        size = [conf_matrices[clf][0][0], conf_matrices[clf][0][1],
                conf_matrices[clf][1][0], conf_matrices[clf][1][1]]
        labels = 'True Negatives', 'False Positives', 'False Negatives', 'True Positives'
        explode = (0, .1, 0, .1)
        fig1, ax1 = plt.subplots()
        ax1.pie(size, explode=explode, labels=labels)
        ax1.axis('equal')
        plt.show()
def main():
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    combine = [train_df, test_df]

    for df in combine:
        df.info()
        standardize_data(df)
        create_columns(df)
        create_bins(df)
        encode_data(df)

    # Define target (Y variable)
    target = ["Survived"]

    # Define features (X variables)
    train_df_x = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
                  "FamilySize", "IsAlone", "Title"]

    # Define numerical features (binned and encoded)
    train_df_x_bin = ["Pclass", "Sex_Code", "AgeBin_Code", "FareBin_Code",
                      "Embarked_Code", "FamilySize", "IsAlone", "Title_Code"]

    # Analyze feature correlation with target
    for x in train_df_x:
        if train_df[x].dtype != "float64":
            print(train_df[[x, target[0]]].groupby(x).mean())

    # Graph individual features by survival
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.histplot(x="Fare", data=train_df, hue="Survived", multiple="stack", ax=axis[0])
    sns.histplot(x="Age", data=train_df, hue="Survived", multiple="stack", ax=axis[1])
    sns.histplot(x="FamilySize", data=train_df, hue="Survived", multiple="stack", ax=axis[2])

    fig, axis = plt.subplots(2, 3, figsize=(16, 12))
    sns.barplot(x="Pclass", y="Survived", data=train_df, ax=axis[0, 0])
    sns.barplot(x="Sex", y="Survived", data=train_df, ax=axis[0, 1])
    sns.barplot(x="Embarked", y="Survived", data=train_df, ax=axis[0, 2])
    sns.barplot(x="IsAlone", y="Survived", data=train_df, ax=axis[1, 0])
    sns.barplot(x="Title", y="Survived", data=train_df, ax=axis[1, 1])

    # Compare class with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Sex", ax=axis[0])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="IsAlone", ax=axis[1])
    sns.barplot(x="Pclass", y="Survived", data=train_df, hue="Embarked", ax=axis[2])

    # Compare Sex with a 2nd feature
    fig, axis = plt.subplots(1, 3, figsize=(9, 6))
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Pclass", ax=axis[0])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="IsAlone", ax=axis[1])
    sns.barplot(x="Sex", y="Survived", data=train_df, hue="Embarked", ax=axis[2])

    # Correlation heatmap of dataset
    fig, ax = plt.subplots(figsize=(14, 12))
    fig = sns.heatmap(
        train_df.corr(),
        cmap=sns.diverging_palette(240, 10, as_cmap=True),
        annot=True,
        ax=ax,
    )

    # Machine Learning Algorithm (MLA) selection and initialization
    mla = [
        linear_model.LogisticRegressionCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(dual=False),
        neighbors.KNeighborsClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        naive_bayes.GaussianNB(),
        naive_bayes.BernoulliNB(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
    ]

    mla_compare = test_models(mla, train_df, train_df_x_bin, target)
    best_estimator = optimize_params(mla, mla_compare, train_df, train_df_x_bin, target)
    generate_submission_csv(test_df, train_df_x_bin, best_estimator)
color = pd.get_dummies(test['color'])
test_data = pd.concat([test, color[all_colors]], axis=1)
color = pd.get_dummies(train['color'])
train_data = pd.concat([train, color[all_colors]], axis=1)

train_data = train_data.drop('id', axis=1, inplace=False)
test_data = test_data.drop('id', axis=1, inplace=False)

# Assert that the test set is only missing the 'type' column
train_data.columns - test_data.columns == ['type']

# MODELS

# Naive Bayes
nb = {'name': 'Bernoulli NaiveBayes'}
nb['model'] = naive_bayes.BernoulliNB()

# Logistic Regression
lr = {'name': 'Logistic Regression'}
lr['model'] = linear_model.LogisticRegression(solver='lbfgs')

# Logistic Regression with CV
lrcv = {'name': 'Cross-Validated Logistic Regression'}
lrcv['model'] = linear_model.LogisticRegressionCV(Cs=100, solver='lbfgs', n_jobs=-1)

# SVC
svc = {'name': 'Support Vector Machine'}
svc['model'] = OneVsRestClassifier(svm.SVC(kernel='rbf', probability=True))
X_test = pddtest.values[:, selection_feature_index[0:selection]]
y_test = list(pddtest.iloc[:, 0])
# ------------------------------------------------------------------------------------------------------
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model selection and classifier parameters --------------------------------------------------------------
RF = RandomForestClassifier(n_estimators=320, criterion='gini', max_depth=8,
                            min_samples_split=12, min_samples_leaf=3, min_weight_fraction_leaf=0.06,
                            max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                            min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None,
                            random_state=0, verbose=0, warm_start=False, class_weight=None)
knn = KNeighborsClassifier(n_neighbors=40, n_jobs=1)  # k-nearest neighbors
naiveB = naive_bayes.BernoulliNB(alpha=1.6, binarize=1.41, fit_prior=True, class_prior=None)  # 0.575
svm = SVC(C=100, kernel='rbf', gamma=0.01, probability=True)
LR = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=4.0,
                        fit_intercept=True, intercept_scaling=2, class_weight=None,
                        random_state=None, solver='liblinear', max_iter=100, multi_class='ovr',
                        verbose=0, warm_start=False, n_jobs=1)
eclf = VotingClassifier(estimators=[('RF', RF), ('knn', knn), ('naiveB', naiveB), ('svm', svm), ('LR', LR)],
                        voting='soft', weights=[1, 1, 1, 1, 1])

# Train the models ---------------------------------------------------------------------------------------
RF.fit(X_train, y_train)
knn.fit(X_train, y_train)
naiveB.fit(X_train, y_train)
svm.fit(X_train, y_train)
eclf.fit(X_train, y_train)
LR.fit(X_train, y_train)
def runBer(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.BernoulliNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model
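# Hedged usage sketch (not in the original): runBer returns class-probability arrays, so an
# evaluation such as log loss needs the probabilities and a hard prediction needs an argmax.
# The names train_X, train_y, test_X, test_y, test_X2 are assumed from the function signature.
import numpy as np
from sklearn import metrics

pred_val, pred_test, ber_model = runBer(train_X, train_y, test_X, test_y, test_X2)
print('validation log loss:', metrics.log_loss(test_y, pred_val))
print('validation accuracy:', metrics.accuracy_score(test_y, ber_model.classes_[np.argmax(pred_val, axis=1)]))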
                                     binary=True, ngram_range=NGRAM_RANGE)
#vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 2))
#vectorizer_binary = CountVectorizer(stop_words=stopWords, min_df=MIN_DF, binary=True, ngram_range=(1, 3))
vectorizer_tfidf = TfidfVectorizer(stop_words=stopWords, min_df=MIN_DF, ngram_range=NGRAM_RANGE)

#vectorizer = vectorizer_tfidf
vectorizer = vectorizer_binary
print(vectorizer)

#clf = linear_model.LogisticRegression(penalty='l2', C=1.2)
_ = linear_model.LogisticRegression()
_ = svm.LinearSVC()
_ = naive_bayes.BernoulliNB()  # useful for binary inputs (MultinomialNB is useful for counts)
_ = naive_bayes.GaussianNB()
_ = naive_bayes.MultinomialNB()
_ = ensemble.AdaBoostClassifier(n_estimators=100,
                                base_estimator=tree.DecisionTreeClassifier(max_depth=2, criterion='entropy'))
#clf = ensemble.AdaBoostClassifier(n_estimators=100, base_estimator=tree.DecisionTreeClassifier(max_depth=2))
_ = tree.DecisionTreeClassifier(max_depth=50, min_samples_leaf=5)
#clf = tree.DecisionTreeClassifier(max_depth=2, min_samples_leaf=5, criterion='entropy')
#clf = ensemble.RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=10, oob_score=False, n_jobs=-1, criterion='entropy')
_ = ensemble.RandomForestClassifier(max_depth=10, min_samples_leaf=5, n_estimators=50,
                                    n_jobs=-1, criterion='entropy')
#clf = ensemble.RandomForestClassifier(max_depth=30, min_samples_leaf=5, n_estimators=100, oob_score=True, n_jobs=-1)
print("----------K-NEAREST NEIGHBOR-------------------") # Build a sequence of models for k = 2, 4, 6, 8, ..., 20. ks = [16] for k in ks: # Create model and fit. mod_knn = neighbors.KNeighborsClassifier(n_neighbors=k) mod_knn.fit(x_train, y_train) # Make predictions - both class labels and predicted probabilities. predsknn = mod_knn.predict(x_test) print(' EVALUATING MODEL: k = ' + str(k)) # Look at results. print_binary_classif_error_report(y_test, predsknn) print("\n----------Bernoulli - NAIVE BAYESIAN-------------------") bnb_mod = naive_bayes.BernoulliNB() bnb_mod.fit(x_train, y_train) predsbnb = bnb_mod.predict(x_test) print_binary_classif_error_report(y_test, predsbnb) print("\n----------DECISION TREE-------------------") print("DTREE WITH GINI IMPURITY CRITERION:") dtree_gini_mod = tree.DecisionTreeClassifier(criterion='gini') dtree_gini_mod.fit(x_train, y_train) preds_gini = dtree_gini_mod.predict(x_test) print_binary_classif_error_report(y_test, preds_gini) print("\n----------SUPPORT VECTOR MACHINE-------------------") # Build a sequence of models for different n_est and depth values. cs = [2.0] for c in cs:
# Naive Bayes print("Naive Bayes với tfidf") train_model(naive_bayes.MultinomialNB(), X_data_tfidf, y_data_n, X_test_tfidf, y_test_n, is_neuralnet=False) ########### kết quả Naive Bayes với tfidf: # Validation accuracy: 0.7211404728789986 # Test accuracy: 0.7024677045379265 print("BernoulliNB với tfidf") train_model(naive_bayes.BernoulliNB(), X_data_tfidf, y_data_n, X_test_tfidf, y_test_n, is_neuralnet=False) ########### kết quả BernoulliNB với tfidf # Validation accuracy: 0.760778859527121 # Test accuracy: 0.7232527326929447 print("BernoulliNB với tfidf SVD") train_model(naive_bayes.BernoulliNB(), X_data_tfidf_svd, y_data_n, X_test_tfidf_svd, y_test_n,
data.extend(ham_processed_texts)
data.extend(spam_processed_texts)

textMatrix = transformTextToSparseMatrix(data)
textMatrix.head()

features = pd.DataFrame(textMatrix.apply(sum, axis=0))
extractedfeatures = [features.index[i] for i in range(features.shape[0]) if features.iloc[i, 0] > 5]
textMatrix = textMatrix[extractedfeatures]
textMatrix = textMatrix[extractedfeatures]

labels = []
labels.extend(ones(5000))
labels.extend(zeros(5001))

# split into a train and a validation data set
train, test, trainlabel, testlabel = train_test_split(textMatrix, labels, test_size=0.1)

# bayes
clf = bayes.BernoulliNB(alpha=1, binarize=True)
model = clf.fit(train, trainlabel)

# SVM
model2 = LinearSVC()
model2.fit(train, trainlabel)

print(model.score(test, testlabel))
print(model2.score(test, testlabel))
# 0.922077922077922
# 0.987012987012987

import matplotlib.pyplot as plt
from pylab import *

# to support Chinese labels in plots
mpl.rcParams['font.sans-serif'] = ['SimHei']
names = ['1000', '2000', '4000', '6000', '8000', '9000']
test_data = pd.concat([hour, days, year, block, district, xs, ys], axis=1)
del days, district, hour, month, year, block, xs, ys, crime

# MODELS =======================================================================================
"""
Choose the model you want, fit it with the data, then you can perform some
bagging or some boosting on the base model you have.
"""

# Build up the features
features = list(train_data.columns[:-1])

# Base-Model construction
begin = time.time()
baseModel = naive_bayes.BernoulliNB()
baseModel.fit(train_data[features], train_data['Crime'])
showExecTime(begin, "BernoulliNB fitted.")

# Logistic Regression
begin = time.time()
lr = linear_model.LogisticRegression(solver='lbfgs')
lr.fit(train_data[features], train_data['Crime'])
showExecTime(begin, "LogisticRegression fitted.")

# Random Forest
begin = time.time()
randomForest = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1, min_samples_leaf=50)
randomForest.fit(train_data[features], train_data['Crime'])
data.extend(ham_mails)
data.extend(spam_mails)

textMatrix = transformTextToSparseMatrix(data)
# textMatrix.head()  # preview the sparse matrix

features = pd.DataFrame(textMatrix.apply(sum, axis=0))
print(features)  # show the total term frequencies

# Keep only words that appear in more than 5 emails as features, to reduce the feature dimension
extractedfeatures = [features.index[i] for i in range(features.shape[0]) if features.iloc[i, 0] > 5]  # a list of column indices
textMatrix = textMatrix[extractedfeatures]
textMatrix = textMatrix[extractedfeatures]

labels = []
labels.extend(ones(len(ham_mails)))
labels.extend(zeros(len(spam_mails)))

# Split into training and test sets
train, test, trainlabel, testlabel = train_test_split(textMatrix, labels, test_size=0.3)

BYM_lb = bayes.BernoulliNB(alpha=1, binarize=True)  # scikit-learn naive Bayes classifier
model = BYM_lb.fit(train, trainlabel)  # fit the model with the library implementation

print(train.shape)
print(len(trainlabel))
print(trainlabel[:int(sqrt(len(trainlabel)))])
print("accuracy =", model.score(test, testlabel))
# score_test = dt.score(test_col, test_ans)
# print('testing acc:', score_test)

# naive bayes
par_smooth = 0.003
# for i in range(10):
NB1 = naive_bayes.GaussianNB(var_smoothing=par_smooth)
NB1.fit(x_train, y_train)
score_train = NB1.score(x_train, y_train)
print('Gaussian NB training acc:', score_train)
# score_test = NB1.score(test_col, test_ans)
# print('Gaussian NB testing acc:', score_test)
# par_smooth = par_smooth + 0.001

NB2 = naive_bayes.BernoulliNB(alpha=0.1, fit_prior=False)
NB2.fit(x_train, y_train)
score_train = NB2.score(x_train, y_train)
print('Bernoulli NB training acc:', score_train)
# score_test = NB2.score(test_col, test_ans)
# print('Bernoulli NB testing acc:', score_test)

NB3 = naive_bayes.MultinomialNB(alpha=0.1, fit_prior=False)
NB3.fit(x_train, y_train)
score_train = NB3.score(x_train, y_train)
print('Multinomial NB training acc:', score_train)
# score_test = NB3.score(test_col, test_ans)
# print('Multinomial NB testing acc:', score_test)

multi_pred = NB3.predict(test_col)

if sys.argv[1] == 'D':
def retornaModeloTreinado(model, validacao):
    if validacao == 'aprovado':
        X = train_data_aprovado
        y = target_data_aprovado
    elif validacao == 'evasao':
        X = train_data_evasao
        y = target_data_evasao
    return model.fit(X, y)


# models chosen for pass/fail ('aprovado')
logistic_regression_aprovado = LogisticRegression(penalty='l1')
svm_aprovado = svm.SVC()
knn_aprovado = neighbors.KNeighborsClassifier(weights='distance')
naive_bayes_aprovado = naive_bayes.BernoulliNB()

# models chosen for dropout ('evasao')
logistic_regression_evasao = LogisticRegression(solver='newton-cg')
svm_evasao = svm.SVC()
knn_evasao = neighbors.KNeighborsRegressor(n_neighbors=7)
naive_bayes_evasao = naive_bayes.BernoulliNB()

# save the trained models
joblib.dump(retornaModeloTreinado(knn_aprovado, 'aprovado'), 'model/model_si_aprovado_knn.pkl')
joblib.dump(retornaModeloTreinado(knn_evasao, 'evasao'), 'model/model_si_evasao_knn.pkl')

print('ok')
body_title_corpus.append(np.concatenate((corpus_data_features_nd[i], corpus_data_features_nd1[i])))
body_title_corpus = array(body_title_corpus)

for i in range(len(corpus_data_features_nd)):
    body_title_tags_corpus.append(np.concatenate((body_title_corpus[i], corpus_data_features_nd2[i])))
body_title_tags_corpus = array(body_title_tags_corpus)
print(body_title_tags_corpus)

X_train, X_test, y_train, y_test = train_test_split(body_title_tags_corpus[0:len(train_data_df)],
                                                    train_data_df.Popularity, random_state=2)
print(body_title_tags_corpus.shape)

nb_model = naive_bayes.BernoulliNB()
#nb_model = naive_bayes.MultinomialNB()
#nb_model = naive_bayes.GaussianNB()
nb_model = nb_model.fit(X=X_train, y=y_train)
y_pred = nb_model.predict(X_test)  # get predictions

# note: these cross-validation scores are computed against the model's own predictions, not y_test
accu_score = cross_val_score(nb_model, X_test, y_pred, cv=10, scoring='accuracy').mean()
print("\n")
print("accuracy score : ", accu_score)
precision_score = cross_val_score(nb_model, X_test, y_pred, cv=10, scoring='precision').mean()
#print("\n")
def bayes_train(type, ratio):
    print "begin training bayes..."
    with open(dump_file) as f:
        x = cPickle.load(f)
        y = cPickle.load(f)
        idx = cPickle.load(f)
    x = x.tocsr()
    y = np.array(y.todense()).ravel()
    idx = np.array(idx.todense()).ravel()
    print "load data complete"

    x = preprocessing.normalize(x)
    x, y, idx = shuffle(x, y, idx, random_state=42)

    if type == "m":
        clf = naive_bayes.MultinomialNB()
    else:
        clf = naive_bayes.BernoulliNB()

    train_len = int(0.8 * ratio * y.shape[0])
    test_len = ratio * y.shape[0] - train_len
    train_x = x[:train_len]
    test_x = x[train_len:ratio * y.shape[0]]
    train_y = y[:train_len]
    test_y = y[train_len:ratio * y.shape[0]]
    test_idx = idx[train_len:ratio * y.shape[0]]

    clf.fit(train_x, train_y)
    pred_y = clf.predict(test_x)
    acc = accuracy_score(test_y, pred_y)

    score = np.empty([3, len(tags)], dtype=float)
    score[0], score[1], score[2], support = precision_recall_fscore_support(test_y, pred_y)
    macro = np.mean(score, axis=1)
    micro = np.mean(score * support, axis=1) / np.mean(support)

    f = open("error_log.txt", "w")
    for i in range(test_y.shape[0]):
        if test_y[i] == pred_y[i]:
            continue
        sql = "select url, comment from car.info where id = %d" % test_idx[i]
        cur.execute(sql)
        url, comment = cur.fetchone()
        info = "\npredict: %s\ntrue: %s\n%s\n%s\n" % (tags[pred_y[i]], tags[test_y[i]], url, comment)
        f.write(info)
    f.close()

    # with open("result.txt", "a") as f:
    #     info = "\nBayes with %s and num %d: \n" % (type, train_len)
    #     print info
    #     f.write(info)
    #     f.write(str(acc))

    print acc
    print score[0]
    print macro[0], micro[0]
    print score[1]
    print macro[1], micro[1]
    print score[2]
    print macro[2], micro[2]
    print support
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # Generalized Linear Models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # Support Vector Machine
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
def main_3():
    x_data = datasets.load_boston().data
    naive_bayes.BernoulliNB()
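# main_3 above only constructs a BernoulliNB and never fits it. A hedged completion sketch,
# assuming the intent is a binary task: features and target are binarized at their medians.
# (Note that load_boston is removed in recent scikit-learn releases, so this is illustrative only.)
import numpy as np
from sklearn import datasets, naive_bayes
from sklearn.model_selection import train_test_split

boston = datasets.load_boston()
X_bin = (boston.data > np.median(boston.data, axis=0)).astype(int)   # 0/1 features
y_bin = (boston.target > np.median(boston.target)).astype(int)       # above/below median price

X_tr, X_te, y_tr, y_te = train_test_split(X_bin, y_bin, test_size=0.2, random_state=0)
clf = naive_bayes.BernoulliNB()
clf.fit(X_tr, y_tr)
print('BernoulliNB test accuracy:', clf.score(X_te, y_te))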
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    #gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.LogisticRegression(C=100, random_state=0, solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    #naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    # SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
    ensemble.ExtraTreesClassifier(criterion='entropy', max_depth=6, n_estimators=100, random_state=0),
    ensemble.GradientBoostingClassifier(learning_rate=0.05, max_depth=2, n_estimators=300, random_state=0),
    ensemble.RandomForestClassifier(criterion='entropy', max_depth=6, n_estimators=100, oob_score=True, random_state=0),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(max_iter_predict=10, random_state=0),

    # GLM
    linear_model.LogisticRegressionCV(fit_intercept=True, random_state=0, solver='liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(alpha=0.1),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(algorithm='brute', n_neighbors=7, weights='uniform'),

    # SVM
    svm.SVC(C=2, decision_function_shape='ovo', gamma=0.1, probability=True, random_state=0),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),

    # Discriminant Analysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

kbestfilter = SelectKBest(chi2, k=500)
train_features = kbestfilter.fit_transform(dataset_small.get_train_features(),
                                           dataset_small.get_train_labels())
test_features = kbestfilter.transform(dataset_small.get_test_features())

##
threshold = 0.8 * (1 - 0.8)
sel_var = VarianceThreshold(threshold=threshold)
sel_var.fit(np.sign(dataset_small.get_train_features()))
train_selected_features = sel_var.transform(dataset_small.get_train_features())
test_selected_features = sel_var.transform(dataset_small.get_test_features())

## train naive bayes
import sklearn.naive_bayes as naive_bayes
bnb = naive_bayes.BernoulliNB()
spam_filter = bnb.fit(np.sign(train_selected_features), dataset_small.get_train_labels())
spam_pred = spam_filter.predict(test_selected_features)

## evaluate goodness of prediction
import sklearn.metrics
report = sklearn.metrics.classification_report(dataset_small.get_test_labels(), spam_pred)
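## hedged sketch (not in the original): the same chi2 selection plus BernoulliNB expressed as a
## single sklearn Pipeline, assuming the non-negative feature matrices exposed by dataset_small above
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import BernoulliNB

spam_pipeline = Pipeline([
    ('select', SelectKBest(chi2, k=500)),   # keep the 500 most dependent features
    ('bnb', BernoulliNB()),                 # BernoulliNB binarizes at 0.0 by default
])
spam_pipeline.fit(dataset_small.get_train_features(), dataset_small.get_train_labels())
print(spam_pipeline.score(dataset_small.get_test_features(), dataset_small.get_test_labels()))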
def autoTuning(X, y):
    cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)

    #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    vote_est = [
        #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),

        #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),

        #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),

        #Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),

        #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),

        #SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),

        #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]

    #Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    vote_hard_cv = model_selection.cross_validate(vote_hard, X, y, cv=cv_split)
    vote_hard.fit(X, y)

    print("Hard Voting Training w/bin score mean: {:.2f}".format(vote_hard_cv['train_score'].mean() * 100))
    print("Hard Voting Test w/bin score mean: {:.2f}".format(vote_hard_cv['test_score'].mean() * 100))
    print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(vote_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    vote_soft_cv = model_selection.cross_validate(vote_soft, X, y, cv=cv_split)
    vote_soft.fit(X, y)

    print("Soft Voting Training w/bin score mean: {:.2f}".format(vote_soft_cv['train_score'].mean() * 100))
    print("Soft Voting Test w/bin score mean: {:.2f}".format(vote_soft_cv['test_score'].mean() * 100))
    print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(vote_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    grid_n_estimator = range(10, 300, 50)
    grid_ratio = [.1, .25, .5, .75, 1.0]
    grid_learn = [.01, .03, .05, .75, .1, .15, .25]
    grid_max_depth = [2, 3, 4, 5, 6, 7, None]
    grid_min_samples = [5, 10, .03, .05, .10]
    grid_criterion = ['gini', 'entropy']
    grid_bool = [True, False]
    grid_seed = [0]

    grid_param = [
        [{
            #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator,  #default=50
            'learning_rate': grid_learn,  #default=1
            #'algorithm': ['SAMME', 'SAMME.R'],  #default='SAMME.R'
            'random_state': grid_seed
        }],
        [{
            #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'max_samples': grid_ratio,  #default=1.0
            'random_state': grid_seed
        }],
        [{
            #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'criterion': grid_criterion,  #default='gini'
            'max_depth':
                grid_max_depth,  #default=None
            'random_state': grid_seed
        }],
        [{
            #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
            #'loss': ['deviance', 'exponential'],  #default='deviance'
            'learning_rate': [.05],  #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            'n_estimators': [300],  #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
            #'criterion': ['friedman_mse', 'mse', 'mae'],  #default='friedman_mse'
            'max_depth': grid_max_depth,  #default=3
            'random_state': grid_seed
        }],
        [{
            #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
            'n_estimators': grid_n_estimator,  #default=10
            'criterion': grid_criterion,  #default='gini'
            'max_depth': grid_max_depth,  #default=None
            'oob_score': [True],  #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
            'random_state': grid_seed
        }],
        [{
            #GaussianProcessClassifier
            'max_iter_predict': grid_n_estimator,  #default: 100
            'random_state': grid_seed
        }],
        [{
            #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
            'fit_intercept': grid_bool,  #default: True
            #'penalty': ['l1', 'l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  #default: lbfgs
            'random_state': grid_seed
        }],
        [{
            #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
            'alpha': grid_ratio,  #default: 1.0
        }],
        #GaussianNB -
        [{}],
        [{
            #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
            'n_neighbors': [6, 7, 8, 9, 10, 11, 12, 14, 16, 18, 20, 22],  #default: 5
            'weights': ['uniform', 'distance'],  #default = 'uniform'
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': list(range(1, 50, 5))
        }],
        [{
            #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
            #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
            #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [1, 2, 3, 4, 5],  #default=1.0
            'gamma': grid_ratio,  #default: auto
            'decision_function_shape': ['ovo', 'ovr'],  #default: ovr
            'probability': [True],
            'random_state': grid_seed
        }],
        [{
            #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': grid_learn,  #default: .3
            'max_depth': [1, 2, 4, 6, 8, 10],  #default 2
            'n_estimators': grid_n_estimator,
            'seed': grid_seed
        }]
    ]

    start_total = time.perf_counter()  #https://docs.python.org/3/library/time.html#time.perf_counter
    for clf, param in zip(vote_est, grid_param):  #https://docs.python.org/3/library/functions.html#zip
        #print(clf[1])  #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
        #print(param)
        start = time.perf_counter()
        best_search = model_selection.GridSearchCV(estimator=clf[1],
                                                   param_grid=param,
                                                   cv=cv_split,
                                                   scoring='roc_auc',
                                                   n_jobs=-1)
        best_search.fit(X, y)
        run = time.perf_counter() - start

        best_param = best_search.best_params_
        print('The best {} parameter for {} is {} with a runtime of {:.2f} seconds.'
              .format(best_search.best_score_, clf[1].__class__.__name__, best_param, run))
        clf[1].set_params(**best_param)

    run_total = time.perf_counter() - start_total
    print('Total optimization time was {:.2f} minutes.'.format(run_total / 60))
    print('-' * 10)

    #%% [markdown]
    # # Submission

    #%%
    #Hard Vote or majority rules w/Tuned Hyperparameters
    grid_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    grid_hard_cv = model_selection.cross_validate(grid_hard, X, y, cv=cv_split)
    grid_hard.fit(X, y)

    print("Hard Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}"
          .format(grid_hard_cv['train_score'].mean() * 100))
    print("Hard Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}"
          .format(grid_hard_cv['test_score'].mean() * 100))
    print("Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}"
          .format(grid_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    #Soft Vote or weighted probabilities w/Tuned Hyperparameters
    grid_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    grid_soft_cv = model_selection.cross_validate(grid_soft, X, y, cv=cv_split)
    grid_soft.fit(X, y)

    print("Soft Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}"
          .format(grid_soft_cv['train_score'].mean() * 100))
    print("Soft Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}"
          .format(grid_soft_cv['test_score'].mean() * 100))
    print("Soft Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}"
          .format(grid_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)