import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from model import DataManager, Classifier, PlotGenerator

# How many repetitions each Classifier wrapper uses when computing its
# performance indicators.
REPETITION = 50

# Load the diabetes dataset: feature rows ("probes") and target labels.
probes, result = DataManager.load_data('data/diabetes.arff')
x = np.array(probes)
y = np.array(result)

# Wrap each estimator in the project's Classifier helper.
bayes = Classifier(MultinomialNB(), REPETITION, x, y)
logistic_regression = Classifier(LogisticRegression(), REPETITION, x, y)
kneighbours_classifier = Classifier(KNeighborsClassifier(10), REPETITION, x, y)
mlp_classifier = Classifier(MLPClassifier(), REPETITION, x, y)

for wrapped in (bayes, logistic_regression, kneighbours_classifier,
                mlp_classifier):
    wrapped.calculate_indicators()

# NOTE(review): mlp_classifier is left out of this list, exactly as in the
# original code -- confirm whether that omission is intentional.
classifiers_array = [bayes, logistic_regression, kneighbours_classifier]
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
# BUGFIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score lives in sklearn.model_selection.  Imports are also
# hoisted out of the loop so they run once, not per alpha.
from sklearn.model_selection import cross_val_score

# Sweep the LDA document-topic Dirichlet prior (alpha) and, for each value,
# evaluate a MultinomialNB classifier on the induced topic distributions.
# (dim, tf_train, tf_test, trainY, testY are defined earlier in the file.)
alphaList = [0.000001, 0.01, 0.1, 1, 5, 10, 20, 1000]
for alpha in alphaList:
    print('ALPHA: {}'.format(alpha))

    lda = LatentDirichletAllocation(n_components=dim, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    doc_topic_prior=alpha,
                                    random_state=0)
    lda.fit(tf_train)
    trainTopicDistArr = lda.transform(tf_train)
    testTopicDistArr = lda.transform(tf_test)

    #%% 5. Classification using Naive Bayes
    # Train Model
    nb_LDA = MultinomialNB().fit(trainTopicDistArr, trainY)

    # Print Training Accuracy (3-fold CV on the topic features)
    print("(CV, LDA): ",
          cross_val_score(MultinomialNB(), trainTopicDistArr, trainY,
                          cv=3, scoring="accuracy").mean())
    # Print Test Accuracy
    print('(TE, LDA): ' + str(nb_LDA.score(testTopicDistArr, testY)))
# Label-encode each categorical column (loop variable `i` and encoder
# `number` are defined above this view).
salary_train[i] = number.fit_transform(salary_train[i])
salary_test[i] = number.fit_transform(salary_test[i])

colnames = salary_train.columns
len(colnames[0:13])  # no-op: the result is neither stored nor printed

# First 13 columns are features; the 14th column is the salary label.
trainX = salary_train[colnames[0:13]]
trainY = salary_train[colnames[13]]
testX = salary_test[colnames[0:13]]
testY = salary_test[colnames[13]]

# NOTE(review): the classes are imported only under the aliases MB and GB,
# yet instantiated below via their original names -- GaussianNB() and
# MultinomialNB() raise NameError unless they were also imported earlier.
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

sgnb = GaussianNB()
smnb = MultinomialNB()
spred_gnb = sgnb.fit(trainX, trainY).predict(testX)
confusion_matrix(testY, spred_gnb)  # result displayed only in a REPL/notebook
# NOTE(review): accuracy is hard-coded from a previously observed confusion
# matrix rather than computed from spred_gnb.
print("Accuracy", (10759 + 1209) / (10759 + 601 + 2491 + 1209))  # 80%

spred_mnb = smnb.fit(trainX, trainY).predict(testX)
confusion_matrix(testY, spred_mnb)
# NOTE(review): 780 appears twice in the denominator and once in the
# numerator -- the hard-coded formula looks mistyped; verify.
print("Accuracy", (10891 + 780) / (10891 + 780 + 2920 + 780))  # 75%

# Stratified Method
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

metric_names = [
    'f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall'
]
# (this statement continues beyond the visible chunk)
scores_df = pd.DataFrame(index=metric_names, columns=['Random-CV',
            # (tail of extract_features -- the enclosing def and outer loops
            # are above this view; indentation reconstructed, TODO confirm)
            # Linear scan of the dictionary for the current word's column id.
            for i, d in enumerate(dictionary):
                if d[0] == word:
                    wordID = i
                    features_matrix[docID, wordID] = words.count(word)
        docID = docID + 1
    return features_matrix


# NOTE(review): inside an r'' raw string the doubled backslashes stay
# doubled, so this path literally contains '\\' separators; Windows tolerates
# it, but r'C:\Users\...' or forward slashes would be cleaner.
train_dir = r'C:\\Users\\USERONE\\Desktop\\ling-spam\\train-mails'
dictionary = make_Dictionary(train_dir)
print (dictionary)

train_labels = np.zeros(702)
# NOTE(review): slice 351:701 marks indices 351..700 as spam -- the last
# mail (index 701) stays labelled 0; confirm whether 351:702 was intended.
train_labels[351:701] = 1  #spam emails
train_matrix = extract_features(train_dir)
print(train_matrix[1])

# Training SVM and Naive bayes classifier
NB_model = MultinomialNB()
#GNB_model = GaussianNB()
#BNB_model = BernoulliNB()
LinearSVM_model = LinearSVC()
SVM_model = SVC()
NB_model.fit(train_matrix,train_labels)
#GNB_model.fit(train_matrix,train_labels)
#BNB_model.fit(train_matrix,train_labels)
LinearSVM_model.fit(train_matrix,train_labels)
SVM_model.fit(train_matrix,train_labels)

#use GridSearch to better choose tuning parameters C and gamma
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
grid = GridSearchCV(SVC(), param_grid, verbose=3)
grid.fit(train_matrix,train_labels)
def predictEmotion(data, newRec):
    """Train one binary MultinomialNB pipeline per emotion and predict all
    ten emotions for a new record.

    Fits a CountVectorizer on the concatenated headline+summary text of
    `data`, then for each of the ten 'emotion_i' columns grid-searches a
    TF-IDF + MultinomialNB pipeline (with random oversampling of the
    minority class) and predicts that emotion for `newRec`.

    :param data: raw training data, converted by prepData() to a DataFrame
        with 'headline', 'summary' and 'emotion_0'..'emotion_9' columns
    :param newRec: the new record, preprocessed by prepNewRec()
    :return: dict mapping 'emotion_0'..'emotion_9' to the predicted label
    """
    res = {}
    perf_score = {}
    df = prepData(data)
    text_hl_sum = df['headline'] + ' ' + df['summary']
    processedRec = prepNewRec(newRec)

    # Create transformers.
    vectorizer = CountVectorizer()
    encoder = LabelEncoder()

    # Tokenize, build vocabulary_, and encode both the training documents
    # and the new record in that shared vocabulary.
    vectorizer.fit(text_hl_sum)
    X = vectorizer.transform(text_hl_sum)
    vect_new = vectorizer.transform(processedRec)

    for i in range(10):
        emotion = 'emotion_' + str(i)
        y = df[emotion]

        # Split into train and test sets.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Resample the training set to boost the minority class.
        # NOTE(review): fit_sample() is the pre-0.4 imbalanced-learn API;
        # newer releases call it fit_resample() -- confirm the pinned version.
        #resample = SMOTE()
        resample = RandomOverSampler()
        X_train_new, y_train_new = resample.fit_sample(X_train, y_train)

        clf_pipe = Pipeline([('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB())])
        tuned_parameters = {
            'tfidf__norm': ('l1', 'l2'),
            'tfidf__use_idf': (False, True),
            'mnb__alpha': [1, 0.1, 0.01]
        }

        # NOTE(review): `score` is not defined in this function -- it must
        # come from module scope; verify it is a valid scoring string/callable.
        clf = GridSearchCV(clf_pipe, tuned_parameters, cv=10, scoring=score)
        # BUGFIX: np.errstate is a context manager; calling it bare (as the
        # original did) suppresses nothing.  Wrap the fit so divide warnings
        # are actually silenced.
        with np.errstate(divide='ignore'):
            clf.fit(X_train_new, y_train_new)
        perf_score[emotion] = clf.best_score_

        print()
        print('~~~~~~~~~~~~~~~~~ %s ~~~~~~~~~~~~~~~' % emotion)
        print()
        print('Best score: %0.4f with parameters %r' %
              (clf.best_score_, clf.best_params_))
        print()
        print('Detailed model performance score with parameters to correctly predict the results:')
        for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                                     clf.cv_results_['std_test_score'],
                                     clf.cv_results_['params']):
            print('%0.4f +/-%0.04f with parameters %r' % (mean, std * 2, params))
        print()
        print("Detailed classification report (scores were computed on evaluation data set):")
        print()
        print(classification_report(y_test, clf.predict(X_test), digits=4))
        print()

        ####### predict the emotion for new headline and summary
        pred = clf.predict(vect_new)
        # BUGFIX: the original's split "#print" left "(emotion, pred)" as a
        # live no-op tuple expression; restored as a comment.
        # print(emotion, pred)
        res[emotion] = int(pred[0])
    return res
# In[7]: # TF-IDF from sklearn.feature_extraction.text import TfidfTransformer tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) X_train_tfidf.shape print("\n\nTarget: >> ", twenty_train.target) # In[9]: # Machine Learning # Training Naive Bayes (NB) classifier on training data. from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target) # In[14]: # Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows: # The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later. # We will be using the 'text_clf' going forward. from sklearn.pipeline import Pipeline text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]) text_clf = text_clf.fit(twenty_train.data, twenty_train.target) # In[15]:
print('Dummy classifier, который будет всем новым наблюдениям присваивать класс ham, получит 75% precission и 87 - recall, 80 - f-score') # print('\nNaive Bayes 1') # naive_model = MultinomialNB() # naive_model.fit(bowed_messages, messages['label']) # # print(len(msg_train), len(msg_test)) # cv_results = cross_val_score(naive_model, bowed_messages, messages['label'], cv=10, scoring='accuracy') # print(cv_results.mean(), cv_results.std()) # print(classification_report(messages['label'], naive_model.predict(bowed_messages))) msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size=0.2) # поделить выборку в соотновении 80:20 # Первая токенизация, Байес print('\n1) Naive Bayes, tokenize 1') pipeline, label_predicted = do_smth_with_model(steps=[('bow', CountVectorizer(analyzer=tokenize)), ('classifier', MultinomialNB())]) draw_learning_curve(pipeline) draw_roc_curve(label_predicted) print('Судя по roc-curve, классификатор показывает высокие результаты, AUC-value очень высокий, roc-curve почти параллельна оси х') print('Learning curve показывает, что при увеличении обучающих данных, cross-validation score может незначительно ' 'улучшиться, training score при этом останется статичен') # Вторая токенизация, Байес print('\n1) Naive Bayes tokenize 2') do_smth_with_model(steps=[('bow', CountVectorizer(analyzer=tokenize2)), ('classifier', MultinomialNB())]) # Первая токенизация, Байес, удаляем стоп слова print('\n3) Naive Bayes удаляем стоп слова')
dictionary = make_Dictionary(root_dir) # Prepare feature vectors per training mail and its labels features_matrix, labels = extract_features(root_dir) np.save('enron_features_matrix.npy', features_matrix) np.save('enron_labels.npy', labels) # train_matrix = np.load('enron_features_matrix.npy'); # labels = np.load('enron_labels.npy'); print(features_matrix.shape) print(labels.shape) print(sum(labels == 0), sum(labels == 1)) X_train, X_test, y_train, y_test = train_test_split(features_matrix, labels, test_size=0.40) ## Training models and its variants model1 = LinearSVC() model2 = MultinomialNB() model1.fit(X_train, y_train) model2.fit(X_train, y_train) result1 = model1.predict(X_test) result2 = model2.predict(X_test) print(confusion_matrix(y_test, result1)) print(confusion_matrix(y_test, result2))
# Thanks at http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html print('_' * 80) print("Training: ") print(model) t0 = time() model.fit(dTr['dffi'], dTr['target']) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = model.predict(dTe['dffi']) test_time = time() - t0 print("test time: %0.3fs" % test_time) score = metrics.accuracy_score(dTe['target'], pred) print("accuracy: %0.3f" % score) model_desc = str(model).split('(')[0] print("confusion matrix:") print(metrics.confusion_matrix(dTe['target'], pred)) return model_desc, score, train_time, test_time results = [] # Train sparse Naive Bayes classifiers print('=' * 80) print("Naive Bayes") results.append(benchmark(MultinomialNB(alpha=.01),dTest,dTrain)) results.append(benchmark(BernoulliNB(alpha=.01),dTest,dTrain))
count_v0 = CountVectorizer() counts_all = count_v0.fit_transform(all_text) count_v1 = CountVectorizer(vocabulary=count_v0.vocabulary_) counts_train = count_v1.fit_transform(train_texts) print "the shape of train is " + repr(counts_train.shape) count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_) counts_test = count_v2.fit_transform(test_texts) print "the shape of test is " + repr(counts_test.shape) tfidftransformer = TfidfTransformer() train_data = tfidftransformer.fit(counts_train).transform(counts_train) test_data = tfidftransformer.fit(counts_test).transform(counts_test) x_train = train_data y_train = train_labels x_test = test_data y_test = test_labels print '(3) Naive Bayes...' from sklearn.naive_bayes import MultinomialNB from sklearn import metrics clf = MultinomialNB(alpha=0.01) clf.fit(x_train, y_train) preds = clf.predict(x_test) num = 0 preds = preds.tolist() for i, pred in enumerate(preds): if int(pred) == int(y_test[i]): num += 1 print 'precision_score:' + str(float(num) / len(preds))
def main():
    """End-to-end Tobacco3482 text classification demo.

    Loads the per-label .txt documents, trains a MultinomialNB baseline on
    bag-of-words counts, then trains a Keras network (built by
    model_creation) on the same features, printing metrics for both.
    """
    # The file path where to find the data
    path_folder = "data/"

    # Opening metadata
    meta_data = pd.read_csv(path_folder + "Tobacco3482.csv")

    # Here I'm extracting the labels
    labels = np.unique(meta_data["label"])

    # Opening the data: one document per .txt file, grouped in per-label dirs.
    x = []
    y = []
    label_classes = {}  # int index -> label name, for readable reports
    i = 0
    for label in labels:
        path = path_folder + label + "/*.txt"
        print("Opening " + label + " data")
        files = glob.glob(path)
        for file in files:
            file_tmp = open(file, 'r')
            x.append(file_tmp.read())
            y.append(label)
            file_tmp.close()
        label_classes[i] = label
        i += 1
    print("Opened " + str(len(x)) + " documents, " +
          str(len(np.unique(y))) + " different classes")

    # Here I'm extracting the label
    # NOTE(review): duplicate of the extraction above; harmless but redundant.
    labels = np.unique(meta_data["label"])

    # Treating the labels
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Splitting the data into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

    # Transforming the data into token representation
    vectorizer = CountVectorizer()
    vectorizer.fit(x_train)
    x_train_counts = vectorizer.transform(x_train)
    x_test_counts = vectorizer.transform(x_test)

    # Bayesian part
    # Creation of the model
    clf = MultinomialNB()
    print("Training Bayesian for baseline")
    # Training
    clf.fit(x_train_counts, y_train)
    print("Printing results for Bayesian")
    # Printing of the results
    print("Accuracy score : ")
    print(clf.score(x_test_counts, y_test))
    y_pred = clf.predict(x_test_counts)
    print("Confusion matrix :")
    print(confusion_matrix(y_test, y_pred))
    print("Classification report :")
    print(classification_report(y_test, y_pred))
    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])

    # Neural Network part
    # creation of the callbacks to save the best model
    checkpointer = ModelCheckpoint(filepath="weights.hdf5", verbose=1, save_best_only=True)
    callbacks = [checkpointer]

    # Extracting the size of the data
    # NOTE(review): densifies the full sparse matrix just to read its width;
    # x_train_counts.shape[1] would avoid the memory spike.
    dimension_data = len(x_train_counts.toarray()[0])

    # Creation of the model (model_creation is defined elsewhere in the file)
    NN = model_creation(dimension_data)
    print("Training neural network, this may take while")
    # Training of the data
    NN.fit(x_train_counts.toarray(), to_categorical(y_train), epochs=10,
           validation_split=0.1, batch_size=128, callbacks=callbacks)
    # Loading the best model
    NN.load_weights('weights.hdf5')
    print("Printing neural network results")
    # Printing the results
    print("Accuracy score :")
    print(NN.evaluate(x_test_counts.toarray(), to_categorical(y_test))[1])
    print("Confusion matrix :")
    confusion_matrix_NN(NN, x_test_counts.toarray(), to_categorical(y_test))
    print("Classification report :")
    y_pred = NN.predict(np.array(x_test_counts.toarray()))
    y_test_class = np.argmax(to_categorical(y_test), axis=1)
    y_pred_class = np.argmax(y_pred, axis=1)
    print(classification_report(y_test_class, y_pred_class))
    print("Where classes are :")
    for label in label_classes:
        print(str(label) + " : " + label_classes[label])
    print(
        "The model is trained and the weights are saved at weights.hdf5, closing script"
    )
print(dane['text'].head().apply(process_text))

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words matrix of the text column, tokenised by the custom
# process_text analyzer.
messages_bow = CountVectorizer(analyzer=process_text).fit_transform(
    dane['text'])
messages_bow.shape
print(messages_bow.shape)

# target variable
y = df1['Class'].values  #target
X = df1.drop(['Class'], axis=1).values  #features
# NOTE(review): messages_bow is never used below, and X_train/y_train are
# not defined in this fragment -- presumably a split of X/y made elsewhere;
# confirm against the surrounding file.

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
(classifier.fit(X_train, y_train))
print(classifier.predict(X_train))
print(y_train.values)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# NOTE(review): every metric below is computed on the TRAINING split, so the
# printed scores overstate real performance.
pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print('Confusion Matrix: \n', confusion_matrix(y_train, pred))
print()
print('Accuracy: ', accuracy_score(y_train, pred))
print('Predicted value: ', classifier.predict(X_test))
print('Actual value: ', y_test.values)
                                shuffle=True)  # (tail of a dataset-loading call begun above this view)

print(len(twenty_train.data))
print(len(twenty_test.data))
print(twenty_train.target_names)
print("\n".join(twenty_train.data[0].split("\n")))
print(twenty_train.target[0])

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Raw term counts for the training documents.
X_train_tf = count_vect.fit_transform(twenty_train.data)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape  # notebook-style inspection; no effect as a script

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)

# Encode the test set with the transformers fitted on the training data.
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(twenty_test.target, predicted))
print(
    classification_report(twenty_test.target,
                          predicted,
                          target_names=twenty_test.target_names))
print("confusion matrix is \n",
      confusion_matrix(twenty_test.target, predicted))
""" Downloading 20news dataset. This may take a few minutes. Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB) 2257
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # cv larger than the number of samples must be rejected
    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration -- run both dense and sparse inputs
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                # swapping the labels flips the positive-class probabilities
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

    # check that calibration can also deal with regressors that have
    # a decision_function
    clf_base_regressor = CalibratedClassifierCV(Ridge())
    clf_base_regressor.fit(X_train, y_train)
    clf_base_regressor.predict(X_test)

    # Check failure cases:
    # only "isotonic" and "sigmoid" should be accepted as methods
    clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
    assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

    # base-estimators should provide either decision_function or
    # predict_proba (most regressors, for instance, should fail)
    clf_base_regressor = \
        CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
    assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
    # (tail of Cleaning() -- its def and the spaCy doc construction are
    # above this view)
    final_text = " ".join([token.lemma_ for token in doc])
    print(final_text)
    return final_text

# # data["modified_sentence"]=data["question"].apply(Cleaning)
# print (data["modified_sentence"])


def generate_answer(predict_class):
    # Return a random canned answer for the predicted intent class.
    ans = random.choice(answer_dictionary[predict_class])
    return ans


from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# Naive Bayes intent classifier on the vectorised training questions.
clf2 = MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True).fit(X1_train, Y_train)
# Vectorise the cleaned incoming question with the (externally defined)
# feature model, then predict its class.
P = model.transform([Cleaning(question)])
predict2 = clf2.predict(P)
print (predict2)

# NOTE(review): accuracy_score is used here but its import only appears
# below -- this runs only if it was already imported earlier in the file.
y_predict = clf2.predict(X1_test)
print(accuracy_score(Y_test, y_predict) * 100)

# MLP MultiLevel Perception
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

clf6 = MLPClassifier(activation='relu', alpha=0.0019, hidden_layer_sizes=(300,),
                     learning_rate='constant', power_t=1.5,
                     solver='adam', random_state=15)
clf6.fit(X1_train, Y_train)
# sys.exit(0) # print len(my_data), len(better_result) def test(text): print text return text # my_data = my_data[:len(my_data)-1] vect = CountVectorizer(charset_error='ignore', preprocessor=test) text_clf = Pipeline([ ('vect', vect), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ]) print "Dataset length: %s " % len(my_data) print("Training...") my_clf = text_clf.fit(my_data[:, 4], my_data[:, 3]) print "# Features: %s" % len(vect.vocabulary_) print("Done! \nClassifying test set...") predicted = my_clf.predict(my_test_data[:, 4]) print(np.mean(predicted == my_test_data[:, 3])) print "Accuracy: %.2f" % my_clf.score(my_data[:, 4], my_data[:, 3]) print "Accuracy: %.2f" % my_clf.score(predicted, my_test_data[:, 3])
train_data = train_data.drop(cols_to_drop, axis=1) #tussentijds resultaat print(train_data.head()) #hier zetten wij alles om naar lower character. en halen wij alle gekke tekens eruit train_data['text'] = train_data['text'].str.lower() train_data['text'] = train_data['text'].apply(lambda elem: re.sub( r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem)) train_data['text'] = train_data['text'].apply( lambda elem: re.sub(r"\d+", "", elem)) #tussentijds resultaat print(train_data.head()) #hier stellen wij de variabelen op om straks de tekst naar vectors om te zetten count_vectorizer = feature_extraction.text.CountVectorizer() #hier wordt onze train data omgezet naar vectors train_vectors = count_vectorizer.fit_transform(train_data["text"]) #voorbeeld van de vectors print(train_vectors) #hier wordt onze test data omgezet naar vectors test_vectors = count_vectorizer.transform(test_df["text"]) clf = MultinomialNB(alpha=1, fit_prior=True, class_prior=None) scores = model_selection.cross_val_score(clf, train_vectors, train_data["target"], cv=3, scoring="f1") print(scores)
def mod_knn_class(y_endogenous, x_exogenous, train_ratio=0.7, folds=5):
    """Stratified K-fold evaluation of a K-Nearest-Neighbours classifier.

    :param y_endogenous: target series (labels); NaNs are dropped in place
    :param x_exogenous: feature DataFrame; NaNs are dropped in place
    :param train_ratio: declared train share -- currently unused, the split
        comes entirely from the stratified K-fold CV below
    :param folds: number of stratified CV folds
    :return: a Models() result object holding per-fold metric DataFrames
    """
    random_state = 123

    """Drop NaN"""
    # NOTE(review): dropping NaNs independently on X and y can desynchronise
    # their row indices if the NaN rows differ -- verify upstream alignment.
    y_endogenous.dropna(inplace=True)
    x_exogenous.dropna(inplace=True)

    """Transform data for LogReg fitting"""
    scaler = StandardScaler()
    std_data = scaler.fit_transform(x_exogenous.values)
    std_data = pd.DataFrame(std_data, index=x_exogenous.index,
                            columns=x_exogenous.columns)

    """Shuffle Data for IMBALANCES"""
    from sklearn.utils import shuffle
    X_shuf, Y_shuf = shuffle(std_data, y_endogenous)
    # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 and
    # np.float/np.int were removed in NumPy 1.24 -- this line only runs on
    # older pinned versions (.to_numpy() / float / int are the replacements).
    X_shuf = X_shuf.as_matrix().astype(np.float)
    Y_shuf = Y_shuf.as_matrix().astype(np.int)

    """K-fold CV"""
    cv = StratifiedKFold(n_splits=folds, shuffle=False)

    """Establish Models Settings"""
    # White-Box: GLM
    lasso = LogisticRegression(penalty='l1', C=0.1, random_state=random_state, solver='liblinear', n_jobs=1)
    ridge = LogisticRegression(penalty='l2', C=0.1, random_state=random_state, solver='liblinear', n_jobs=1)
    log = LogisticRegression(class_weight='balanced', C=0.1, random_state=random_state, solver='liblinear', n_jobs=1)
    svc = SVC(C=0.1, kernel='linear', cache_size=100, shrinking=True, decision_function_shape='ovo', probability=True)
    # Black-Box: Bagging
    rfc = RandomForestClassifier(random_state=random_state, bootstrap=True, max_depth=80, criterion='entropy', min_samples_leaf=3, min_samples_split=10, n_estimators=500, max_features=None)
    gbc = GradientBoostingClassifier(learning_rate=0.5, n_estimators=250, min_samples_split=200, max_depth=3)
    # Non-Linear
    nb = GaussianNB()
    gpc = GaussianProcessClassifier()
    mnb = MultinomialNB()
    bnb = BernoulliNB(binarize=True)
    knn = KNeighborsClassifier(n_neighbors=2)

    """Storage List Dictionary for Models"""
    # NOTE(review): only KNN is evaluated below; every other estimator
    # instantiated above is unused in this function.
    en_models = [{
        'label': 'K Neighbors Classifier',
        'model': knn,
        'dict_metrics': {},
    }]

    """Loop Models"""
    for m in en_models:
        MOD = m['model']
        print(m['label'])
        # AUC storage
        mean_tprs_y, mean_fpr_x = [], np.linspace(0, 1, 100)
        fprs_x, tprs_y, aucs = [], [], []
        # Other Metrics Storage: Evaluation Metrics Dictionary
        dict_metrics = {
            'fold_no': [],  # 1
            'acc_score': [],  # 2
            'jaccard_ind': [],  # 3
            'conf_matrix': [],  # 4
            'f1_score': [],  # 5
            'log_loss': [],  # 6
            'feat_coef': [],  # 7
            'feat_names': [],  # 8
            'fprs': [],
            'tprs': []
        }
        # Train / Test Split
        i = 1
        # Start Loop
        for train_ind, test_ind in cv.split(X_shuf, Y_shuf):
            # Train Test Split
            X_train, X_test = X_shuf[train_ind], X_shuf[test_ind]
            y_train, y_test = Y_shuf[train_ind], Y_shuf[test_ind]
            # Fit Model
            MOD.fit(X_train, y_train)

            # ROC Curve (positive-class probabilities)
            fpr, tpr, thresholds = roc_curve(
                y_test, MOD.predict_proba(X_test).T[1])
            roc_auc = auc(fpr, tpr)
            fprs_x.append(fpr)
            tprs_y.append(tpr)
            mean_tprs_y.append(interp(mean_fpr_x, fpr, tpr))
            aucs.append(roc_auc)

            # Fold Number
            fold_no = i
            # Accuracy Score
            y_pred = MOD.predict(X_test)
            acc_score = metrics.accuracy_score(y_test, y_pred)
            # Jaccard Index
            # NOTE(review): jaccard_similarity_score was removed in
            # scikit-learn 0.23 (jaccard_score is the successor).
            j_index = jaccard_similarity_score(y_true=y_test, y_pred=y_pred)
            j_index_rnd = round(j_index, 2)
            # Confusion Matricsw
            cm = confusion_matrix(y_test, y_pred)
            # F1 Score
            f1 = f1_score(y_test, y_pred)
            # Log Loss
            # NOTE(review): log_loss is fed hard 0/1 predictions, not
            # probabilities, which saturates the metric.
            lg_loss = log_loss(y_test, y_pred)

            # Feature Importance (per-model; KNN falls into the generic
            # coef_ branch and lands in the except)
            try:
                if m['label'] == 'Random Forest Classifier':
                    feature_imp = pd.Series(
                        rfc.feature_importances_,
                        index=x_exogenous.columns).sort_values(
                            ascending=False)
                    feature_coef = pd.Series(
                        feature_imp, index=x_exogenous.columns).sort_values(
                            ascending=False)
                    dict_metrics['feat_coef'].append(feature_coef.values)
                    dict_metrics['feat_names'].append(feature_coef.index)
                elif m['label'] == 'Gradient Boost Classifier':
                    feature_imp = pd.Series(
                        gbc.feature_importances_,
                        index=x_exogenous.columns).sort_values(
                            ascending=False)
                    feature_coef = pd.Series(
                        feature_imp, index=x_exogenous.columns).sort_values(
                            ascending=False)
                    dict_metrics['feat_coef'].append(feature_coef.values)
                    dict_metrics['feat_names'].append(feature_coef.index)
                elif m['label'] == 'none':
                    pass
                else:
                    # Feature Coefficients
                    coefficients = MOD.coef_[0]
                    feature_coef = pd.Series(
                        coefficients,
                        index=x_exogenous.columns).sort_values(
                            ascending=False)
                    dict_metrics['feat_coef'].append(feature_coef.values)
                    dict_metrics['feat_names'].append(feature_coef.index)
            except Exception:  # (Valueerror, Attribute Error)
                pass

            # Store Metrics
            dict_metrics['fold_no'].append(fold_no)
            dict_metrics['acc_score'].append(acc_score)
            dict_metrics['jaccard_ind'].append(j_index_rnd)
            dict_metrics['conf_matrix'].append(cm)
            dict_metrics['f1_score'].append(f1)
            dict_metrics['log_loss'].append(lg_loss)
            dict_metrics['fprs'].append(fpr)
            dict_metrics['tprs'].append(tpr)

            # NOTE(review): hard-coded user Desktop path, overwritten on
            # every fold -- only the last fold's curves survive on disk.
            np.savetxt('/Users/Derrick-Vlad-/Desktop/' + 'FPR_KNN.csv', fpr, delimiter=",")
            np.savetxt('/Users/Derrick-Vlad-/Desktop/' + 'TPR_KNN.csv', tpr, delimiter=",")

            # Next Loop Indexer
            i = i + 1
        # Store All Metrics
        m['dict_metrics'] = dict_metrics

    """End??????"""
    # Collect per-model metric lists back out of en_models.
    labels = [i['label'] for i in en_models if 'label' in i]
    eva_all = [i['dict_metrics'] for i in en_models if 'dict_metrics' in i]
    accuracy = [i['acc_score'] for i in eva_all if 'acc_score' in i]
    f1 = [i['f1_score'] for i in eva_all if 'f1_score' in i]
    fprss = [i['fprs'] for i in eva_all if 'fprs' in i]
    tprss = [i['tprs'] for i in eva_all if 'tprs' in i]
    logL = [i['log_loss'] for i in eva_all if 'log_loss' in i]
    confmatrix = [i['conf_matrix'] for i in eva_all if 'conf_matrix' in i]

    # Prepare Data-frame
    # ACCURACY
    acc = np.vstack(accuracy)
    acc = np.transpose(acc)
    df1 = pd.DataFrame(acc, columns=labels)
    # F1 Score
    f1 = np.vstack(f1)
    f1 = np.transpose(f1)
    df2 = pd.DataFrame(f1, columns=labels)
    # FALSE POSITIVE RATES (disabled: per-fold fpr arrays have ragged lengths)
    #fprs = np.vstack(fprss)  # [:, 0] OR [:, None]
    #fprs = np.transpose(fprs)
    #df3 = pd.DataFrame(fprs, columns=labels)
    print(fprss)
    # TRUE POSITIVE RATES
    #tprs = np.vstack(tprss)
    #tprs = np.transpose(tprs)
    #df4 = pd.DataFrame(tprs, columns=labels)
    print(tprss)
    # LOG LOSS SCORE
    logloss = np.vstack(logL)
    logloss = np.transpose(logloss)
    df5 = pd.DataFrame(logloss, columns=labels)
    # CONFUSION MATRIX
    # confmat = np.vstack(confmatrix)
    # confmat = np.transpose(confmat)
    # df6 = pd.DataFrame(confmat, columns=labels)
    print(confmatrix)

    results = Models()
    results.acc_score = df1
    results.f1_score = df2
    #results.fprs = df3
    #results.tprs = df4
    results.logloss = df5
    # results.confmat = df6
    return results
Informa atributos de bichos desconhecidos """
# (the opening of the triple-quoted section header above is outside this view)

print(atributos_test.head(5))

# Which animals have the attributes above?
print(resultados_test.head(5))

""" 3. Fazer previsões """
# Make a prediction for one hand-written attribute vector.
atributos_previsao = [0, 0, 0]
dados_previsao = [atributos_previsao]

# Create and train the model.
modelo = MultinomialNB()
modelo.fit(atributos_train, resultados_train)

# NOTE(review): the prediction and accuracy prints are commented out, and
# the accuracy_score call below is missing its first argument (the true
# labels) -- it would be a SyntaxError if re-enabled as-is.
#resultado_previsao = modelo.predict(dados_previsao)
#print("Bicho previsto:")
#print(resultado_previsao)
#print("Acurácia de " + str(accuracy_score(, resultado_previsao) * 100) + "%")

""" A variável data representa um objeto Python que funciona como um dicionário. As chaves importantes do dicionário a considerar são: - os nomes dos rótulos de classificação (target_names) - os rótulos reais (target)
class TestModelTypeChecking(object): """ Test model type checking utilities """ ##//////////////////////////////////////////////////////////////////// ## is_estimator testing ##//////////////////////////////////////////////////////////////////// def test_estimator_alias(self): """ Assert isestimator aliases is_estimator """ assert isestimator is is_estimator @pytest.mark.parametrize("model", ESTIMATORS, ids=obj_name) def test_is_estimator(self, model): """ Test that is_estimator works for instances and classes """ assert inspect.isclass(model) assert is_estimator(model) obj = model() assert is_estimator(obj) @pytest.mark.parametrize("cls", [ list, dict, tuple, set, str, bool, int, float ], ids=obj_name) def test_not_is_estimator(self, cls): """ Assert Python objects are not estimators """ assert inspect.isclass(cls) assert not is_estimator(cls) obj = cls() assert not is_estimator(obj) def test_is_estimator_pipeline(self): """ Test that is_estimator works for pipelines """ assert is_estimator(Pipeline) assert is_estimator(FeatureUnion) model = Pipeline([ ('reduce_dim', PCA()), ('linreg', LinearRegression()) ]) assert is_estimator(model) def test_is_estimator_search(self): """ Test that is_estimator works for search """ assert is_estimator(GridSearchCV) assert is_estimator(RandomizedSearchCV) model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']}) assert is_estimator(model) @pytest.mark.parametrize("viz,params", [ (Visualizer, {}), (ScoreVisualizer, {'model': LinearRegression()}), (ModelVisualizer, {'model': LogisticRegression()}) ], ids=lambda i: obj_name(i[0])) def test_is_estimator_visualizer(self, viz, params): """ Test that is_estimator works for Visualizers """ assert inspect.isclass(viz) assert is_estimator(viz) obj = viz(**params) assert is_estimator(obj) ##//////////////////////////////////////////////////////////////////// ## is_regressor testing ##//////////////////////////////////////////////////////////////////// def test_regressor_alias(self): """ 
Assert isregressor aliases is_regressor """ assert isregressor is is_regressor @pytest.mark.parametrize("model", REGRESSORS, ids=obj_name) def test_is_regressor(self, model): """ Test that is_regressor works for instances and classes """ assert inspect.isclass(model) assert is_regressor(model) obj = model() assert is_regressor(obj) @pytest.mark.parametrize("model", CLASSIFIERS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name) def test_not_is_regressor(self, model): """ Test that is_regressor does not match non-regressor estimators """ assert inspect.isclass(model) assert not is_regressor(model) obj = model() assert not is_regressor(obj) def test_is_regressor_pipeline(self): """ Test that is_regressor works for pipelines """ assert not is_regressor(Pipeline) assert not is_regressor(FeatureUnion) model = Pipeline([ ('reduce_dim', PCA()), ('linreg', LinearRegression()) ]) assert is_regressor(model) @pytest.mark.xfail(reason="grid search has no _estimator_type it seems") def test_is_regressor_search(self): """ Test that is_regressor works for search """ assert is_regressor(GridSearchCV) assert is_regressor(RandomizedSearchCV) model = GridSearchCV(SVR(), {'kernel': ['linear', 'rbf']}) assert is_regressor(model) @pytest.mark.parametrize("viz,params", [ (ScoreVisualizer, {'model': LinearRegression()}), (ModelVisualizer, {'model': Ridge()}) ], ids=lambda i: obj_name(i[0])) def test_is_regressor_visualizer(self, viz, params): """ Test that is_regressor works on visualizers """ assert inspect.isclass(viz) assert not is_regressor(viz) obj = viz(**params) assert is_regressor(obj) ##//////////////////////////////////////////////////////////////////// ## is_classifier testing ##//////////////////////////////////////////////////////////////////// def test_classifier_alias(self): """ Assert isclassifier aliases is_classifier """ assert isclassifier is is_classifier @pytest.mark.parametrize("model", CLASSIFIERS, ids=obj_name) def test_is_classifier(self, model): """ Test that 
is_classifier works for instances and classes """ assert inspect.isclass(model) assert is_classifier(model) obj = model() assert is_classifier(obj) @pytest.mark.parametrize("model", REGRESSORS+CLUSTERERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name) def test_not_is_classifier(self, model): """ Test that is_classifier does not match non-classifier estimators """ assert inspect.isclass(model) assert not is_classifier(model) obj = model() assert not is_classifier(obj) def test_classifier_pipeline(self): """ Test that is_classifier works for pipelines """ assert not is_classifier(Pipeline) assert not is_classifier(FeatureUnion) model = Pipeline([ ('reduce_dim', PCA()), ('linreg', LogisticRegression()) ]) assert is_classifier(model) @pytest.mark.xfail(reason="grid search has no _estimator_type it seems") def test_is_classifier_search(self): """ Test that is_classifier works for search """ assert is_classifier(GridSearchCV) assert is_classifier(RandomizedSearchCV) model = GridSearchCV(SVC(), {'kernel': ['linear', 'rbf']}) assert is_classifier(model) @pytest.mark.parametrize("viz,params", [ (ScoreVisualizer, {'model': MultinomialNB()}), (ModelVisualizer, {'model': MLPClassifier()}) ], ids=lambda i: obj_name(i[0])) def test_is_classifier_visualizer(self, viz, params): """ Test that is_classifier works on visualizers """ assert inspect.isclass(viz) assert not is_classifier(viz) obj = viz(**params) assert is_classifier(obj) ##//////////////////////////////////////////////////////////////////// ## is_clusterer testing ##//////////////////////////////////////////////////////////////////// def test_clusterer_alias(self): """ Assert isclusterer aliases is_clusterer """ assert isclusterer is is_clusterer @pytest.mark.parametrize("model", CLUSTERERS, ids=obj_name) def test_is_clusterer(self, model): """ Test that is_clusterer works for instances and classes """ assert inspect.isclass(model) assert is_clusterer(model) obj = model() assert is_clusterer(obj) 
    @pytest.mark.parametrize("model", REGRESSORS+CLASSIFIERS+TRANSFORMERS+DECOMPOSITIONS, ids=obj_name)
    def test_not_is_clusterer(self, model):
        """
        Test that is_clusterer does not match non-clusterer estimators
        """
        assert inspect.isclass(model)
        assert not is_clusterer(model)
        obj = model()
        assert not is_clusterer(obj)

    def test_clusterer_pipeline(self):
        """
        Test that is_clusterer works for pipelines
        """
        # The Pipeline/FeatureUnion *classes* themselves are not clusterers;
        # only an instance whose final step is a clusterer should match.
        assert not is_clusterer(Pipeline)
        assert not is_clusterer(FeatureUnion)
        model = Pipeline([
            ('reduce_dim', PCA()),
            ('kmeans', KMeans())
        ])
        assert is_clusterer(model)

    @pytest.mark.parametrize("viz,params", [
        (ModelVisualizer, {'model': KMeans()})
    ], ids=lambda i: obj_name(i[0]))
    def test_is_clusterer_visualizer(self, viz, params):
        """
        Test that is_clusterer works on visualizers
        """
        # Visualizer classes don't expose the wrapped estimator type, but a
        # constructed instance wrapping a clusterer should be detected as one.
        assert inspect.isclass(viz)
        assert not is_clusterer(viz)
        obj = viz(**params)
        assert is_clusterer(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_gridsearch testing
    ##////////////////////////////////////////////////////////////////////

    def test_gridsearch_alias(self):
        """
        Assert isgridsearch aliases is_gridsearch
        """
        assert isgridsearch is is_gridsearch

    @pytest.mark.parametrize("model", SEARCH, ids=obj_name)
    def test_is_gridsearch(self, model):
        """
        Test that is_gridsearch works correctly
        """
        assert inspect.isclass(model)
        assert is_gridsearch(model)
        obj = model(SVC, {"C": [0.5, 1, 10]})
        assert is_gridsearch(obj)

    # NOTE(review): Imputer suggests an older scikit-learn (< 0.22) — confirm
    # the pinned sklearn version before updating these fixtures.
    @pytest.mark.parametrize("model", [MLPRegressor, MLPClassifier, Imputer], ids=obj_name)
    def test_not_is_gridsearch(self, model):
        """
        Test that is_gridsearch does not match non grid searches
        """
        assert inspect.isclass(model)
        assert not is_gridsearch(model)
        obj = model()
        assert not is_gridsearch(obj)

    ##////////////////////////////////////////////////////////////////////
    ## is_probabilistic testing
    ##////////////////////////////////////////////////////////////////////

    def test_probabilistic_alias(self):
        """
        Assert isprobabilistic aliases is_probabilistic
        """
        assert isprobabilistic is is_probabilistic

    @pytest.mark.parametrize("model", [
        MultinomialNB, GaussianNB, LogisticRegression, SVC,
        RandomForestClassifier, GradientBoostingClassifier, MLPClassifier,
    ], ids=obj_name)
    def test_is_probabilistic(self, model):
        """
        Test that is_probabilistic works correctly
        """
        assert inspect.isclass(model)
        assert is_probabilistic(model)
        obj = model()
        assert is_probabilistic(obj)

    @pytest.mark.parametrize("model", [
        MLPRegressor, Imputer, StandardScaler, KMeans, RandomForestRegressor,
    ], ids=obj_name)
    def test_not_is_probabilistic(self, model):
        """
        Test that is_probabilistic does not match non-probabilistic estimators
        """
        assert inspect.isclass(model)
        assert not is_probabilistic(model)
        obj = model()
        assert not is_probabilistic(obj)
if word.lower() not in stopwords.words('english') ] return wordss data['text'].apply(process_text).head() data.head() #Splitting into training and testing data. Training data 70% x_train, x_test, y_train, y_test = train_test_split(data['text'], data['class'], test_size=0.3) #Creating the Model pipeline = Pipeline([ ('bow', CountVectorizer(analyzer=clean_text)), ('tfidf', TfidfTransformer()), ('classifier', MultinomialNB() ) # training on TF-IDF vectors with Naive Bayes classifier ]) #Training the model pipeline.fit(x_train, y_train) #Testing predictions = pipeline.predict(x_test) print(classification_report(y_test, predictions)) #Confusion Matrix sns.heatmap(confusion_matrix(y_test, predictions), annot=True)
bl1.update (ml6) ''' _trainfeatures, _trainlabels, _testfeatures, _testlabels = split(bf1, bl1) #(features, labels) = adapt (bf1, bl1) (trainfeatures, trainlabels) = adapt (_trainfeatures, _trainlabels) (testfeatures, testlabels) = adapt (_testfeatures, _testlabels) #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#GaussianProcessClassifier(), ExtraTreesClassifier(n_estimators=120), AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB()) #models = (ExtraTreesClassifier(n_estimators=128, random_state=0), AdaBoostClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB()) #models = (SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB()) #models = (RandomForestClassifier(n_estimators = 128, random_state=0), )#SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB()) models = (RandomForestClassifier(n_estimators = 128, random_state=0), SVC(kernel='rbf'), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), GaussianNB(), MultinomialNB(), BernoulliNB()) #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), 
KNeighborsClassifier(n_neighbors=5), MultinomialNB()) #models = (RandomForestClassifier(n_estimators = 120, random_state=0), )#ExtraTreesClassifier(n_estimators=120), )#GradientBoostingClassifier(n_estimators=120), BaggingClassifier (n_estimators=120), SVC(kernel='linear'), DecisionTreeClassifier(random_state=None), KNeighborsClassifier(n_neighbors=5), MultinomialNB()) #fsets = (FSET_FULL,FSET_NOICC, FSET_MIN, FSET_YYY_G, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G) #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YY, FSET_YYY): fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YYY) #fsets = (FSET_FULL, FSET_Y, FSET_YYY) #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G) #fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G) #fsets = (FSET_FULL, FSET_G, FSET_SEC, FSET_YYY, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
def modelTraining(X_train, X_test, y_train, y_test, f):
    """Train a suite of classifiers and report test-set accuracy for each.

    Every model is fitted on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)``; accuracy (as a percentage) is recorded in the
    returned dict and appended to the report file. A model that raises is
    logged and noted in the report, and the remaining models still run.

    Parameters
    ----------
    X_train, X_test :
        Feature matrices for the training and test splits.
    y_train, y_test :
        Corresponding label vectors.
    f :
        Writable file-like object the accuracy report is appended to.

    Returns
    -------
    dict
        Maps model display name -> accuracy percentage (0-100). Models
        that failed to train are absent from the dict.
    """
    models = {}

    def _evaluate(make_model, model_key, accr_label, err_label):
        # Fit one model and record its accuracy; on failure, log it and
        # continue with the remaining models (same best-effort behavior as
        # before). `except Exception:` instead of a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            y_pred = make_model().fit(X_train, y_train).predict(X_test)
            model_accr = metrics.accuracy_score(y_test, y_pred) * 100
            models[model_key] = model_accr
            f.writelines("\n Accuracy of " + accr_label + " is " +
                         str(model_accr))
        except Exception:
            logging.info(err_label + " is throwing exception")
            f.writelines("\n " + err_label + " is throwing exception")

    # (factory, dict key, label in the accuracy line, label on error).
    # The labels intentionally reproduce the original report strings, which
    # are not always identical to the dict keys (e.g. "Multinomial NB").
    _evaluate(lambda: LinearSVC(),
              "Linear Support Vector Classifier",
              "Linear Support Vector Classifier", "LSVC")
    _evaluate(lambda: KNeighborsClassifier(),
              "KNN Classifier", "KNN Classifier", "KNN")
    _evaluate(lambda: DecisionTreeClassifier(criterion="gini", random_state=0),
              "Decision Tree Classifier - GINI",
              "Decision Tree Classifier - GINI", "DTC GINI")
    _evaluate(lambda: DecisionTreeClassifier(criterion="entropy",
                                             random_state=0),
              "Decision Tree Classifier - ENTROPY",
              "Decision Tree Classifier - ENTROPY", "DTC ENTROPY")
    _evaluate(lambda: MultinomialNB(),
              "Multinomial Naive Bayes", "Multinomial NB", "Multinomial NB")
    _evaluate(lambda: BernoulliNB(),
              "Bernoulli Naive Bayes", "Bernoulli NB", "Bernoulli NB")
    _evaluate(lambda: GaussianNB(),
              "Gaussian Naive Bayes", "GaussianNB", "GaussianNB")
    _evaluate(lambda: AdaBoostClassifier(n_estimators=200, learning_rate=1),
              "AdaBoost Classifier", "AdaBoost Classifier",
              "AdaBoost Classifier")
    _evaluate(lambda: RandomForestClassifier(n_estimators=100),
              "Random Forest Classifier", "Random Forest Classifier",
              "Random Forest Classifier")

    return models
from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import classification_report df = pd.read_csv('spam_ham_dataset.csv', encoding="latin-1") #df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True) #df['label'] = df['v1'].map({'ham': 0, 'spam': 1}) X = df['text'] y = df['label_num'] cv = CountVectorizer() X = cv.fit_transform(X) # Fit the Data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) #Naive Bayes Classifier clf = MultinomialNB() clf.fit(X_train, y_train) clf.score(X_test, y_test) y_pred = clf.predict(X_test) #print(classification_report(y_test, y_pred)) #from sklearn.externals import joblib #joblib.dump(clf, 'NB_spam_model.pkl') #NB_spam_model = open('NB_spam_model.pkl','rb') #clf = joblib.load(NB_spam_model) app = Flask(__name__) @app.route("/") def home1():
def bayes_classifier(data_train, class_labels_train): print("Fitting the classifier...") classifier = MultinomialNB(alpha=0.01) classifier.fit(data_train, class_labels_train) print("Classifier fitted...") return classifier
from sklearn.externals import joblib import pickle import logging import numpy as np test_case = load_files('reuter2/training') count_vect = CountVectorizer(decode_error='ignore', strip_accents='unicode') X_train_counts = count_vect.fit_transform(test_case.data) X_train_counts.shape tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) X_train_tfidf.shape clf = MultinomialNB().fit(X_train_tfidf, test_case.target) docs_new = [ 'I like bees', 'Construction of a unique downtown highrise that would provide both living and working space to local artists is still at least a year away from starting, project organizers say.' ] X_new_counts = count_vect.transform(docs_new) X_new_tfidf = tfidf_transformer.transform(X_new_counts) predicted = clf.predict(X_new_tfidf) for doc, category in zip(docs_new, predicted): print('%r => %s' % (doc, test_case.target_names[category])) text_clf = Pipeline([ ('vect', CountVectorizer(decode_error='ignore')), ('tfidf', TfidfTransformer()),
testing_set = featuresets[:100] # posterior = prior occurences * likelihood / evidence classifier = nltk.NaiveBayesClassifier.train(training_set) # load #classifier_f = open('naivebayes.pickle', 'rb') #classifier = pickle.load(classifier_f) #classifier_f.close() print('Original Naive Bayes Algo accuracy:', (nltk.classify.accuracy(classifier, testing_set)) * 100) classifier.show_most_informative_features(15) MNB_classiflier = SklearnClassifier(MultinomialNB()) MNB_classiflier.train(training_set) print('MNB_classiflier Naive Bayes Algo accuracy:', (nltk.classify.accuracy(MNB_classiflier, testing_set)) * 100) #GaussianNB, BernoulliNB #GaussianNB_classiflier = SklearnClassifier(GaussianNB()) #GaussianNB_classiflier.train(training_set) #print('GaussianNB_classiflier Naive Bayes Algo accuracy:', (nltk.classify.accuracy(GaussianNB_classiflier, testing_set))*100) BernoulliNB_classiflier = SklearnClassifier(BernoulliNB()) BernoulliNB_classiflier.train(training_set) print('BernoulliNB_classiflier Naive Bayes Algo accuracy:', (nltk.classify.accuracy(BernoulliNB_classiflier, testing_set)) * 100) #LogisticRegression, SGDClassifier
print(model.score(xtest,ytest)) # print(f1_score(ytest,y_pred)) # print(precision_score(ytest,y_pred)) # print(recall_score(ytest,y_pred)) # In[142]: from sklearn.naive_bayes import MultinomialNB # In[143]: nv=MultinomialNB() model_nv = nv.fit(xtrain,ytrain) y_pred_nv=model_nv.predict(xtest) print(accuracy_score(ytest,y_pred_nv)) # In[144]: from sklearn.tree import DecisionTreeClassifier # In[145]: dt=DecisionTreeClassifier()
# Filter for JJ (adjectives) train_txt_filtered = [filter_tag(i, 'JJ') for i in train_txt_tag] test_txt_filtered = [filter_tag(i, 'JJ') for i in test_txt_tag] # Lemmatization wnl = WordNetLemmatizer() train_txt_lemma = [ lemmatize(words=words, lemmatizer=wnl.lemmatize) for words in train_txt_filtered ] test_txt_lemma = [ lemmatize(words=words, lemmatizer=wnl.lemmatize) for words in test_txt_filtered ] # Counts and NB model with scikit learn train_txt_sk = [' '.join(words) for words in train_txt_lemma] test_txt_sk = [' '.join(words) for words in test_txt_lemma] text_pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())]) text_pipeline.fit(X=train_txt_sk, y=train_y) pred = text_pipeline.predict(X=test_txt_sk) cfm = confusion_matrix(y_true=test_y, y_pred=pred) print(cfm)
def main(): ''' Notes for running: - written for python 2.7 -> change print statements if using 3 - required deps -> install scikit learn (google it) - edit filepaths ''' input_type = 'permissions' # good_path = '/home/josh/Documents/COSC/Research/APK_project/apk_repo/test_sets/large/v2/mal_badging_full_v2.txt' # mal_path = '/home/josh/Documents/COSC/Research/APK_project/apk_repo/test_sets/large/v2/benign_badging_full_v2.txt' results_dir = '/home/josh/Documents/COSC/Research/APK_project/DeepLearningResearch/Results/shallowResults/imbalanced-' good_path = "/home/noureldin/Desktop/workspace/freelancer/Olumerew/project1/DeepLearningResearch/Data/badging_med/ben_badging_med.txt" mal_path = "/home/noureldin/Desktop/workspace/freelancer/Olumerew/project1/DeepLearningResearch/Data/badging_med/mal_badging_med.txt" with open(good_path) as f: gdprm = f.readlines() with open(mal_path) as f: mlprm = f.readlines() features = gdprm + mlprm labels = np.array([]) for x in gdprm: labels = np.append(labels, 0) for x in mlprm: labels = np.append(labels, 1) token_pattern = None if input_type == 'hardware': #token_pattern = 'android\.hardware\.[^\']*' token_pattern = "(?:\w|\.)+(?:hardware).(?:\w|\.)+" elif input_type == 'permissions': #token_pattern = 'android\.permission\.[^\']*' #token_pattern = "(?<=name=\')[^(?:p)]*(?:permission)[^\']*" token_pattern = "(?:\w|\.)+(?:permission).(?:\w|\.)+" else: #token_pattern = 'android\.(?:hardware|permission)\.[^\']*' #token_pattern = "(?<=name=\')[^(?:p|h)]*(?:permission|hardware)[^\']*" token_pattern = "(?:\w|\.)+(?:permission|hardware).(?:\w|\.)+" print token_pattern #count_vect = CountVectorizer(input=u'content', analyzer=u'word', token_pattern=token_pattern) count_vect = CountVectorizer( analyzer=partial(regexp_tokenize, pattern=token_pattern)) time0 = timeit.default_timer() data_features = count_vect.fit_transform(features) time1 = timeit.default_timer() #time to tokenize print type(features) print data_features.get_shape() #for x in 
count_vect.get_feature_names(): # print x print 'tokenize time: ' + str(time1 - time0) print '\n' words = list( map(lambda feature: re.split(token_pattern, feature), features)) info = {'words': words, "labels": list(labels)} print('info=', json.dumps(info)) #proportion of data to test on vs total ratios = [.8, .6, .4, .2] columns = [ 'avg_acc', 'fpos_rate', 'fneg_rate', 'precision', 'recall', 'f1_score', 'avg_test_time', 'avg_train_time' ] indices = [.2, .4, .6, .8] print "BernoulliNB" bNBdf = pandas.DataFrame(columns=columns) print bNBdf for x in ratios: model_name = "BernoulliNB" BNclf = BernoulliNB() bNBdf = test_model(bNBdf, BNclf, data_features, labels, x) results_to_csv(bNBdf, model_name, results_dir, input_type) print '\n' print '---------------------------\n' print "MultiNomialNB" mnNBdf = pandas.DataFrame(columns=columns) #, index=indices) for x in ratios: model_name = "MultinomialNB" NBclf = MultinomialNB() mnNBdf = test_model(mnNBdf, NBclf, data_features, labels, x) results_to_csv(mnNBdf, model_name, results_dir, input_type) print '\n' print '---------------------------\n' print "DecisionTree" dtdf = pandas.DataFrame(columns=columns) #, index=indices) for x in ratios: model_name = "DecisionTree" DTclf = DecisionTreeClassifier() #min_samples_split = 20) dtdf = test_model(dtdf, DTclf, data_features, labels, x) results_to_csv(dtdf, model_name, results_dir, input_type) print '\n' print '---------------------------\n' print "LogisticRegression" lgdf = pandas.DataFrame(columns=columns) #, index=indices) for x in ratios: model_name = "Logistic_Regression" LRclf = LogisticRegression(C=10, solver='lbfgs') lgdf = test_model(lgdf, LRclf, data_features, labels, x) results_to_csv(lgdf, model_name, results_dir, input_type) print '\n' '''# alternative to shuffle_split