def test_integrated_plot_numpy_named_arrays(self):
    model = naive_bayes.MultinomialNB()

    X = np.array([
        (1.1, 9.52, 1.23, 0.86, 7.89, 0.13),
        (3.4, 2.84, 8.65, 0.45, 7.43, 0.16),
        (1.2, 3.22, 6.56, 0.24, 3.45, 0.17),
        (3.8, 6.18, 2.45, 0.28, 2.53, 0.13),
        (5.1, 9.12, 1.06, 0.19, 1.43, 0.13),
        (4.4, 8.84, 4.97, 0.98, 1.35, 0.13),
        (3.2, 3.22, 5.03, 0.68, 3.53, 0.32),
        (7.8, 2.18, 6.87, 0.35, 3.25, 0.38),
    ], dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8'),
              ('d', '<f8'), ('e', '<f8'), ('f', '<f8')])
    y = np.array([1, 1, 0, 1, 0, 0, 1, 0])

    visualizer = DecisionBoundariesVisualizer(model, features=['a', 'f'])
    visualizer.fit_draw_poof(X, y=y)
    # assertEqual is the non-deprecated spelling of assertEquals
    self.assertEqual(visualizer.features_, ['a', 'f'])
    self.assert_images_similar(visualizer)
def count_vector(train_x, valid_x, train_y, valid_y):
    count_vect = CountVectorizer(analyzer='word', lowercase=False,
                                 token_pattern=r'\w{1,}')
    count_vect.fit(trainDF['text'])

    xtrain_count = count_vect.transform(train_x)
    xvalid_count = count_vect.transform(valid_x)

    accuracy = train_model(naive_bayes.MultinomialNB(),
                           xtrain_count, train_y, xvalid_count)
    accuracy1 = train_model(linear_model.LogisticRegression(),
                            xtrain_count, train_y, xvalid_count)
    accuracy2 = train_model(svm.SVC(),
                            xtrain_count, train_y, xvalid_count)
    accuracy3 = train_model(ensemble.RandomForestClassifier(),
                            xtrain_count, train_y, xvalid_count)
    accuracy4 = train_model(xgboost.XGBClassifier(),
                            xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

    return accuracy, accuracy1, accuracy2, accuracy3, accuracy4
class NB_pipelined:
    p = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', naive_bayes.MultinomialNB()),
    ])
    gs_clf = None

    def fit(self, train, y):
        text_clf = self.p.fit(train, y)
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-2, 1e-3),
        }
        self.gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
        self.gs_clf = self.gs_clf.fit(train, y)
        return self.gs_clf

    def predict(self, x):
        return self.gs_clf.predict(x)

    def save(self, p):
        path = p if '.pk1' in p else p + '.pk1'
        with open(path, 'wb') as output:
            pickle.dump(self.gs_clf, output, pickle.HIGHEST_PROTOCOL)

    def load(self, p):
        path = p if '.pk1' in p else p + '.pk1'
        with open(path, 'rb') as infile:
            self.gs_clf = pickle.load(infile)
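# Usage sketch for NB_pipelined (added for illustration): the texts and labels below
# are made-up placeholders, not data from the original project. GridSearchCV's default
# 5-fold split needs at least five examples per class.
if __name__ == "__main__":
    docs = ["great product", "works perfectly", "love it", "excellent quality",
            "very happy", "terrible item", "broke after a day", "waste of money",
            "poor quality", "very disappointed"]
    labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

    clf = NB_pipelined()
    clf.fit(docs, labels)                 # grid-searches ngram_range, use_idf and alpha
    print(clf.predict(["works great"]))   # e.g. array([1])

    clf.save("nb_model")                  # writes nb_model.pk1
    restored = NB_pipelined()
    restored.load("nb_model")             # restores the fitted GridSearchCV object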
def test():
    train_x, test_x, train_y, test_y = load_data()
    alphas = np.logspace(-2, 5, num=200)
    train_scores = []
    test_scores = []
    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(train_x, train_y)
        train_scores.append(cls.score(train_x, train_y))
        test_scores.append(cls.score(test_x, test_y))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_scores)
    ax.plot(alphas, test_scores)
    ax.legend(['Training Score', 'Testing Score'], loc='lower right')
    ax.set_xlabel("alpha")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_title("MultinomialNB")
    ax.set_xscale("log")
    plt.show()
def tf_idf_word(train_x, valid_x, train_y, valid_y):
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                 max_features=5000)
    tfidf_vect.fit(trainDF['text'])
    xtrain_tfidf = tfidf_vect.transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)

    accuracy = train_model(naive_bayes.MultinomialNB(),
                           xtrain_tfidf, train_y, xvalid_tfidf)
    # print("NB, WordLevel TF-IDF: ", accuracy)
    accuracy1 = train_model(linear_model.LogisticRegression(),
                            xtrain_tfidf, train_y, xvalid_tfidf)
    # print("LR, WordLevel TF-IDF: ", accuracy1)
    accuracy2 = train_model(svm.SVC(),
                            xtrain_tfidf, train_y, xvalid_tfidf)
    # print("SVM, WordLevel TF-IDF: ", accuracy2)
    accuracy3 = train_model(ensemble.RandomForestClassifier(),
                            xtrain_tfidf, train_y, xvalid_tfidf)
    # print("RF, WordLevel TF-IDF: ", accuracy3)
    accuracy4 = train_model(xgboost.XGBClassifier(),
                            xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
    # print("Xgb, WordLevel TF-IDF: ", accuracy4)

    return accuracy, accuracy1, accuracy2, accuracy3, accuracy4
def model_builder():
    labels, texts = [], []

    # read the bad-word and good-word dataset files
    with open("Dataset//BadWords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("0")
            texts.append(abc)
    with open("Dataset//Goodwords.txt") as fp:
        data = fp.readlines()
        for abc in data:
            labels.append("1")
            texts.append(abc)

    trainDF = pandas.DataFrame()
    trainDF['text'] = texts
    trainDF['label'] = labels

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
        trainDF['text'], trainDF['label'])

    # label encode the target variable
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.fit_transform(valid_y)

    # character-level tf-idf (token_pattern is ignored when analyzer='char')
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                             token_pattern=r'\w{1,}',
                                             ngram_range=(2, 3),
                                             max_features=5000)
    tfidf_vect_ngram_chars.fit(trainDF['text'])
    xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
    xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

    accuracy = train_model(valid_y, naive_bayes.MultinomialNB(),
                           xtrain_tfidf_ngram_chars, train_y,
                           xvalid_tfidf_ngram_chars)
    return str(accuracy)
def __available_classifiers():
    available_clfs = dict()

    # features of all available classifiers
    Classifier = collections.namedtuple('Classifier', [
        'idf', 'full_name', 'function_call', 'scaling_possible',
        'predict_proba', 'numeric_labels'
    ])

    available_clfs["svm"] = Classifier("svm", "Support Vector Machine",
                                       svm.SVC(probability=True),
                                       True, True, False)
    available_clfs["svm_gs1"] = Classifier(
        "svm", "Co-best SVM according to Skll Grid Search",
        svm.SVC(probability=True, kernel="sigmoid", C=0.1, coef0=0.01, gamma=0.01),
        True, True, False)
    available_clfs["svm_gs2"] = Classifier(
        "svm", "Co-best SVM according to Skll Grid Search",
        svm.SVC(probability=True, kernel="sigmoid", C=0.01, coef0=0.01, gamma=0.0),
        True, True, False)
    available_clfs["mnb"] = Classifier(
        "mnb", "Multinomial Naive Bayes", naive_bayes.MultinomialNB(),
        False, True, False
    )  # MNB can't do default scaling: ValueError: Input X must be non-negative
    available_clfs["knn"] = Classifier("knn", "k Nearest Neighbour",
                                       neighbors.KNeighborsClassifier(),
                                       True, True, False)  # knn can do feature scaling
    available_clfs["raf"] = Classifier(
        "raf", "Random Forest",
        ensemble.RandomForestClassifier(n_estimators=15, max_depth=5, oob_score=True),
        True, True, False)

    return available_clfs
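# Hedged sketch (not from the original source) of consuming the registry returned
# above, assuming __available_classifiers() is a module-level function; X and y stand
# for an already-prepared feature matrix and label vector.
clfs = __available_classifiers()
for key, entry in clfs.items():
    print(key, "->", entry.full_name, "| scaling possible:", entry.scaling_possible)
    # entry.function_call holds the unfitted estimator, e.g.:
    # entry.function_call.fit(X, y)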
def run(fold):
    df = pd.read_csv("../inputs/IMDB_Dataset-folds.csv")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    count_vec.fit(df_train.review)

    x_train = count_vec.transform(df_train.review)
    x_valid = count_vec.transform(df_valid.review)

    model = naive_bayes.MultinomialNB()
    model.fit(x_train, df_train.sentiment.values)

    acc = model.score(x_valid, df_valid.sentiment.values)
    print(f"Fold: {fold}, Accuracy: {acc}")
def __init__(self, trainDF):
    super().__init__()
    prePro = PreProcessor()
    self.pf = PlotFunctions()
    self.trainDF = trainDF

    self.X_train, self.X_test, self.y_train, self.y_test = \
        prePro.split_train_test(trainDF['cleaned_sentence'],
                                trainDF['classification'], 0.4)
    self.X_test, self.X_cross, self.y_test, self.y_cross = \
        prePro.split_train_test(self.X_test, self.y_test, 0.5)

    self.all_scores = list()
    self.models = {
        'MultinomialNB': naive_bayes.MultinomialNB(alpha=0.767,
                                                   class_prior=None,
                                                   fit_prior=True),
        'ComplementNB': naive_bayes.ComplementNB(alpha=0.767,
                                                 class_prior=None,
                                                 fit_prior=True),
        'LogisticRegression': linear_model.LogisticRegression(solver='lbfgs')
    }
def train_model_write(input_dataset, train_model_path, payload_col_name, payload_label):
    # print('' + train_model_path)
    trainDF = load_cvs_dataset(input_dataset)
    txt_label = trainDF[payload_label]
    txt_text = trainDF[payload_col_name]

    model_input = count_vectorizer(txt_text, txt_label)
    naive = naive_bayes.MultinomialNB()
    accuracy = train_model(naive, model_input[0], model_input[1],
                           model_input[2], model_input[3])

    dirs = os.listdir(train_model_path)
    file_no = len(dirs)
    pickle.dump(
        naive,
        open(str(train_model_path) + "text_classifier-" + str(file_no) + ".pickle", "wb"))
    pickle.dump(
        model_input[4],
        open(str(train_model_path) + "tfidf-" + str(file_no) + ".pickle", "wb"))

    return accuracy * 100
def context_search(documents, ids, query):
    global ready_states, context_ids
    docs_new = [query]

    text_clf = pipeline.Pipeline([
        ('vect', feature_extraction.text.CountVectorizer()),
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('clf', naive_bayes.MultinomialNB()),
    ])

    for i in range(1, 3):
        if len(documents) > 0 and len(ids) > 0 and len(documents) == len(ids):
            text_context = text_clf.fit(documents, ids)
            text_id = text_context.predict(docs_new)
            found_id = int(text_id.astype(int))
            if found_id not in context_ids:
                context_ids.append(found_id)
                index = ids.index(found_id)
                del documents[index]
                del ids[index]

    ready_states.append(True)
def train_sentiment_classifier(trainingtext):
    '''
    Trains a naive bayes classifier on sentiment.

    Parameters:
    - trainingtext (.csv/.txt): path to the annotated training file
    '''
    df = pd.read_csv(trainingtext, sep='\t', names=['liked', 'txt'])

    # vectorize words
    stopset = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(use_idf=True, lowercase=True,
                                 strip_accents='ascii', stop_words=stopset)

    # target
    y = df.liked
    # samples
    X = vectorizer.fit_transform(df.txt)

    # split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # train the naive bayes classifier
    clf = naive_bayes.MultinomialNB()
    clf.fit(X_train, y_train)
    return clf
def bayes_model(model_type='m'):
    from sklearn import naive_bayes

    if model_type == 'b':
        model = naive_bayes.BernoulliNB(alpha=1.0, binarize=0.0,
                                        fit_prior=True, class_prior=None)
    elif model_type == 'g':
        model = naive_bayes.GaussianNB()  # Gaussian naive Bayes
    else:
        model = naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True,
                                          class_prior=None)
    """
    MultinomialNB is commonly used for text classification problems.

    Parameters
    ---
    alpha: smoothing parameter
    fit_prior: whether to learn class prior probabilities; if False, a uniform prior is used
    class_prior: optional user-specified class priors; if given, they are not adjusted from the data
    binarize: threshold for binarizing features (BernoulliNB); if None, the input is assumed to consist of binary vectors
    """
    return model
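# Minimal usage sketch for the bayes_model() factory above; the toy count matrix is an
# assumption for illustration (MultinomialNB expects non-negative features).
import numpy as np

X = np.array([[2, 1, 0], [3, 0, 1], [0, 2, 4], [1, 0, 5]])
y = np.array([0, 0, 1, 1])

clf = bayes_model('m')           # MultinomialNB with alpha=1.0
clf.fit(X, y)
print(clf.predict([[0, 1, 3]]))  # likely array([1]) on this toy data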
def analyze(stop_words_):
    if stop_words_ == "":
        vectorizer = feature_extraction.text.CountVectorizer(stop_words=None)
    else:
        vectorizer = feature_extraction.text.CountVectorizer(stop_words=stop_words_)

    X = vectorizer.fit_transform(data['title_reviews_combo'])
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        X, data['Recommended IND'])

    NB = naive_bayes.MultinomialNB()
    NB.fit(x_train, y_train)
    y_predict = NB.predict(x_test)
    unique, counts = np.unique(y_predict, return_counts=True)
    dict(zip(unique, counts))

    score_train = NB.score(x_train, y_train)
    score_test = NB.score(x_test, y_test)
    recall_test = metrics.recall_score(y_test, NB.predict(x_test))
    precision_test = metrics.precision_score(y_test, NB.predict(x_test))

    return score_train, score_test, recall_test, precision_test
def my_MultinomialNB_alpha(*data):
    train_x, test_x, train_y, test_y = data
    alphas = np.logspace(-2, 5, num=200)
    train_scores = []
    test_scores = []
    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(train_x, train_y)
        train_scores.append(cls.score(X=train_x, y=train_y))
        test_scores.append(cls.score(X=test_x, y=test_y))

    # plot the scores against alpha
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_scores, label="Training Score")
    ax.plot(alphas, test_scores, label="Testing Score")
    ax.set_xlabel(r"$\alpha$")
    ax.set_xscale("log")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_title("MultinomialNB")
    ax.legend(loc="best")
    plt.show()
def test_MultinomialNB_alpha():
    alphas = np.logspace(-2, 5, num=20)
    train_score = []
    test_score = []
    for alpha in alphas:
        cls = naive_bayes.MultinomialNB(alpha=alpha)
        cls.fit(X_train, y_train)
        train_score.append(cls.score(X_train, y_train))
        test_score.append(cls.score(X_test, y_test))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_score, label="training score")
    ax.plot(alphas, test_score, label="testing score")
    ax.set_xlabel(r'$\alpha$')
    ax.set_ylabel('score')
    # ax.set_ylim(0, 1.0)
    ax.set_title("MultinomialNB")
    ax.set_xscale('log')
    # the line labels only appear once the legend is drawn
    ax.legend(loc='best')
    plt.show()
def execute_other_model(X_train, y_train, X_test, y_test):
    print('\n\033[1m\033[94m Accuracy Of Other Models: \033[0m\n')
    print(symptoms_name)

    user_input_preds = []
    models_accuracy.clear()
    user_input_result_val.clear()
    user_input_result_val.append(user_input)

    # Train each model one by one and collect its result
    prediction, accuracy, user_input_preds = train_model_linear_regression(
        X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[0], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        KNeighborsRegressor(n_neighbors=5), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[1], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        svm.SVC(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[2], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        linear_model.LogisticRegression(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[3], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        ensemble.RandomForestClassifier(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[4], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        xgboost.XGBClassifier(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[5], accuracy, user_input_preds)

    prediction, accuracy, user_input_preds = train_model_others(
        naive_bayes.MultinomialNB(), X_train, y_train, X_test, y_test)
    prepare_result_to_display(models_name[6], accuracy, user_input_preds)
def main():
    data_column = 'text'
    processed_column = 'text_final'
    target = 'target'

    print("Preprocessing...")
    Corpus = sentence_preprocessing(pd.read_csv('../data/disaster-tweets.csv'),
                                    data_column, processed_column)

    Vectorizers = [TfidfVectorizer(max_features=5000), CountVectorizer()]
    Vectorizer_Columns = ["tfidf", "count"]
    Models = [
        naive_bayes.MultinomialNB(),
        svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'),
        LogisticRegression()
    ]
    Accuracies = []

    print("Splitting data...")
    Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
        Corpus[processed_column], Corpus[target], test_size=0.3)

    for Model in Models:
        current_accuracies = []
        for index, Vectorizer in enumerate(Vectorizers):
            Corpus[Vectorizer_Columns[index]] = Corpus[processed_column]
            print("Vectorizing", "...")
            (Train_X_Vectorized, Test_X_Vectorized) = vectorize(
                Vectorizer, Corpus, Vectorizer_Columns[index], Train_X, Test_X)
            print("Generating predictions", "...")
            score = generate_predictions(Model, Train_X_Vectorized,
                                         Test_X_Vectorized, Train_Y, Test_Y)
            print(Model, " with vectorizer ", Vectorizer_Columns[index],
                  " Accuracy Score -> ", score)
            current_accuracies.append(round(score, 2))
        Accuracies.append(current_accuracies)

    print("Accuracies: ", Accuracies)
def predict_model(train_df, test_df):
    train_data, train_y, test_data, test_y = data_processing(train_df, test_df)

    cv = CountVectorizer()
    train_tfmat = cv.fit_transform(train_data)
    tf = TfidfTransformer()
    train_x = tf.fit_transform(train_tfmat)

    test_tfmat = cv.transform(test_data)
    test_x = tf.transform(test_tfmat)

    model_nb = nb.MultinomialNB()
    model_lr = LogisticRegression()
    model_nn = MLPClassifier(hidden_layer_sizes=(100, 100), early_stopping=True)

    model_names = ['NN', 'NB', 'LR']
    models = [model_nn, model_nb, model_lr]
    for _, clf in enumerate(models):
        print("Model {}: {}".format(_ + 1, model_names[_]))
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)
        result = eval_model(test_y, y_pred)
        print(result)
def train_u():
    vu = TfidfVectorizer(use_idf=False, lowercase=False)
    doc = pd.read_csv(file_to_read, sep='\t', names=['review', 'sentiment'])
    class_categ_u = doc.sentiment  # positive and negative classes
    token_u = vu.fit_transform(doc.review)

    token_u_train, token_u_test, class_u_train, class_u_test = train_test_split(
        token_u, class_categ_u, random_state=40)

    # training the naive bayes classifier
    naive_train_u = naive_bayes.MultinomialNB()
    naive_train_u.fit(token_u_train, class_u_train)

    # training the logistic regression classifier
    log_train_u = LogisticRegression(penalty='l2', C=1)
    log_train_u.fit(token_u_train, class_u_train)

    print("Logistic Regression classifier accuracy with unnormalized data is %2.2f"
          % accuracy_score(class_u_test, log_train_u.predict(token_u_test)))
    # this metric is ROC AUC computed from hard predictions, so label it as such
    print("Naive Bayes classifier ROC AUC with unnormalized data is %2.2f"
          % roc_auc_score(class_u_test, naive_train_u.predict(token_u_test)))

    return naive_train_u, log_train_u, vu
def main():
    tic = time.time()

    vectorizer = text.CountVectorizer(ngram_range=(1, 1))
    X_train, y_train, X_test, y_test = load_text_data(sys.argv[1], sys.argv[2],
                                                      vectorizer)

    print("\nLOGISTIC REGRESSION CLASSIFIER")
    model_results(linear_model.LogisticRegression(), X_train, y_train, X_test, y_test)

    print("\n\nNAIVE BAYES CLASSIFIER")
    model_results(naive_bayes.MultinomialNB(), X_train, y_train, X_test, y_test)

    # print("\n\nK-NEIGHBORS CLASSIFIER")
    # model_results(neighbors.KNeighborsClassifier(), X_train, y_train, X_test, y_test)

    print("\n\nLINEAR SVC CLASSIFIER")
    model_results(svm.LinearSVC(), X_train, y_train, X_test, y_test)

    toc = time.time()
    print("\n" + str(int((toc - tic) // 60)) + "m " + str(int(toc - tic) % 60) + "s")
def nb_classifier(X, y, sw=False, checkpoint=True):
    # stopwords
    stop_words = set(stopwords.words('english')) if sw else None

    # initialize the vectorizer
    vec = TfidfVectorizer(lowercase=True, use_idf=True, norm=None,
                          smooth_idf=False, analyzer='word', input='content',
                          stop_words=stop_words, min_df=10, max_features=20000)

    # initialize the classifier
    mnb_clf = naive_bayes.MultinomialNB()

    # pipeline: vectorize, then classify
    vec_nb = Pipeline([('vectorize', vec), ('mnb', mnb_clf)])

    # fit model
    vec_nb.fit(X, y)

    # save model
    if checkpoint:
        filename = '/Mining_The_Social_Web/models/nbtfidf.sav'
        joblib.dump(vec_nb, filename)

    return vec_nb
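# Companion sketch (illustrative, not part of the original module): reloading the
# pipeline that nb_classifier() saved with joblib and scoring new text.
import joblib

def load_nb_classifier(path='/Mining_The_Social_Web/models/nbtfidf.sav'):
    # restores the fitted Pipeline (TfidfVectorizer + MultinomialNB)
    return joblib.load(path)

# vec_nb = load_nb_classifier()
# print(vec_nb.predict(["some new document to classify"]))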
def build_NB_classifier(X_training, y_training):
    '''
    Build a Naive Bayes classifier from the training set X_training, y_training.

    @param
        X_training: X_training[i, :] is the ith example
        y_training: y_training[i] is the class label of X_training[i, :]

    @return
        nbc: the naive bayes classifier built in this function
        results: the dict of scores returned by cross validation. GridSearchCV
            would also return these scores, but it cannot be used here because
            NB has no hyperparameter to optimize, and CV must be done before
            fitting takes place (and fitting happens here).
    '''
    print_clf_intro("NAIVE BAYES")

    # Instantiate a Multinomial NB classifier.
    nbc = naive_bayes.MultinomialNB()

    # Perform cross validation and store results.
    results = model_selection.cross_validate(
        nbc, X_training, y_training, return_train_score=True,
        scoring=['accuracy', 'precision', 'roc_auc', 'recall', 'f1'])

    # Fit the classifier on X_training.
    nbc.fit(X_training, y_training)

    # Return the classifier object and CV results.
    return nbc, results
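# Illustrative call of build_NB_classifier() above; the random count matrix and binary
# labels are placeholders, not the project's data, and the call assumes the helper
# print_clf_intro from the same module is available.
import numpy as np

X_toy = np.random.randint(0, 5, size=(200, 10))   # non-negative counts, as MultinomialNB expects
y_toy = np.random.randint(0, 2, size=200)

nbc, cv_results = build_NB_classifier(X_toy, y_toy)
print(cv_results['test_accuracy'].mean())          # mean CV accuracy across the folds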
def naive_bayes(self):
    from sklearn.model_selection import GridSearchCV

    parameters = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf_log = naive_bayes.MultinomialNB()
    clf_log = GridSearchCV(clf_log, parameters, cv=5)

    accuracy, f1 = self.train_model(clf_log, self.train_x_count, self.train_y,
                                    self.valid_x_count)
    print("Naive Bayes (Count Vectors)", accuracy, f1)

    accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf, self.train_y,
                                    self.xvalid_tfidf)
    print("Naive Bayes (TF-IDF)", accuracy, f1)

    accuracy, f1 = self.train_model(clf_log, self.xtrain_tfidf_ngram, self.train_y,
                                    self.xvalid_tfidf_ngram)
    print("Naive Bayes (TF-IDF n-gram)", accuracy, f1)

    print("Best parameters! -", clf_log.best_params_)
    return clf_log
def NBunnormalized(newfile):
    # read the labelled data file
    filename = "amazon_cells_labelled.txt"
    folder = pd.read_csv(filename, sep="\t", names=["docs", "class"])

    # Convert a collection of raw documents to a matrix of TF-IDF features.
    Vectwords = TfidfVectorizer(use_idf=False, lowercase=False, strip_accents="ascii")
    y = folder['class']
    # Learn vocabulary and idf, return term-document matrix.
    x = Vectwords.fit_transform(folder.docs)

    # Train on the labelled data
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=40)
    clf = nb.MultinomialNB()
    clf.fit(x_train, y_train)

    score = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
    predict = clf.predict(x_test)
    acc = accuracy_score(y_test, predict)

    # classify the given file
    classifier = np.array(newfile)
    classifier_vect = Vectwords.transform(classifier)
    pre = clf.predict(classifier_vect)
    # print(pre)

    print("Accuracy: ", acc * 100)
    print("Score :", score)

    file = open("results-nb-u.txt", "a")
    for i in pre:
        print(i)
        file.write(str(i) + "\n")
    file.close()
def bayes_predict(tr_smp, tr_lb, test_smp, all_wrd):
    Train_X = tr_smp
    Train_Y = tr_lb
    Test_X = test_smp

    # TF-IDF features
    Tfidf_vect = TfidfVectorizer(max_features=5000, strip_accents='unicode',
                                 ngram_range=(1, 3), max_df=0.9, min_df=5,
                                 sublinear_tf=True)
    Tfidf_vect.fit(all_wrd)
    Train_X = Tfidf_vect.transform(Train_X)
    Test_X = Tfidf_vect.transform(Test_X)

    # Normalization
    Train_X = normalize(Train_X, axis=1, norm='l1')
    Test_X = normalize(Test_X, axis=1, norm='l1')

    # Standardization
    # scaler = preprocessing.Normalizer()
    scaler = preprocessing.RobustScaler(quantile_range=(0.1, 0.9), with_centering=False)
    Train_X = scaler.fit_transform(Train_X)
    Test_X = scaler.fit_transform(Test_X)

    model = naive_bayes.MultinomialNB(alpha=0.0001)
    model.fit(Train_X, Train_Y)

    # predict the labels on the validation dataset
    predictions = model.predict(Test_X)
    predictions = make_ints(predictions)
    return predictions
train_vec = scaler.fit_transform(train_vec)
test_vec = scaler.fit_transform(test_vec)

# Model training
if training_model == 'RF' or training_model == "BT":
    # Initialize the random forest or bagged tree based on the model chosen
    rfc = RFC(n_estimators=100, oob_score=True,
              max_features=(None if training_model == "BT" else "auto"))
    print("Training %s" % ("Random Forest" if training_model == "RF" else "bagged tree"))
    rfc = rfc.fit(train_vec, train_data.sentiment)
    print("OOB Score = ", rfc.oob_score)
    pred = rfc.predict(test_vec)
elif training_model == "NB":
    nb = naive_bayes.MultinomialNB()
    cv_score = cross_val_score(nb, train_vec, train_data.sentiment, cv=10)
    print("Training Naive Bayes")
    print("cv score = ", cv_score.mean())
    nb = nb.fit(train_vec, train_data.sentiment)
    pred = nb.predict(test_vec)
elif training_model == 'SVM':
    svc = svm.LinearSVC()
    param = {
        'C': [1e15, 1e13, 1e11, 1e9, 1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5]
    }
    print("Training SVM")
    svc = GridSearchCV(svc, param, cv=10)
    svc = svc.fit(train_vec, train_data.sentiment)
    pred = svc.predict(test_vec)
                    is_neural_net=False):
        # fit the training dataset on the classifier
        classifier.fit(feature_vector_train, label)

        # predict the labels on validation dataset
        predictions = classifier.predict(feature_vector_valid)

        if is_neural_net:
            predictions = predictions.argmax(axis=-1)

        return metrics.accuracy_score(predictions, self.valid_y)


if __name__ == '__main__':
    start = TextProcessing()

    result = start.train_model(naive_bayes.MultinomialNB(), start.xtrain_count,
                               start.train_y, start.xvalid_count)
    print("naive_bayes, Count Vectors: ", result)
    result = start.train_model(naive_bayes.MultinomialNB(), start.xtrain_tfidf,
                               start.train_y, start.xvalid_tfidf)
    print("naive_bayes, WordLevel TF-IDF: ", result)
    print('-------------------------------------------------------------------------------------------------------')

    result = start.train_model(linear_model.LogisticRegression(), start.xtrain_count,
                               start.train_y, start.xvalid_count)
    print("LogisticRegression, Count Vectors: ", result)
    result = start.train_model(linear_model.LogisticRegression(), start.xtrain_tfidf,
                               start.train_y, start.xvalid_tfidf)
    current_features = np.zeros(len(partial_mapping))
    for instr in json_data['instructions']:
        mnemonic = instr.split(" ")[0].rstrip()
        current_features[partial_mapping[mnemonic]
                         if mnemonic in partial_mapping
                         else partial_mapping['<UNK>']] += 1
    train_x.append(current_features)
    train_y_opt.append(json_data['opt'])
    train_y_cmp.append(json_data['compiler'])

train_input_file.close()

opt_model = naive_bayes.MultinomialNB()
opt_model.fit(train_x, train_y_opt)
cmp_model = naive_bayes.MultinomialNB()
cmp_model.fit(train_x, train_y_cmp)

test_path = 'datasets/test_dataset_blind.jsonl'
output_path = '1711234.csv'
test_input_file = open(test_path, mode='r')
output_file = open(output_path, mode='w')

for json_line in test_input_file:
    json_data = json.loads(json_line)
    current_features = np.zeros(len(partial_mapping))
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model
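# Usage sketch for runMNB() above; the tiny corpus is a hypothetical stand-in for the
# vectorized text features the surrounding pipeline would normally provide.
import numpy as np
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer

texts_train = ["good movie", "bad movie", "great film", "awful film"]
texts_test = ["good film", "bad acting"]
y_train = np.array([1, 0, 1, 0])
y_test = np.array([1, 0])

cv = CountVectorizer()
train_X = cv.fit_transform(texts_train)
test_X = cv.transform(texts_test)

probs_test, probs_test2, mnb = runMNB(train_X, y_train, test_X, y_test, test_X)
print(probs_test)   # per-class probabilities for the held-out texts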