def __init__(self, MODEL, train_x, train_y, test_x, test_y):
    """Store the train/test data and build the per-dataset Naive Bayes models.

    Args:
        MODEL: Dataset key. 'NEWS', 'MUSHROOM' and 'INCOME' each select a
            dictionary of estimators with dataset-specific ``alpha``
            values; any other value leaves ``self.models`` unset.
        train_x: Training features, stored as given.
        train_y: Training labels, stored as given.
        test_x: Test features, stored as given.
        test_y: Test labels, stored as given.
    """
    #---data---#
    self.train_x = train_x
    self.train_y = train_y
    self.test_x = test_x
    self.test_y = test_y

    #---model---#
    self.cross_validate = False
    self.MODEL = MODEL

    # The dictionary keys ('Guassian', 'Multinominal', ...) are runtime
    # strings and are kept exactly as spelled.
    if self.MODEL == 'NEWS':
        self.models = {
            'Guassian': GaussianNB(),
            'Multinominal': MultinomialNB(alpha=0.065),
            'Complement': ComplementNB(alpha=0.136),
            'Bernoulli': BernoulliNB(alpha=0.002),
        }
    elif self.MODEL == 'MUSHROOM':
        self.models = {
            'Guassian': GaussianNB(),
            'Multinominal': MultinomialNB(alpha=0.0001),
            'Complement': ComplementNB(alpha=0.0001),
            'Bernoulli': BernoulliNB(alpha=0.0001),
        }
    elif self.MODEL == 'INCOME':
        # INCOME is the only dataset that switches cross-validation on.
        self.cross_validate = True
        self.models = {
            'Guassian': GaussianNB(),
            'Multinominal': MultinomialNB(alpha=0.959),
            'Complement': ComplementNB(alpha=0.16),
            'Bernoulli': BernoulliNB(alpha=0.001),
        }
def build_pipeline(train, dev, clf):
    """Fit a bag-of-words -> TF-IDF -> classifier pipeline on ``train``.

    Args:
        train: Training set; passed to ``prepare_set`` to obtain the inputs
            and labels used for fitting.
        dev: Development set; it is run through ``prepare_set`` but the
            result is not used in this function.
        clf: "SVM", "RF", "NB" or "R" selects one of the built-in
            classifiers; any other value is placed into the pipeline as-is.

    Returns:
        The fitted ``Pipeline``.
    """
    x_train, y_train = prepare_set(train)
    _x_dev, _y_dev = prepare_set(dev)  # prepared but not consumed here

    vectorizer = CountVectorizer(max_features=2000)
    weighter = TfidfTransformer()

    # Factories keep construction lazy: only the selected estimator is built.
    factories = {
        "SVM": lambda: SVC(C=10, gamma=1, kernel="rbf"),
        "RF": lambda: RandomForestClassifier(n_estimators=25, max_depth=35),
        "NB": lambda: ComplementNB(norm=False),
        "R": lambda: RecallBiasedEstimator([
            SVC(C=10, gamma=1, kernel="rbf"),
            RandomForestClassifier(n_estimators=10, max_depth=35),
            ComplementNB(norm=False),
        ]),
    }
    if isinstance(clf, str) and clf in factories:
        clf = factories[clf]()

    model = Pipeline([('bow', vectorizer), ('tfidf', weighter), ('clf', clf)])
    model.fit(x_train, y_train)
    return model
def __init__(self, col_stats, data_type=None):
    """Initialise the buffers and choose the Naive Bayes estimator.

    The estimator is ``MultinomialNB`` when ``col_stats`` provides at least
    three percentage buckets and ``ComplementNB`` otherwise; both use
    ``self._smoothing_factor`` as ``alpha``.

    Args:
        col_stats: Column statistics. Must contain 'data_type'; may contain
            'percentage_buckets'.
        data_type: Not read here; ``self.data_type`` is taken from
            ``col_stats['data_type']``.
    """
    self._X_buff = []
    self._Y_buff = []
    self._predicted_buckets_buff = []
    self._real_buckets_buff = []
    self._original_real_buckets_buff = []
    self._original_predicted_buckets_buff = []

    self.col_stats = col_stats

    has_buckets = 'percentage_buckets' in col_stats
    self.buckets = col_stats['percentage_buckets'] if has_buckets else None
    if has_buckets:
        # bucket_keys is only defined when buckets are available.
        self.bucket_keys = list(range(len(self.buckets)))

    use_multinomial = has_buckets and len(self.buckets) >= 3
    model_cls = MultinomialNB if use_multinomial else ComplementNB
    self._probabilistic_model = model_cls(alpha=self._smoothing_factor)

    self.data_type = col_stats['data_type']
    self.bucket_accuracy = {}
def predict_classifier(name_dataset, name_train, classifier, name_test, metric):
    """Run classifier.

    Builds the estimator named by ``classifier``, applies the parameters
    selected from the stored cross-validation scores, fits it on the
    training file, predicts the test file and saves the true/predicted
    labels through ``cv.save_dict_list``.

    Args:
        name_dataset: Dataset name; used for the stored scores and in the
            output file name.
        name_train: Path of the training file in svmlight format.
        classifier: One of the classifier names handled below.
        name_test: Path of the test file in svmlight format.
        metric: Metric name handed to ``cv.best_param_folds_no_frequency``
            and used in the output file name.

    Raises:
        ValueError: If ``classifier`` is not a supported name.
    """
    if classifier == "ada_boost":
        estimator = AdaBoostClassifier(random_state=42,
                                       base_estimator=ComplementNB(alpha=0.01))
        # Alternative base estimator that was tried:
        # LogisticRegression(C=50, max_iter=100)
    elif classifier == "extra_tree":
        estimator = ExtraTreesClassifier(random_state=SEED)
    elif classifier == "knn":
        estimator = KNeighborsClassifier()
    elif classifier == "logistic_regression":
        estimator = LogisticRegression(random_state=SEED)
    elif classifier == "naive_bayes":
        estimator = MultinomialNB()
    elif classifier == "naive_bayes_complement":
        estimator = ComplementNB()
    elif classifier == "passive_aggressive":
        estimator = PassiveAggressiveClassifier(random_state=SEED,
                                                max_iter=1000)
    elif classifier == "random_forest":
        estimator = RandomForestClassifier(random_state=SEED)
    elif classifier == "sgd":
        estimator = SGDClassifier(random_state=SEED, max_iter=1000)
    elif classifier == "svm":
        estimator = svm.LinearSVC(random_state=SEED, max_iter=1000)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``estimator`` after the data has already been loaded.
        raise ValueError(f"Unknown classifier: {classifier!r}")

    # Context managers make sure both files are closed once parsed.
    with open(name_train, 'rb') as train_file, \
            open(name_test, 'rb') as test_file:
        x_train, y_train, x_test, y_test = load_svmlight_files(
            [train_file, test_file])

    load_estimator = False  # flip to reuse a previously dumped estimator
    if load_estimator:
        # The loaded object has to replace ``estimator`` to be of any use.
        estimator = joblib.load("escores/grid_" + name_dataset + "_" +
                                classifier)
    else:
        # Every supported name is a single classifier (no comma-separated
        # list), so the stored best parameters always apply.
        escores = cv.load_escores(name_dataset, classifier, 1)  # test score 0
        best_param_folds = cv.best_param_folds_no_frequency(
            escores, 0, metric)  # best score per fold
        estimator.set_params(**best_param_folds)
        estimator.fit(x_train, y_train)

    y_pred = estimator.predict(x_test)
    cv.save_dict_list([y_test], [y_pred],
                      'y_pred/' + name_dataset + "_" + classifier + "_" +
                      metric + "_" + cv.name_file(name_test))
def stacking_ensemble(X, y):
    """Cross-validate a stacked ensemble of text classifiers and print accuracy.

    Six text pipelines (two ComplementNB, random forest, k-NN, XGBoost and
    LightGBM) feed a one-vs-rest ``LinearSVC`` meta-classifier through
    ``StackingClassifier`` with ``use_probas=True``. The mean and standard
    deviation of the 4-fold accuracy are printed; nothing is returned.

    Args:
        X: Input documents handed to ``cross_val_score``.
        y: Target labels handed to ``cross_val_score``.
    """

    def bow_tfidf(clf, **tfidf_kwargs):
        # CountVectorizer -> TfidfTransformer -> classifier.
        return Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer(**tfidf_kwargs)),
                         ('clf', clf)])

    cnb1 = bow_tfidf(ComplementNB(alpha=0.347), use_idf=False)
    cnb2 = bow_tfidf(ComplementNB(alpha=0.347), use_idf=False)
    rf = bow_tfidf(
        RandomForestClassifier(n_estimators=1000, max_depth=14, n_jobs=-1))
    knn = bow_tfidf(KNeighborsClassifier(n_neighbors=100, n_jobs=-1))
    xgb = bow_tfidf(
        XGBClassifier(objective='multi:softmax', num_class=20, n_jobs=-1))
    # Built but not part of the stack below.
    lr = bow_tfidf(
        LogisticRegression(solver='lbfgs', max_iter=1000,
                           multi_class='multinomial'))
    lgbm = Pipeline([
        ('vect', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ('clf', LGBMClassifier(objective='multiclass', reg_lambda=1e-6,
                               num_leaves=150, n_estimators=200,
                               learning_rate=0.07)),
    ])

    meta = OneVsRestClassifier(LinearSVC(class_weight='balanced'), n_jobs=-1)
    base_learners = [cnb1, cnb2, rf, knn, xgb, lgbm]
    sclf = StackingClassifier(classifiers=base_learners,
                              meta_classifier=meta,
                              use_probas=True)

    # ---------------------------- 4 Fold CV ---------------------------------
    scores = model_selection.cross_val_score(sclf, X, y, cv=4,
                                             scoring='accuracy', n_jobs=-1)
    print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
def test_cnb():
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Expected weights. See steps 4-6 in Table 4 of Rennie et al. (2003).
    # Row c holds the feature counts of every class except c; smoothing
    # adds alpha=1 per feature and alpha * n_features (= 6) to the total.
    comp_count = np.array([[0, 1, 1, 0, 0, 1],
                           [1, 3, 0, 1, 1, 0]])
    theta = (comp_count + 1) / (comp_count.sum(axis=1, keepdims=True) + 6)
    weights = -np.log(theta)
    normed_weights = weights / weights.sum(axis=1, keepdims=True)

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)
    msg = re.escape('Negative values in data passed to ComplementNB (input X)')
    with pytest.raises(ValueError, match=msg):
        clf.fit(-X, Y)

    clf.fit(X, Y)

    # Check that counts/weights are correct.
    expected_feature_count = np.array([[1, 3, 0, 1, 1, 0],
                                       [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, expected_feature_count)
    expected_class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, expected_class_count)
    expected_feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, expected_feature_all)
    assert_array_almost_equal(clf.feature_log_prob_, weights)

    # Same data with weight normalisation switched on.
    clf = ComplementNB(alpha=1.0, norm=True)
    clf.fit(X, Y)
    assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
def voting_ensemble(X, y):
    """Cross-validate a hard-voting ensemble of text classifiers.

    Several candidate pipelines are built; the ensemble currently votes
    over ``cnb2``, ``svc`` and ``rf``. The mean and standard deviation of
    the 4-fold accuracy are printed; nothing is returned.

    Args:
        X: Input documents handed to ``cross_val_score``.
        y: Target labels handed to ``cross_val_score``.
    """

    def text_pipeline(clf, vect_kwargs=None, tfidf_kwargs=None):
        # CountVectorizer -> TfidfTransformer -> classifier.
        return Pipeline([
            ('vect', CountVectorizer(**(vect_kwargs or {}))),
            ('tfidf', TfidfTransformer(**(tfidf_kwargs or {}))),
            ('clf', clf),
        ])

    no_idf = {'use_idf': False}

    # Candidate base classifiers; not all of them are in the ensemble.
    cnb1 = text_pipeline(ComplementNB(alpha=1.353))
    cnb2 = text_pipeline(ComplementNB(alpha=0.347), tfidf_kwargs=no_idf)
    cnb3 = text_pipeline(ComplementNB(alpha=0.347, special=1),
                         tfidf_kwargs=no_idf)
    cnb4 = text_pipeline(ComplementNB(alpha=0.347, special=2),
                         tfidf_kwargs=no_idf)
    svc = text_pipeline(
        OneVsRestClassifier(LinearSVC(class_weight='balanced'), n_jobs=-1),
        vect_kwargs={'ngram_range': (1, 2)})
    lr = text_pipeline(
        LogisticRegression(solver='lbfgs', max_iter=1000,
                           multi_class='multinomial'))
    rf = text_pipeline(
        RandomForestClassifier(n_estimators=1000, max_depth=14, verbose=1,
                               random_state=0, n_jobs=-1))
    knn = text_pipeline(KNeighborsClassifier(n_neighbors=100, n_jobs=-1))

    # add base classifiers to test synergies
    # NOTE(review): the slot named 'lr' holds the random-forest pipeline,
    # not ``lr`` - confirm whether that is intended.
    members = [('cnb2', cnb2), ('svc', svc), ('lr', rf)]
    model = VotingClassifier(estimators=members, voting='hard', n_jobs=-1)

    scores = model_selection.cross_val_score(model, X, y, cv=4,
                                             scoring='accuracy', n_jobs=-1)
    print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
def classification(self, x_train: np.ndarray,
                   y_train: np.ndarray) -> "ComplementNB":
    """Fit and return a ComplementNB classification model.

    Arguments:
        x_train {np.ndarray} -- The train data x
        y_train {np.ndarray} -- The train data y

    Returns:
        ComplementNB -- The classifier fitted on the given training data
    """
    # The return annotation is the class name (as a string), so no
    # estimator is instantiated when the function is defined.
    clf = ComplementNB()
    clf.fit(x_train, y_train)
    return clf
def score_test(train_data, test_data, part, save_root):
    """Score the test texts with four Naive Bayes variants and save a CSV.

    A ``CountVectorizer`` is fitted on the training texts. For each model,
    ``get_probs`` is called with a deep copy of the estimator, and
    elements 0 and 1 of every returned item are written to the columns
    ``<name>_0`` and ``<name>_1`` of ``<save_root>/test.csv``.

    Args:
        train_data: Table with the text column ``part`` and 'gfi_label'.
        test_data: Table with the columns 'id' and ``part``.
        part: Name of the text column to use.
        save_root: Directory that receives ``test.csv``.
    """
    estimators = {
        'GaussianNB': GaussianNB(),
        'MultinomialNB': MultinomialNB(),
        'BernoulliNB': BernoulliNB(),
        'ComplementNB': ComplementNB(),
    }

    train_texts = list(train_data[part])
    train_labels = list(train_data['gfi_label'])
    vectorizer = CountVectorizer(max_features=10000,
                                 min_df=5,
                                 stop_words='english').fit(train_texts)

    test_ids = list(test_data['id'])
    test_texts = list(test_data[part])

    # Each model is scored on its own copy so the templates stay unfitted.
    probs_by_model = {
        name: get_probs(copy.deepcopy(template), vectorizer, train_texts,
                        train_labels, test_texts)
        for name, template in estimators.items()
    }

    columns = {'id': test_ids}
    for name, probs in probs_by_model.items():
        columns[name + '_0'] = [row[0] for row in probs]
        columns[name + '_1'] = [row[1] for row in probs]

    score = pd.DataFrame(columns)
    score.to_csv(os.path.join(save_root, 'test.csv'), index=False)
def train(datafile=paths.get_dataset_path(name),
          model_file=paths.get_model_path(name)):
    """Train a TF-IDF + ComplementNB pipeline, report on it and pickle it.

    The CSV at ``datafile`` is read and prepared with ``data_prep``; 20% is
    held out for an accuracy figure that is printed. A classification
    report over the complete data set is printed and written to
    ``<paths.result_path><name>_CNB_report.txt``, and the fitted pipeline
    is pickled to ``model_file``.

    Args:
        datafile: Path of the CSV data set.
        model_file: Path the pickled pipeline is written to.
    """
    frame = pd.read_csv(datafile)
    X, Y = data_prep(frame, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)

    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
    clf = Pipeline([('tfidf', vectorizer), ('clf', ComplementNB(norm=True))])
    clf = clf.fit(X_train, y_train)

    held_out_predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, held_out_predictions)
    print(accuracy)

    # The report covers all rows, i.e. training and held-out data together.
    report = classification_report(Y, clf.predict(X))
    print(report)

    report_path = paths.result_path + name + '_CNB_report.txt'
    with open(report_path, "w") as report_file:
        report_file.write(report)

    with open(model_file, "wb") as model_out:
        pickle.dump(clf, model_out)
def pickling():
    '''
    Creates and pickles both the vectorizer and model for use in prediction.

    The TF-IDF vectorizer is fitted on the 'description' column and written
    to 'pickles/text_vec.pkl'; a ComplementNB model is fitted on the
    resulting matrix against the 'variety' column and written to
    'pickles/model.pkl'.

    Parameters
    ----------
    None

    Returns
    ----------
    None
    '''
    wrangler = Data_Handler('data/cleaned_data.csv')
    stops = wrangler.stop_words
    df = wrangler.get_top_num(15)
    y = df['variety']

    vecto = TfidfVectorizer(stop_words=stops)
    X = vecto.fit_transform(df['description'])
    # ``with`` guarantees the pickle is flushed and the handle closed.
    with open('pickles/text_vec.pkl', 'wb') as vec_file:
        pickle.dump(vecto, vec_file)

    model = ComplementNB()
    model.fit(X, y)
    with open('pickles/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
def get_classifier(vocabulary):
    '''
    Abstracted sentences have to be assigned to one of the templates;
    this function trains the classifier that does so.

    Every file in "./Qdata/question/" whose name starts with "【" is read;
    the integer between "【" and "】" is the label of each line in that
    file. A sentence is encoded as a 0/1 vector with one entry per
    vocabulary item (1 when the item occurs in the sentence).

    Returns the fitted ComplementNB classifier.
    '''
    # Prepare the dataset
    root = "./Qdata/question/"
    sentences = []
    labels = []
    for filename in os.listdir(root):
        if filename[0] != "【":
            continue
        # The name starts with "【", so the label begins at index 1.
        label = int(filename[1:filename.index("】")])
        with open(root + filename, "r", encoding="utf-8") as handle:
            file_sentences = [line.strip() for line in handle]
        sentences.extend(file_sentences)
        labels.extend([label] * len(file_sentences))

    # One row per sentence, one column per vocabulary entry.
    features = np.zeros((len(sentences), len(vocabulary)))
    for row, sentence in enumerate(sentences):
        features[row] = [1 if voc in sentence else 0 for voc in vocabulary]

    classifier = ComplementNB()
    classifier.fit(features, labels)
    return classifier
def naive_bayes(classifier, data, labels):
    """Train and evaluate the selected Naive Bayes variant.

    Args:
        classifier: "gaussian", "multinomial", "complement" or "bernoulli".
        data: Feature data.
        labels: Target labels.

    Returns:
        The estimator fitted on the training split, or ``None`` when
        ``classifier`` is not one of the supported names.
    """
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.1)

    print(f"\nTraining {classifier} NB classifier... ")
    if classifier == "gaussian":
        from sklearn.naive_bayes import GaussianNB
        nb = GaussianNB()
    elif classifier == "multinomial":
        from sklearn.naive_bayes import MultinomialNB
        nb = MultinomialNB()
    elif classifier == "complement":
        from sklearn.naive_bayes import ComplementNB
        nb = ComplementNB()
    elif classifier == "bernoulli":
        from sklearn.naive_bayes import BernoulliNB
        nb = BernoulliNB()
    else:
        return

    # k-fold cross-validation on the complete data set. The label is built
    # from the same value that is passed on, so the two cannot disagree.
    # Presumably the second argument of get_k_fold_accuracy is the number
    # of folds - confirm against its definition.
    k_folds = 10
    print(f"{k_folds}-fold accuracy:",
          get_k_fold_accuracy(nb, k_folds, data, labels))

    # LOOCV
    # print("LOOCV accuracy:", get_loocv_accuracy(nb,data,labels))

    # Train model on the training split (90% of the data, test_size=0.1)
    nb.fit(x_train, y_train)

    print(f"\nTesting {classifier} NB classifier... ")
    # get confusion matrix
    y_pred = nb.predict(x_test)
    print_confusion_matrix(y_test, y_pred)

    return nb
def allModel(self):
    """Instantiate the four Naive Bayes models defined by sklearn.

    ``self.model`` becomes the list [GaussianNB, BernoulliNB (with
    ``binarize=True``), MultinomialNB, ComplementNB], in that order.
    """
    self.model = [
        GaussianNB(),
        BernoulliNB(binarize=True),
        MultinomialNB(),
        ComplementNB(),
    ]
def _complementnb(*,
                  train,
                  test,
                  x_predict=None,
                  metrics,
                  alpha=1.0,
                  fit_prior=True,
                  class_prior=None,
                  norm=False):
    """Fit ComplementNB, score it on ``test`` and optionally predict.

    For more info visit:
    https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html#sklearn.naive_bayes.ComplementNB

    Args:
        train: Pair ``(X, y)`` used for fitting.
        test: Pair ``(X, y)`` used for scoring.
        x_predict: Optional extra inputs to predict after scoring.
        metrics: 'f1_score', 'jaccard_score' or 'accuracy_score'.
        alpha, fit_prior, class_prior, norm: Forwarded to ``ComplementNB``.

    Returns:
        ``(model_name, score, predictions)``; ``predictions`` is ``None``
        when ``x_predict`` is ``None``.

    Raises:
        ValueError: If ``metrics`` is not a supported metric name.
    """
    # Resolve the metric first so that a bad name fails before any
    # training instead of surfacing as an unbound local afterwards.
    if metrics == 'f1_score':
        scorer = f1_score
    elif metrics == 'jaccard_score':
        scorer = jaccard_score
    elif metrics == 'accuracy_score':
        scorer = accuracy_score
    else:
        raise ValueError(f"Unsupported metrics value: {metrics!r}")

    model = ComplementNB(alpha=alpha,
                         fit_prior=fit_prior,
                         class_prior=class_prior,
                         norm=norm)
    model.fit(train[0], train[1])
    model_name = 'ComplementNB'

    y_hat = model.predict(test[0])
    # Named "accuracy" for every metric; it holds the selected score.
    accuracy = scorer(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def buildReturnModel(self):
    """Build the Naive Bayes estimator selected by ``self.flavor``.

    Reads ``self.flavor``, ``self.exp_type`` and ``self.default_args``.

    Returns:
        A new estimator of the requested flavor, constructed with
        ``self.default_args`` as keyword arguments.

    Raises:
        ValueError: If no flavor is set, if ``self.exp_type`` is not
            'classification', or if the flavor is not a supported name.
    """
    if not self.flavor:
        raise ValueError(
            'cannot build model because the flavor of Naive Bayes is unknown!'
        )
    if self.exp_type != 'classification':
        raise ValueError(
            'Naive bayes can only be used for classification problems!'
        )

    if self.flavor == 'Bernoulli':
        return BernoulliNB(**self.default_args)
    if self.flavor == 'Categorical':
        return CategoricalNB(**self.default_args)
    if self.flavor == 'Complement':
        return ComplementNB(**self.default_args)
    if self.flavor == 'Gaussian':
        return GaussianNB(**self.default_args)
    if self.flavor == 'Multinomial':
        return MultinomialNB(**self.default_args)

    # An unrecognised flavor used to fall through and return None, which
    # only surfaced later as an error on a None "model".
    raise ValueError(
        'cannot build model because the flavor of Naive Bayes is unknown: '
        f'{self.flavor!r}')
def main():
    """Run shuffled K-fold evaluation of a TF-IDF + ComplementNB pipeline.

    Loads the stop words and the training frame (first column: inputs,
    second column: labels). For each of ``NFOLDS`` folds the pipeline is
    refitted and the training accuracy, validation accuracy and confusion
    matrix are printed, followed by the total elapsed time.
    """
    started = time.time()
    load_stop_words('stop_words.txt')

    frame = load_train(train_file_path)
    X = frame.iloc[:, 0].values
    Y = frame.iloc[:, 1].values

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', ComplementNB(alpha=0.5))])

    banner = "\n************************************* Running fold: %d/%d *****************************************\n"
    folds = KFold(n_splits=NFOLDS, shuffle=True)
    for fold_no, (train_index, test_index) in enumerate(folds.split(X),
                                                        start=1):
        print(banner % (fold_no, NFOLDS))

        fold_x, fold_y = X[train_index], Y[train_index]
        text_clf.fit(fold_x, fold_y)

        accuracy = text_clf.score(fold_x, fold_y)
        val_accuracy = text_clf.score(X[test_index], Y[test_index])
        y_pred = text_clf.predict(X[test_index])
        cm = confusion_matrix(Y[test_index], y_pred)

        print("accuracy: %f val_accuracy: %f\n" % (accuracy, val_accuracy))
        print(cm)

    finished = time.time()
    print('Model completed in %d seconds' % (finished - started))
def findBestFitCluster(orphanCorpus, corpusCluster=None):
    """
    Given a set of questions without a cluster and a set of other clusters,
    find the best cluster to put the orphaned questions

    Parameters:
        orphanCorpus: iterable of documents; each document is indexed with
            "question_vector". Example shape:
                [{"id": 11,
                  "question": 'Another one about the sun?',
                  "question_vector": []}, ...]
        corpusCluster: mapping that provides "question_vectors" (training
            inputs) and "clusterIds" (their labels). Example shape:
                {"questions": ['and the moon too guys', ...],
                 "question_vectors": [[], []],
                 "clusterIds": ['4', '4']}
            Defaults to an empty mapping, which - as before - fails with
            KeyError on the first lookup.

    Returns:
        None.
    """
    # A None sentinel replaces the shared mutable ``{}`` default.
    if corpusCluster is None:
        corpusCluster = {}

    # Fit the Naive bayes model on existing clusters
    clf = ComplementNB()
    clf.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    # NOTE(review): the class probabilities are computed but neither
    # returned nor stored - confirm what the caller is meant to receive.
    predictions = clf.predict_proba(
        [doc["question_vector"] for doc in orphanCorpus])
def ComplementNB_classification(train,
                                test,
                                train_labels,
                                test_labels,
                                res=None):
    """
    Fit ComplementNB on the training data and evaluate it on the test data.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict; its "ComplementNB" entry is set to a dict
        holding the model, its accuracy and its name. A fresh dict is used
        when omitted.
    :return: tuple ``(score, model)`` - the test accuracy and the fitted
        classifier. The report/confusion matrix is produced by
        ``utils.report_and_confmat``.
    """
    # A None sentinel replaces the mutable ``{}`` default, which was shared
    # between calls and kept every fitted model alive.
    if res is None:
        res = {}

    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
def main():
    """Compare three Naive Bayes variants on the wine data set.

    The data is split 80/20 with a fixed random state; Multinomial,
    Gaussian and Complement Naive Bayes are fitted in that order and the
    test accuracy of each, rounded to four decimals, is printed.
    """
    # Iris or breast cancer dataset can be used too
    x, y = datasets.load_wine(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=2405)

    candidates = (
        ("MultinomialNB", MultinomialNB),  # Multinomial Naive Bayes
        ("GaussianNB", GaussianNB),        # Gaussian Naive Bayes
        ("ComplementNB", ComplementNB),    # Complement Naive Bayes
    )
    for label, estimator_cls in candidates:
        model = estimator_cls()
        model.fit(x_train, y_train)
        accuracy = model.score(x_test, y_test)
        print(f"{label} accuracy is {round(accuracy, 4)}")
def get_optimal_values_ComplementNB(x_train, y_train, x_val, y_val):
    """Grid-search ``alpha``, ``fit_prior`` and ``norm`` for ComplementNB.

    Every combination of alpha in {0.0, 0.1, ..., 1.0}, fit_prior in
    {True, False} and norm in {True, False} is fitted on the training data
    and scored by accuracy (in percent) on the validation data. The first
    combination reaching the highest score is kept.

    Returns:
        ``(max_score, optimal_alpha, optimal_fit_prior, optimal_norm)``;
        the same four values are also printed.
    """
    best_score = 0
    best_alpha, best_fit_prior, best_norm = 1.0, True, False

    # Evaluate every combination to pick the best parameters
    for tenths in range(11):
        alpha = tenths / 10
        for fit_prior in (True, False):
            for norm in (True, False):
                candidate = ComplementNB(alpha=alpha,
                                         fit_prior=fit_prior,
                                         norm=norm)
                candidate.fit(x_train, y_train)
                score = accuracy_score(y_val, candidate.predict(x_val)) * 100
                # Strict comparison: ties keep the earlier combination.
                if score > best_score:
                    best_score = score
                    best_alpha = alpha
                    best_fit_prior = fit_prior
                    best_norm = norm

    print(best_score, best_alpha, best_fit_prior, best_norm)
    return best_score, best_alpha, best_fit_prior, best_norm
def TextClassifier1(train_feature_list, test_feature_list, train_class_list,
                    test_class_list):
    """Fit ComplementNB on the training set and return its test score.

    Returns:
        The value of ``score`` for the fitted classifier on the test
        features and test classes.
    """
    model = ComplementNB()
    fitted = model.fit(train_feature_list, train_class_list)
    # To inspect class probabilities instead:
    # print(fitted.predict_proba(test_feature_list))
    return fitted.score(test_feature_list, test_class_list)
def realizar_treinamento(registros_de_treino, vetorizador):
    """Train a ComplementNB model on the given training records.

    Args:
        registros_de_treino: Training records; element 0 of each record is
            the comment and element 1 is the expected answer.
        vetorizador: Vectorizer; it is fitted on the comments via
            ``fit_transform``.

    Returns:
        The fitted ComplementNB model.
    """
    comentarios = [registro[0] for registro in registros_de_treino]
    respostas = [registro[1] for registro in registros_de_treino]

    matriz_comentarios = vetorizador.fit_transform(comentarios)

    # BernoulliNB and MultinomialNB were tried here as alternatives.
    modelo = ComplementNB()
    modelo.fit(matriz_comentarios, respostas)

    # A K-fold (n_splits=200) cross_val_predict validation with an
    # accuracy / classification report used to be run at this point and is
    # currently disabled.

    return modelo
def create_nb_classifier_pipeline(n_features):
    """Build the (unfitted) ComplementNB text-classification pipeline.

    TF-IDF features (1- to 3-grams, ASCII accent stripping) are extracted
    separately from the journal title, article title and article abstract,
    concatenated with ``FeatureUnion``, reduced to the ``n_features`` best
    features by chi-squared selection and passed to ``ComplementNB``.

    Args:
        n_features: Value of ``k`` for ``SelectKBest``.

    Returns:
        The assembled ``Pipeline``.
    """
    journal_title = Pipeline([
        ('colext', JournalTitleSelector('journal')),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3),
                                  min_df=0.0005,
                                  max_df=0.6,
                                  strip_accents='ascii')),
    ])
    article_title = Pipeline([
        ('colext', TitleSelector('title')),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3),
                                  min_df=0.001,
                                  max_df=0.6,
                                  strip_accents='ascii',
                                  sublinear_tf=True)),
    ])
    article_abstract = Pipeline([
        ('colext', AbstractSelector('abstract')),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3),
                                  min_df=0.001,
                                  max_df=0.6,
                                  strip_accents='ascii',
                                  sublinear_tf=True)),
    ])

    features = FeatureUnion([
        ('journal_title', journal_title),
        ('article_title', article_title),
        ('article_abstract', article_abstract),
    ])

    return Pipeline([
        ('features', features),
        ('feature_selection', SelectKBest(chi2, k=n_features)),
        ('clf', ComplementNB()),
    ])
def BayesClassifier(dframe):
    """Grid-search ComplementNB's ``norm`` and print hold-out predictions.

    The column 'x_7' is dropped; the last remaining column is the target
    and the others are features. All rows except the final 20 are used for
    a 10-split shuffle-split grid search; the final 20 rows are then
    predicted one at a time and printed next to their real values.

    Args:
        dframe: Data frame containing 'x_7' and a 'y' column.

    Returns:
        0
    """
    dframe = dframe.drop(['x_7'], axis=1)

    holdout = 20
    columns = list(dframe.columns)
    features = dframe[columns[:-1]]
    targets = dframe[columns[-1:]]

    X = features[:-holdout].to_numpy()
    y = targets[:-holdout].to_numpy().reshape(len(dframe['y']) - holdout, )
    X_validate = features[-holdout:].to_numpy()
    y_validate = targets[-holdout:].to_numpy()

    # This split is computed but the search below is fitted on all of X, y.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.08,
                                                        random_state=16)

    splitter = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
    model_gs = GridSearchCV(ComplementNB(), {'norm': [True, False]},
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=splitter)
    model_gs.fit(X, y)

    print(model_gs.best_params_)
    print("Accuracy score", model_gs.best_score_)

    best = model_gs.best_estimator_
    n_columns = X_validate.shape[1]
    for sample, actual in zip(X_validate, y_validate):
        # predict expects a 2-D array, so each sample becomes one row.
        prediction = best.predict(sample.reshape(1, n_columns))
        print("Predicted:", prediction)
        print("Real:", actual)
        print("")

    return 0
def _generate_title_model(data_train, labels_train, output_file):
    """Train the title model and dump it to ``output_file``.

    A TF-IDF (1- and 2-gram) + ComplementNB pipeline is fitted on the
    training data and written with ``joblib.dump`` using zlib compression.

    Args:
        data_train: Training inputs for the pipeline.
        labels_train: Training labels.
        output_file: Destination of the dumped pipeline.
    """
    print('Training reference title model for recommendation...')

    steps = [
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('clf', ComplementNB()),
    ]
    title_clf = Pipeline(steps)
    title_clf.fit(data_train, labels_train)

    joblib.dump(title_clf, output_file, compress='zlib')
    print('Model file {} saved.'.format(output_file))
def train(self):
    """Fit ``self.clf`` according to ``self.distribution``.

    "Multinomial", "Gaussian", "Complement" and "Bernoulli" fit the
    matching Naive Bayes estimator on ``self.train_X`` / ``self.train_y``.
    "Deep" builds the network from ``self.deepCNN()`` and fits it on the
    deep training arrays with a weight checkpoint callback. Any other
    value only prints the header line.
    """
    print("=== Training on %d papers ===" % (len(self.train_papers)))

    kind = self.distribution
    if kind == "Deep":
        self.clf = self.deepCNN()
        # Keeps only the weights of the best epoch by validation accuracy.
        callback = ModelCheckpoint(
            "../models/weights.{epoch:02d}-{val_acc:.5f}.hdf5",
            monitor='val_acc',
            verbose=0,
            save_best_only=True,
            save_weights_only=True,
            mode='max',
            period=1)
        self.clf.fit(x=np.expand_dims(self.deep_train_X, axis=2),
                     y=self.deep_train_y,
                     batch_size=self.batch_size,
                     epochs=self.epochs,
                     validation_split=0.1,
                     callbacks=[callback])
        return

    if kind == "Multinomial":
        estimator_cls = MultinomialNB
    elif kind == "Gaussian":
        estimator_cls = GaussianNB
    elif kind == "Complement":
        estimator_cls = ComplementNB
    elif kind == "Bernoulli":
        estimator_cls = BernoulliNB
    else:
        return

    self.clf = estimator_cls()
    self.clf.fit(self.train_X, self.train_y)
def __init__(self, col_stats, data_type=None):
    """Set up the ComplementNB estimator, the buffers and the bucket info.

    Args:
        col_stats: Column statistics. Must contain 'data_type'; may contain
            'percentage_buckets'.
        data_type: Not read here; ``self.data_type`` is taken from
            ``col_stats['data_type']``.
    """
    # ComplementNB is the estimator in use; GaussianNB(var_smoothing=1) and
    # MultinomialNB(alpha=self._smoothing_factor) were the alternatives.
    self._probabilistic_model = ComplementNB(alpha=self._smoothing_factor)

    self.X_buff = []
    self.Y_buff = []

    self.col_stats = col_stats

    has_buckets = 'percentage_buckets' in col_stats
    self.buckets = col_stats['percentage_buckets'] if has_buckets else None
    if has_buckets:
        # bucket_keys is only defined when buckets are available.
        self.bucket_keys = list(range(len(self.buckets)))

    self.data_type = col_stats['data_type']
    self.bucket_accuracy = {}
def result_for_classifiers(data, categories_list):
    """Run ``classifiers`` once for every estimator of the benchmark.

    Each entry is (estimator factory, display name, final flag). The flag
    is passed through as the last positional argument of ``classifiers``;
    only the multilayer perceptron uses ``True``.

    Args:
        data: Forwarded to ``classifiers``.
        categories_list: Forwarded to ``classifiers``.
    """
    # Factories keep each estimator's construction next to its own run.
    runs = (
        (lambda: MultinomialNB(alpha=0.05), "Naive Bayes", False),
        (lambda: ComplementNB(alpha=0.05), "Complement Naive Bayes", False),
        (lambda: GaussianNB(), "Gaussian Naive Bayes", False),
        (lambda: RandomForestClassifier(), "Random Forest", False),
        (lambda: AdaBoostClassifier(), "AdaBoost", False),
        (lambda: KNeighborsClassifier(), "KNN", False),
        (lambda: SVC(), "SVM", False),
        (lambda: DecisionTreeClassifier(), "Decision Trees", False),
        (lambda: MLPClassifier(hidden_layer_sizes=2, random_state=0),
         "Multilayer Perceptron", True),
    )
    for build, label, flag in runs:
        classifiers(build(), label, data, categories_list, flag)
def _fit(self, X, y, reset):
    """Fit the ComplementNB model and persist it.

    Args:
        X: Training inputs.
        y: Training labels.
        reset: When equal to ``True`` a new ComplementNB replaces the
            current model; otherwise the current model is reused unless it
            is ``None``.

    Returns:
        Whatever the estimator's ``fit`` call returns.
    """
    # The comparison with True is kept as it was, so only True (or 1)
    # forces a reset; None is now checked by identity.
    if reset == True or self._model_clf is None:  # noqa: E712
        self._model_clf = ComplementNB()

    train_result = self._model_clf.fit(X, y)

    # Persist the fitted model, then run the object's own dump step.
    joblib.dump(self._model_clf, self.__model_path)
    self._dumpmodel()
    return train_result