import numpy as np
from numpy.testing import assert_array_equal
from pytest import raises as assert_raises
from sklearn.naive_bayes import ComplementNB


def test_cnb():
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)
    assert_raises(ValueError, clf.fit, -X, Y)

    clf.fit(X, Y)

    # Check that counts are correct.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, feature_count)
    class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, class_count)
    feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, feature_all)

    # Check that weights are correct. See steps 4-6 in Table 4 of
    # Rennie et al. (2003).
    theta = np.array([
        [
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6)
        ],
        [
            (1 + 1) / (6 + 6),
            (3 + 1) / (6 + 6),
            (0 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (0 + 1) / (6 + 6)
        ]])

    # Note: this is an older revision of this test; it expects the normalized
    # weights log(theta) / sum(log(theta)). Example #25 later in this listing
    # is the updated revision, which checks the unnormalized (negated-log)
    # weights by default and the normalized ones under norm=True.
    weights = np.zeros(theta.shape)
    for i in range(2):
        weights[i] = np.log(theta[i])
        weights[i] /= weights[i].sum()

    assert_array_equal(clf.feature_log_prob_, weights)
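
# A standalone recomputation of the complement weights above: a minimal
# sketch of steps 4-6 in Rennie et al. (2003) for the same toy counts
# (illustrative, not part of the original test):
import numpy as np

feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
feature_all = feature_count.sum(axis=0)
alpha = 1.0

# Step 4: complement counts -- how often each feature occurs in every class
# *other* than c, plus Lidstone smoothing.
comp_count = feature_all + alpha - feature_count
# Step 5: normalize per class (rows sum to 3 + 6 and 6 + 6) and take logs.
logged = np.log(comp_count / comp_count.sum(axis=1, keepdims=True))
# Step 6: current scikit-learn stores -logged when norm=False, or
# logged / logged.sum() per row when norm=True.
print(-logged)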
Example #2
    test_docs   = preprocess(test_docs)

    
    # create a vectorizer object
    tfidf_vectorizer = TfidfVectorizer(
        analyzer        = "word",
        stop_words      = stopwords.words('english'),
        max_df          = 0.7,
        max_features    = 10000)
    
    # create sparse matrix representation of documents
    vect_train_docs   = tfidf_vectorizer.fit_transform(train_docs)
    vect_test_docs    = tfidf_vectorizer.transform(test_docs)
    
    # classifier
    classifier = OneVsRestClassifier(ComplementNB())
    classifier.fit(vect_train_docs, train_labels)
 
    # get predictions using trained classifier
    predictions = classifier.predict(vect_test_docs)  

    # metrics    
    precision = precision_score(test_labels, predictions, average='micro')
    recall = recall_score(test_labels, predictions, average='micro')
    f1 = f1_score(test_labels, predictions, average='micro')
    
    print("Micro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
    
    precision = precision_score(test_labels, predictions, average='macro')
    recall = recall_score(test_labels, predictions, average='macro')
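    f1 = f1_score(test_labels, predictions, average='macro')

    # (the snippet is cut off here; presumably it finishes by printing the
    # macro-averaged scores, mirroring the micro-average block above)
    print("Macro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))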
Example #3
y_pred_DTC_gini = clf_gini.fit(X_train, y_train).predict(X_test)
y_pred_DTC_entropy = clf_entropy.fit(X_train, y_train).predict(X_test)
print("Accuracy of DTC (gini): ",
      metrics.accuracy_score(y_test, y_pred_DTC_gini) * 100)
print("Accuracy of DTC (entropy):",
      metrics.accuracy_score(y_test, y_pred_DTC_entropy) * 100)

#KNN
knn = KNeighborsClassifier(n_neighbors=25)
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)
print("Accuracy of KNN:", metrics.accuracy_score(y_test, y_pred_knn) * 100)

#NAIVE BAYES
gnb = GaussianNB()
mnb = MultinomialNB()
cnb = ComplementNB()

y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
y_pred_mnb = mnb.fit(X_train, y_train).predict(X_test)
y_pred_cnb = cnb.fit(X_train, y_train).predict(X_test)

print("Accuracy of GNB:", metrics.accuracy_score(y_test, y_pred_gnb) * 100)
print("Accuracy of MNB:", metrics.accuracy_score(y_test, y_pred_mnb) * 100)
print("Accuracy of CNB:", metrics.accuracy_score(y_test, y_pred_cnb) * 100)

#Logistic Regression
lr = LogisticRegression(random_state=0)
y_pred_lc = lr.fit(X_train, y_train).predict(X_test)
print("Accuracy of Logistic Regression:",
      metrics.accuracy_score(y_test, y_pred_lc) * 100)
print('=' * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(
    benchmark(
        Pipeline([
            ('feature_selection',
             SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
            ('classification', LinearSVC(penalty="l2"))
        ])))

# make some plots
###################### mnb (code from ASTD) ############################
classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
classifier.fit(X_train, Y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)
total_correct_predictions = cm[0, 0] + cm[1, 1] + cm[2, 2]
total_predictions_made = np.sum(cm)
accuracy = total_correct_predictions / total_predictions_made * 100

##################### CNB ######################
from sklearn.naive_bayes import ComplementNB
classifier = ComplementNB()
#ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_test)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)
total_correct_predictions = cm[0, 0] + cm[1, 1] + cm[2, 2]
total_predictions_made = np.sum(cm)
accuracy = total_correct_predictions / total_predictions_made * 100
#print(clf.predict(X[2:3]))
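
# Note: the diagonal-sum accuracy above is exactly what scikit-learn's
# accuracy_score computes; an equivalent cross-check (a sketch, not part of
# the original ASTD code):
from sklearn.metrics import accuracy_score
assert np.isclose(accuracy, accuracy_score(Y_test, y_pred) * 100)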

################# sgd (code from ASTD) ###############################
classifier = SGDClassifier(loss="hinge", penalty="l2")
classifier.fit(X_train, Y_train)
# Predicting the Test set results
Example #6
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}


def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise"""
    sia = SentimentIntensityAnalyzer()
    return sia.polarity_scores(tweet)['compound'] > 0
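
# A hedged sketch of how a named-estimator dict like `classifiers` above is
# typically consumed; the data here is a synthetic stand-in, since the
# original feature extraction for the tweets is not shown in this snippet:
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_demo = rng.poisson(1.0, size=(200, 10))  # nonnegative counts suit the NB variants
y_demo = rng.randint(0, 2, size=200)

X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
for clf_name, clf in classifiers.items():
    clf.fit(X_tr, y_tr)
    print("{}: {:.2%} accuracy".format(clf_name, clf.score(X_te, y_te)))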
Example #7
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB, ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer
from sklearn.metrics import brier_score_loss, roc_auc_score, recall_score
from time import time
import datetime


X, y = make_blobs(n_samples=[50000, 500],
                  centers=[[0.0, 0.0], [5.0, 5.0]],
                  cluster_std=[3, 1],
                  random_state=0, shuffle=False
                  )

name = ["Multinomial","Gaussian","Bernoulli","Complement"]
models = [MultinomialNB(),GaussianNB(),BernoulliNB(),ComplementNB()]

for name, clf in zip(name,models):
    times = time()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,y,test_size=0.3,random_state=420)
    # preprocessing: discretize into one-hot bins for the discrete NB models
    if name != "Gaussian":
        kbs = KBinsDiscretizer(n_bins=10, encode='onehot').fit(Xtrain)
        Xtrain = kbs.transform(Xtrain)
        Xtest = kbs.transform(Xtest)
    clf.fit(Xtrain,Ytrain)
    y_pred = clf.predict(Xtest)
    proba = clf.predict_proba(Xtest)[:,1]
    score = clf.score(Xtest,Ytest)

    print(name)
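    # A hedged continuation of the loop, inferred from the metric imports at
    # the top of this example (the original snippet is cut off here):
    print("\tBrier score (pos_label=1): {:.3f}".format(
        brier_score_loss(Ytest, proba, pos_label=1)))
    print("\tAccuracy: {:.3f}".format(score))
    print("\tRecall: {:.3f}".format(recall_score(Ytest, y_pred)))
    print("\tAUC: {:.3f}".format(roc_auc_score(Ytest, proba)))
    print("\tFit + predict time: {:.3f}s".format(time() - times))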
from sklearn import tree
import numpy
import pandas
import matplotlib.pyplot as plt
import graphviz
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB, ComplementNB, MultinomialNB

data = pandas.read_csv('iris.csv', header=None)
Y = numpy.asarray(data[data.columns[-1]])
X = numpy.asarray(data[data.columns[0:-1]])
clf = tree.DecisionTreeClassifier(max_depth=4)
GNB = GaussianNB()
MNB = MultinomialNB()
CNB = ComplementNB()

print('clf')
scores = cross_val_score(clf, X, Y, cv=5)
print(scores)
clf.fit(X, Y)
print(clf.score(X, Y))

print('GNB')
scores = cross_val_score(GNB, X, Y, cv=5)
print(scores)
GNB.fit(X, Y)
print(GNB.score(X, Y))

print('MNB')
scores = cross_val_score(MNB, X, Y, cv=5)
print(scores)
Example #9
def fit():
    # TODO: test content type and send 400 if not JSON
    # Construct the model fit request
    data = request.get_json()
    params = data.get("model", {})
    dataset = data.get("dataset", [])
    grid = data.get("grid", [])
    model = {
        'gaussiannb': GaussianNB(),
        'multinomialnb': MultinomialNB(),
        'bernoullinb': BernoulliNB(),
        'complementnb': ComplementNB(),
        'svm': SVC(),
        'logit': LogisticRegression(),
    }.get(params.pop("model", None), None)

    # Validate the request is correct and sane
    if model is None or len(dataset) == 0:
        return "invalid fit request: please specify model and data", 400

    # Parse the JSON hyperparameters or leave as string for type detection
    for key in params.keys():
        try:
            params[key] = json.loads(params[key])
        except json.decoder.JSONDecodeError:
            continue

    # Set the hyperparameters on the model
    try:
        model.set_params(**params)
    except ValueError as e:
        return str(e), 400

    # Construct the dataset
    X, y = [], []
    for point in dataset:
        X.append([point["x"], point["y"]])
        y.append(point["c"])
    X, y = asarray(X), asarray(y)

    # Fit the model to the dataset and get the training score
    model.fit(X, y)
    yhat = model.predict(X)
    metrics = prfs(y, yhat, average="macro")

    # Make probability predictions on the grid to implement contours
    # The returned value is the class index + the probability
    # To get the selected class in JavaScript, use Math.floor(p)
    # Where p is the probability returned by the grid. Note that this
    # method guarantees that no P(c) == 1 to prevent class misidentification
    Xp = asarray([
        [point["x"], point["y"]] for point in grid
    ])
    preds = []
    for proba in model.predict_proba(Xp):
        c = np.argmax(proba)
        preds.append(float(c+proba[c])-0.000001)

    return jsonify({
        "metrics": dict(zip(["precision", "recall", "f1", "support"], metrics)),
        "grid": preds,
    })
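
# A hedged Python mirror of the JavaScript-side decoding described in the
# comments above (illustrative only; the original client uses Math.floor(p)):
def decode_grid_value(p):
    """Unpack a packed grid value into (class index, P(class))."""
    import math
    c = math.floor(p)
    return c, p - c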
Example #10
gaussian_predictions = model.predict(X2_test)
print("Gaussian results:")
print(confusion_matrix(y_test, gaussian_predictions))
print("\n")
#Performs much worse than Multinomial NB - leave this here.

model = MultinomialNB()
model.fit(X1_train, y_train)
nb_prediction = model.predict(X1_test)
print("Naive Bayes TF-IDF results:")
print(confusion_matrix(y_test, nb_prediction))
print("\n")
#Using TFIDF reduces accuracy - as expected. Continue with normal data

model = ComplementNB()
model.fit(X2_train, y_train)
nb_prediction = model.predict(X2_test)
print("Complement NB results:")
print(confusion_matrix(y_test, nb_prediction))
print("\n")
#Complement NB performs as well as multinomial NB, and its predictions are
#spread more evenly across the classes - as expected, since it was designed
#for imbalanced datasets

model = svm.SVC(decision_function_shape='ovo')
model.fit(X2_train, y_train)
nb_prediction = model.predict(X2_test)
print("SVM results:")
print(confusion_matrix(y_test, nb_prediction))
Example #11
# ******************* Decision Tree Classifier (Gini) ************************
model = tree.DecisionTreeClassifier()
run_model(model, 'Decision Tree')
# dot_data = tree.export_graphviz(model, out_file=None)
# graph = graphviz.Source(dot_data)
# graph.render("yelp_decision_tree")

# ******************* Gaussian Naive Bayes *************************
run_model(GaussianNB(), 'Gaussian Naive Bayes')

# ******************* Multinomial Naive Bayes ***********************
run_model(MultinomialNB(), 'Multinomial Naive Bayes')

# ******************* Complement Naive Bayes ************************
run_model(ComplementNB(), 'Complement Naive Bayes')

# ******************* Bernoulli Naive Bayes *************************
run_model(BernoulliNB(), 'Bernoulli Naive Bayes')

# ******************* KNN ************************
knn_range = range(3, 12)
for k in knn_range:
    print('Iteration: ' + str(k))
    knn_model = KNeighborsClassifier(n_neighbors=k)
    run_model(knn_model, ' KNN({} neighbors) '.format(k))

# ******************* SVM(linear) ************************
# svm_model = svm.SVC(kernel='linear')
# run_model(svm_model, ' SVM(Linear) ')
#
Example #12
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print("error rate dev GaussianNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] ))

    task = 'MultinomialNB'
    model = MultinomialNB()
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print("error rate dev MultinomialNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] ))

    task = 'BernoulliNB'
    model = BernoulliNB()
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print("error rate dev BernoulliNB {0}".format(1-(y_test != y_pred).sum()/y_test.shape[0] ))
    """

    task = 'ComplementNB'
    model = ComplementNB()
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(
        "accuracy dev ComplementNB {0}".format(1 - (y_test != y_pred).sum() /
                                               y_test.shape[0]))
elif (sys.argv[1] == 'D'):
    from sklearn import tree
    task = 'DecisionTree'
    model = tree.DecisionTreeClassifier(criterion='gini',
                                        max_depth=80,
                                        min_samples_split=2,
                                        min_samples_leaf=2,
                                        min_weight_fraction_leaf=0.0,
                                        random_state=None,
                                        max_leaf_nodes=200,
                                        class_weight=None)
Example #13
    def test_complement_nb(self):
        self.check_model(ComplementNB(), abs=True)

        model_name = 'complement-nb.json'
        self.check_model_json(ComplementNB(), model_name, abs=True)
Example #14
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier

from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

now = datetime.datetime.now()
time_stamp = now.strftime("%Y_%b_%d_%H_%M")

print('Training Stamp:' + time_stamp)
mnb = MultinomialNB(alpha=0.01)
bnb = BernoulliNB()
gnb = GaussianNB()
cnb = ComplementNB()

svc = SGDClassifier(max_iter=1000, tol=1e-3, fit_intercept=True)

lda = LinearDiscriminantAnalysis(solver='svd')

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
nc = NearestCentroid()
rnc = RadiusNeighborsClassifier(n_jobs=-1)

lpg = LabelPropagation(n_jobs=-1)
lps = LabelSpreading(n_jobs=-1)

dct = DecisionTreeClassifier(class_weight='balanced',
                             criterion='entropy',
                             random_state=9)
Example #15
    name="NearestCentroid (aka Rocchio classifier)"
)

# Train sparse Naive Bayes classifiers
benchmark(
    MultinomialNB(alpha=.01),
    name="Naive Bayes MultinomialNB"
)

benchmark(
    BernoulliNB(alpha=.01),
    name="Naive Bayes BernoulliNB"
)

benchmark(
    ComplementNB(alpha=.1),
    name="Naive Bayes ComplementNB"
)

# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
benchmark(
    Pipeline([
        ('feature_selection',
            SelectFromModel(
                LinearSVC(
                    penalty="l1",
                    dual=False,
                    tol=1e-3
                )
            )),
Example #16
def plot_ml_model(X, y, fold):
    pyplot.close('all')
    #print ("Enter")
    #algos = ["SVM-linear","SVM-Kernel","GaussianNB","BernoulliNB","ComplementNB","DTree-gini","DTree-entropy","RF-50","RF-100","RF-150", "KNN-2", "KNN-6"]

    algos = [
        "SVM-linear", "SVM-Kernel", "GaussianNB", "ComplementNB", "DTree-gini",
        "DTree-entropy", "RF-50", "RF-100", "KNN-2", "KNN-6"
    ]

    clfs = [
        SVC(kernel='linear'),
        SVC(kernel='rbf'),
        GaussianNB(),
        #BernoulliNB(),
        ComplementNB(),
        DecisionTreeClassifier(criterion="entropy",
                               max_depth=24,
                               min_samples_split=2),
        DecisionTreeClassifier(criterion="entropy",
                               max_depth=24,
                               min_samples_split=2),
        RandomForestClassifier(n_estimators=50),
        RandomForestClassifier(n_estimators=100),
        #RandomForestClassifier(n_estimators = 150),
        KNeighborsClassifier(n_neighbors=2),
        KNeighborsClassifier(n_neighbors=6)
    ]

    cv_results = []

    scoring = 'accuracy'
    #scoring = 'roc_auc'
    for classifiers in clfs:
        cv_score = cross_val_score(classifiers, X, y, cv=fold, scoring=scoring)
        cv_results.append(cv_score.mean())

    cv_mean = pd.DataFrame(cv_results, index=algos)
    cv_mean.columns = ["Accuracy"]
    print(cv_mean.sort_values(by="Accuracy", ascending=False))
    cv_mean.plot.bar(figsize=(10, 5))

    #scatter plot
    scores = cv_mean["Accuracy"]
    #create traces
    trace1 = go.Scatter(x=algos,
                        y=scores,
                        name='Algorithm Name',
                        marker=dict(color='rgba(0,255,0,0.5)',
                                    line=dict(color='rgb(0,0,0)', width=2)),
                        text=algos)
    data = [trace1]

    layout = go.Layout(barmode="group",
                       xaxis=dict(title='ML Algorithms',
                                  ticklen=5,
                                  zeroline=False),
                       yaxis=dict(title='Prediction Scores',
                                  ticklen=5,
                                  zeroline=False))
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    pyplot.show()
Example #17
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)
    vocab = vectorizer.get_feature_names()
    print('trained and transformed w/ vectorizer')
    dump(vectorizer, 'vectorizer.joblib')

    # model training
    log_reg = LogisticRegression()
    log_reg.fit(train_data_features, y_train)
    lr_preds = log_reg.predict(test_data_features)

    # keep the knn, it's the best
    knn = KNeighborsClassifier()
    knn.fit(train_data_features, y_train)
    knn_preds = knn.predict(test_data_features)
    dump(knn, 'knn.joblib')

    cnb = ComplementNB()
    cnb.fit(train_data_features, y_train)
    cnb_preds = cnb.predict(test_data_features)

    # make df with all preds
    df = pd.DataFrame(
        list(zip(cnb_preds, lr_preds, knn_preds, y_test, x_test)),
        columns=['cnb_preds', 'lr_preds', 'knn_preds', 'category', 'document'])

    # save incorrect predictions in a df to look at
    lr_incorrect = df[df['lr_preds'] != df['category']].copy()
    knn_incorrect = df[df['knn_preds'] != df['category']].copy()
    cnb_incorrect = df[df['cnb_preds'] != df['category']].copy()

    # combine lr and knn incorrects
    two_incorrect = knn_incorrect[
"""###**Multinomial Naive Bayes**"""

kfold = KFold(n_splits=10)
MNB = MultinomialNB()
results = cross_val_score(MNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
MNB.fit(X_train_smote.todense(), y_train_smote)
predictions = MNB.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Complement Naive Bayes**"""

kfold = KFold(n_splits=10)
CNB = ComplementNB()
results = cross_val_score(CNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
CNB.fit(X_train_smote.todense(), y_train_smote)
predictions = CNB.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Bernoulli Naive Bayes**"""

kfold = KFold(n_splits=10)
BNB = BernoulliNB()
results = cross_val_score(BNB, X_train_smote.todense(), y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
BNB.fit(X_train_smote.todense(), y_train_smote)
pickle.dump(mnb_model, open("mnb_model_tfidf.sav", 'wb'))
pred_mnb = mnb_model.predict(q_test)
#evaluate the model: for abstracts use multilabel_evaluation_multilabelbinarizer(),
#for citations use multilabel_evaluation()
mnb_evaluation_scores, mnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer(
    d_test, label_encoder.inverse_transform(pred_mnb),
    "Multinomial Naive Bayes")
#mnb_evaluation_scores, mnb_cm = evaluation.multilabel_evaluation(
#    d_test, label_encoder.inverse_transform(pred_mnb), "Multinomial Naive Bayes")
documentation_file_modelopt.write(mnb_evaluation_scores)

# Complement Naive Bayes: optimizing parameters with grid search
print("Complement Naive Bayes model evaluation")
cnb_dict = dict(estimator__alpha=[1, 2, 5, 10, 50])
classifier_cnb = RandomizedSearchCV(estimator=OneVsRestClassifier(
    ComplementNB()),
                                    param_distributions=cnb_dict,
                                    n_iter=5,
                                    n_jobs=1)
classifier_cnb.fit(q_train, d_train_encoded)
documentation_file_parameteropt.write(
    "Complement Naive Bayes: Best parameters {}, reached score: {} \n".format(
        classifier_cnb.best_params_, classifier_cnb.best_score_))
cnb_model = classifier_cnb.best_estimator_
pickle.dump(cnb_model, open("cnb_model_tfidf.sav", 'wb'))
pred_cnb = cnb_model.predict(q_test)
#evaluate the model: for abstracts use multilabel_evaluation_multilabelbinarizer(),
#for citations use multilabel_evaluation()
cnb_evaluation_scores, cnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer(
    d_test, label_encoder.inverse_transform(pred_cnb),
    "Complement Naive Bayes")
Example #21
def iterate_by_randomsearch(train_x, train_y):
    classifiers = [
        # this is for anomaly detection (IsolationForest(),{"n_estimators":50,
        #                     "contamination":np_uniform(0., 0.5),
        #                     "behaviour":["old", "new"],
        #                     "bootstrap": [True, False],
        #                     "max_features": sp.stats.randint(1, 7),
        #                     "min_samples_split": sp.stats.randint(2, 11)}),
        # this is for outlier detection (RadiusNeighborsClassifier(), {"radius": sp.stats.uniform(0.5, 5),
        #                                "algorithm": ["ball_tree", "kd_tree", "brute"],
        #                                "leaf_size": sp.stats.randint(20, 100),
        #                                "p": [1, 2]})
        (AdaBoostClassifier(), {
            "n_estimators": sp.stats.randint(25, 100)
        }),
        (BaggingClassifier(), {
            "n_estimators": sp.stats.randint(25, 100),
            "max_features": sp.stats.randint(1, 7),
            "bootstrap": [True, False],
            "bootstrap_features": [True, False],
        }),
        (ExtraTreesClassifier(), {
            "n_estimators": sp.stats.randint(25, 100),
            "max_depth": [3, None],
            "max_features": sp.stats.randint(1, 7),
            "min_samples_split": sp.stats.randint(2, 11),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }),
        (GradientBoostingClassifier(), {
            "n_estimators": sp.stats.randint(25, 100),
            "loss": ["deviance", "exponential"],
            "max_features": sp.stats.randint(1, 7),
            "min_samples_split": sp.stats.randint(2, 11),
            "criterion": ["friedman_mse", "mse", "mae"],
            "max_depth": [3, None]
        }),
        (RandomForestClassifier(), {
            "n_estimators": sp.stats.randint(25, 100),
            "max_depth": [3, None],
            "max_features": sp.stats.randint(1, 7),
            "min_samples_split": sp.stats.randint(2, 11),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }),
        (GaussianProcessClassifier(), {}),
        (LogisticRegression(), {
            "max_iter": sp.stats.randint(0, 100),
            "solver": ["lbfgs", "sag", "saga"]
        }),
        (PassiveAggressiveClassifier(), {
            "max_iter": sp.stats.randint(0, 1230),
            "tol": sp.stats.uniform(0.0001, 0.05)
        }),
        (RidgeClassifier(), {
            "max_iter": sp.stats.randint(0, 2000),
            "tol": sp.stats.uniform(0.0001, 0.05),
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        }),
        (SGDClassifier(), {
            "max_iter":
            sp.stats.randint(0, 2000),
            "tol":
            sp.stats.uniform(0.0001, 0.05),
            "loss":
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            "penalty": ["none", "l2", "l1", "elasticnet"]
        }),
        (BernoulliNB(), {}),
        (MultinomialNB(), {}),
        (GaussianNB(), {}),
        (ComplementNB(), {}),
        (KNeighborsClassifier(), {
            "n_neighbors": sp.stats.randint(1, 50),
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": sp.stats.randint(20, 100),
            "p": [1, 2]
        }),
        (NearestCentroid(), {}),
        (MLPClassifier(), {
            "hidden_layer_sizes": (random.randint(10, 1000), ),
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": sp.stats.uniform(0.00001, 0.001),
            "learning_rate": ["constant", "invscaling", "adaptive"],
            "max_iter": sp.stats.randint(0, 2000),
            "tol": sp.stats.uniform(0.0001, 0.05)
        }),
        (DecisionTreeClassifier(), {
            "max_depth": [3, None],
            "max_features": sp.stats.randint(1, 7),
            "min_samples_split": sp.stats.randint(2, 11),
            "criterion": ["gini", "entropy"]
        }),
        (LinearSVC(), {
            "penalty": ["l2"],
            "tol": sp.stats.uniform(1e-5, 1e-3),
            "C": sp.stats.uniform(0.1, 5),
            "max_iter": sp.stats.randint(0, 2000)
        }),
        (NuSVC(), {
            "gamma": sp.stats.uniform(1e-4, 1e-2),
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "tol": sp.stats.uniform(1e-4, 1e-2),
        }),
        (SVC(), {
            "gamma": sp.stats.uniform(1e-4, 1e-2),
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "tol": sp.stats.uniform(1e-4, 1e-2),
        }),
        (LinearDiscriminantAnalysis(), {
            "solver": ["svd", "lsqr", "eigen"],
            "n_components": random.randint(2, 4),
            "tol": sp.stats.uniform(1e-5, 1e-2)
        }),
        (QuadraticDiscriminantAnalysis(), {
            "tol": sp.stats.uniform(1e-5, 1e-2)
        })
    ]
    df = pd.DataFrame(
        columns=['alg', 'perf', 'est', 'rank', 'mean', 'std', 'parameters'])
    for clf in classifiers:
        print(type(clf[0]).__name__, "started at", datetime.now())
        n_iter = 10
        random_search = RandomizedSearchCV(clf[0],
                                           param_distributions=clf[1],
                                           n_iter=n_iter,
                                           cv=5)
        start = time()
        random_search.fit(train_x, train_y)
        #print("%s RandomizedSearchCV took %.2f seconds for %d candidates"
        #      " parameter settings." % (type(clf[0]).__name__,(time() - start), n_iter))

        df = report(df,
                    type(clf[0]).__name__,
                    time() - start, n_iter, random_search.cv_results_)
    print(df)
def container(train_path, test_path, test_original_path):
    """

    :param train_path: training set path
    :param test_path: evaluation set path
    :param test_original_path: evaluation set original path
    :return: None
    """
    train_features = pd.read_csv(train_path)
    dev_features = pd.read_csv(test_path)

    classifier_container = [
        {"model": BernoulliNB(alpha=1.0e-10),
         "name": "BernoulliNB",
         "params": {
             "n_estimators": [102],
             "max_samples": [0.3],
             "max_features": [0.5],
             # "n_estimators": range(102, 106, 2),
             # "max_samples": linspace(0.2, 0.4, 3),
             # "max_features": linspace(0.3, 0.5, 3),
             "warm_start": [True]
         }},
        {"model": ComplementNB(alpha=1.0e-10),
         "name": "ComplementNB",
         "params": {
             "n_estimators": [106],
             "max_samples": [0.2],
             "max_features": [0.5],
             # "n_estimators": range(102, 104, 2),
             # "max_samples": linspace(0.1, 0.3, 3),
             # "max_features": linspace(0.4, 0.6, 3),
             "warm_start": [True]
         }},
        {"model": MultinomialNB(alpha=1.0e-10),
         "name": "MultinomialNB",
         "params": {
             "n_estimators": [102],
             "max_samples": [0.3],
             "max_features": [0.5],
             # "n_estimators": range(100, 110, 2),
             # "max_samples": linspace(0.1, 0.5, 5),
             # "max_features": linspace(0.2, 0.6, 5),
             "warm_start": [True]
         }},
        {"model": RandomForestClassifier(
            n_estimators=106,
            criterion="entropy"
        ),
            "name": "RandomForestClassifier",
            "params": {
                "n_jobs": [-1]
            }
        },
        {"model": DecisionTreeClassifier(
            criterion="entropy"
        ),
            "name": "DecisionTreeClassifier",
            "params": {
            }}
    ]
    results = []
    for idx, clf in enumerate(classifier_container):
        logging.info("[*] ({1}/{2}) Training with {0} ...".format(clf["name"], idx + 1, len(classifier_container)))
        bag = BaggingClassifier(base_estimator=clf["model"])
        # grid = RandomizedSearchCV(bag, param_dist, cv=42, n_iter=300, scoring='accuracy', n_jobs=-1, verbose=2, refit=True)
        # grid = GridSearchCV(bag, clf["params"], cv=42, n_jobs=-1, verbose=0, refit=True)
        grid = GridSearchCV(bag, clf["params"], cv=42, scoring='accuracy', n_jobs=-1, verbose=0, refit=True)
        grid.fit(train_features.iloc[:, 1:-1], train_features['class'].to_list())
        res = grid.best_estimator_.predict_proba(dev_features.iloc[:, 1:-1])
        results.append(res)
        # clf["model"].fit(train_features.iloc[:, 1:-1], train_features['class'].to_list())
        # res = clf["model"].predict(dev_features.iloc[:, 1:-1])
        # grid.best_estimator_ = clf["model"]
        # grid.best_params_ = None
        # grid.scoring = None
        # grid.cv = None
        # accuracy, scores = my_score(res, test_original_path, True)
        accuracy, scores = my_score(argmax(res, axis=1), test_original_path, True)
        save_model(grid, accuracy, scores)
    ensemble_res = results[0]
    for i in range(1, len(results)):
        ensemble_res += results[i]
    ensemble_res = argmax(ensemble_res, axis=1).tolist()
    accuracy, scores = my_score(ensemble_res, test_original_path, True)
    print("[*] Accuracy: %f" % accuracy)
    pprint(scores)
    # save_model(grid, accuracy, scores)

    print()
Example #23
            print("\nBernoulliNB working...\n")
            BNB = BernoulliNB()
            BNB.fit(x_train, y_train.values.ravel())
            print("\nAccuracy Score:",
                  accuracy_score(y_test, BNB.predict(x_test)))
            print("Confusion Matrix:\n")
            print(confusion_matrix(y_test, BNB.predict(x_test)))

        if (sel == 3):
            #MultinomialNB Classifier
            print("\nMultinomialNB working...\n")
            MNB = MultinomialNB()
            MNB.fit(x_train, y_train.values.ravel())
            print("\nAccuracy Score:",
                  accuracy_score(y_test, MNB.predict(x_test)))
            print("Confusion Matrix:\n")
            print(confusion_matrix(y_test, MNB.predict(x_test)))

        if (sel == 4):
            #ComplementNB Classifier
            print("\nComplementNB working...\n")
            CNB = ComplementNB()
            CNB.fit(x_train, y_train.values.ravel())
            print("\nAccuracy Score:",
                  accuracy_score(y_test, CNB.predict(x_test)))
            print("Confusion Matrix:\n")
            print(confusion_matrix(y_test, CNB.predict(x_test)))

    if (opt == 4):
        break
# %%
#############################
# Complement Naive Bayes
#############################
from sklearn.naive_bayes import ComplementNB

# %%
# keep only the rows with at least one positive feature value
# (ComplementNB requires nonnegative input features)
tmp = np.unique(np.where(X_train > 0)[0])
X_train_NB = X_train[tmp, :]
Y_train_NB = Y_train[tmp]
# %%
tmp = np.unique(np.where(X_test > 0)[0])
X_test_NB = X_test[tmp, :]
Y_test_NB = Y_test[tmp]
# %%
clf = ComplementNB()
clf.fit(X_train_NB, Y_train_NB)

# %%
pred = clf.predict(X_test_NB)
# %%
# contingency table: [[hits, false alarms], [misses, correct negatives]]
result_NB = np.array(
    [[np.sum(pred * Y_test_NB),
      np.sum(pred * (1 - Y_test_NB))],
     [np.sum((1 - pred) * Y_test_NB),
      np.sum((1 - pred) * (1 - Y_test_NB))]])

# %%
# POD: probability of detection (hit rate); FAR: false alarm ratio
POD_NB = (result_NB[0, 0]) / (result_NB[0, 0] + result_NB[1, 0])
FAR_NB = 1 - (result_NB[0, 0]) / (result_NB[0, 0] + result_NB[0, 1])
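# %%
# Sanity check (a sketch, assuming 0/1 labels): the same contingency table
# can be recovered from scikit-learn's confusion_matrix, which returns
# [[tn, fp], [fn, tp]] for binary labels.
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(Y_test_NB, pred).ravel()
assert np.allclose(result_NB, [[tp, fp], [fn, tn]])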
Example #25
import numpy as np
from numpy.testing import assert_array_equal, assert_array_almost_equal
from pytest import raises as assert_raises
from sklearn.naive_bayes import ComplementNB


def test_cnb():
    # Tests ComplementNB when alpha=1.0 for the toy example in Manning,
    # Raghavan, and Schuetze's "Introduction to Information Retrieval" book:
    # https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo.
    X = np.array([[1, 1, 0, 0, 0, 0],
                  [0, 1, 0, 0, 1, 0],
                  [0, 1, 0, 1, 0, 0],
                  [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1).
    Y = np.array([0, 0, 0, 1])

    # Check that weights are correct. See steps 4-6 in Table 4 of
    # Rennie et al. (2003).
    theta = np.array([
        [
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (1 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (0 + 1) / (3 + 6),
            (1 + 1) / (3 + 6)
        ],
        [
            (1 + 1) / (6 + 6),
            (3 + 1) / (6 + 6),
            (0 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (1 + 1) / (6 + 6),
            (0 + 1) / (6 + 6)
        ]])

    weights = np.zeros(theta.shape)
    normed_weights = np.zeros(theta.shape)
    for i in range(2):
        weights[i] = -np.log(theta[i])
        normed_weights[i] = weights[i] / weights[i].sum()

    # Verify inputs are nonnegative.
    clf = ComplementNB(alpha=1.0)
    assert_raises(ValueError, clf.fit, -X, Y)

    clf.fit(X, Y)

    # Check that counts/weights are correct.
    feature_count = np.array([[1, 3, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1]])
    assert_array_equal(clf.feature_count_, feature_count)
    class_count = np.array([3, 1])
    assert_array_equal(clf.class_count_, class_count)
    feature_all = np.array([1, 4, 1, 1, 1, 1])
    assert_array_equal(clf.feature_all_, feature_all)
    assert_array_almost_equal(clf.feature_log_prob_, weights)

    clf = ComplementNB(alpha=1.0, norm=True)
    clf.fit(X, Y)
    assert_array_almost_equal(clf.feature_log_prob_, normed_weights)
class ProbabilisticValidator():
    """
    # The probabilistic validator is a quick to train model used for validating the predictions of our main model
    # It is fit to the results our model gets on the validation set
    """
    _smoothing_factor = 0.5  # TODO: Auto-determine the smoothing factor depending on the info we know about the dataset
    _probabilistic_model = None
    _X_buff = None
    _Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model
        As of right now we go with ComplementNB
        """
        self._X_buff = []
        self._Y_buff = []
        self._predicted_buckets_buff = []
        self._real_buckets_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self._probabilistic_model = MultinomialNB(
                alpha=self._smoothing_factor)

            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]

            if len(self.buckets) < 3:
                self._probabilistic_model = ComplementNB(
                    alpha=self._smoothing_factor)
        else:
            self._probabilistic_model = ComplementNB(
                alpha=self._smoothing_factor)

            self.buckets = None

        self.data_type = col_stats['data_type']

        self.bucket_accuracy = {}

    def register_observation(self,
                             features_existence,
                             real_value,
                             predicted_value,
                             hmd=None):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param hmd: passed through to `get_value_bucket` when bucketizing the `predicted_value` and `real_value` against the column's histogram buckets
        """
        try:
            predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(
                predicted_value)
        except:
            predicted_value = None

        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(
                str(real_value).replace(',', '.'))
        except:
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats, hmd)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence
            self._X_buff.append(X)

            self._Y_buff.append(real_value_b)
            self._real_buckets_buff = self._Y_buff
            self._predicted_buckets_buff.append(predicted_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            nr_missing_features = len(
                [x for x in features_existence if x is False or x == 0])
            if nr_missing_features == 0:
                if real_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[real_value_b] = []
                self.bucket_accuracy[real_value_b].append(
                    int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self._X_buff.append(features_existence)
            self._Y_buff.append(real_value_b == predicted_value_b)
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)

    def get_accuracy_histogram(self):
        x = []
        y = []

        total_correct = 0
        total_vals = 0

        buckets_with_no_observations = []
        for bucket in range(len(self.buckets)):
            try:
                total_correct += sum(self.bucket_accuracy[bucket])
                total_vals += len(self.bucket_accuracy[bucket])
                y.append(
                    sum(self.bucket_accuracy[bucket]) /
                    len(self.bucket_accuracy[bucket]))
            except:
                # If no observations were made for this bucket
                buckets_with_no_observations.append(bucket)
                y.append(None)

            x.append(bucket)

        validation_set_accuracy = total_correct / total_vals
        for bucket in buckets_with_no_observations:
            y[x.index(bucket)] = validation_set_accuracy

        return {'buckets': x, 'accuracies': y}, validation_set_accuracy

    def partial_fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')

        if self.buckets is not None:
            self._probabilistic_model.partial_fit(self._X_buff,
                                                  self._Y_buff,
                                                  classes=self.bucket_keys)
        else:
            self._probabilistic_model.partial_fit(self._X_buff,
                                                  self._Y_buff,
                                                  classes=[True, False])

        np.seterr(divide=log_types['divide'])

        self._X_buff = []
        self._Y_buff = []

    def fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        log_types = np.seterr()
        np.seterr(divide='ignore')
        self._probabilistic_model.fit(self._X_buff, self._Y_buff)
        np.seterr(divide=log_types['divide'])

        self._X_buff = []
        self._Y_buff = []

    def get_confusion_matrix(self):
        matrix = confusion_matrix(self._real_buckets_buff,
                                  self._predicted_buckets_buff)
        return matrix

    def evaluate_prediction_accuracy(self, features_existence, predicted_value,
                                     always_use_model_prediction):
        """
        # Evaluate how likely a prediction is to be accurate, given the observed feature existence
        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
        distribution = distribution.tolist()

        if len([x for x in distribution if x > 0.01]) > 4:
            # @HACK
            mean = np.mean(distribution)
            std = np.std(distribution)

            distribution = [x if x > (mean - std) else 0 for x in distribution]

            sum_dist = sum(distribution)
            distribution = [x / sum_dist for x in distribution]

            min_val = min([x for x in distribution if x > 0.001])
            distribution = [
                x - min_val if x > min_val else 0 for x in distribution
            ]

            sum_dist = sum(distribution)
            distribution = [x / sum_dist for x in distribution]
            # @HACK

        return ProbabilityEvaluation(self.buckets, distribution,
                                     predicted_value,
                                     always_use_model_prediction)
Example #27
    df = df.drop(columns=feat, axis=1)
    dummy = None

# split data to train, heldout, and test datasets
print('INFO: Splitting data into train/heldout/test datasets.')
x_train = np.array(df[df['data'] == 'T'].drop(columns=['data', 'result']))
y_train = np.array(df[df['data'] == 'T']['result'].astype('bool'))
x_valid = np.array(df[df['data'] == 'V'].drop(columns=['data', 'result']))
y_valid = np.array(df[df['data'] == 'V']['result'].astype('bool'))
x_hold = np.array(df[df['data'] == 'H'].drop(columns=['data', 'result']))
y_hold = np.array(df[df['data'] == 'H']['result'].astype('bool'))

# machine learning classification models
classif = [('Gaussian Naive Bayes', GaussianNB()),
           ('Bernoulli Naive Bayes', BernoulliNB()),
           ('Complement Naive Bayes', ComplementNB()),
           ('Multinomial Naive Bayes', MultinomialNB()),
           ('Logistic Regression',
            LogisticRegression(solver='liblinear',
                               multi_class='ovr',
                               penalty='l2',
                               random_state=24)),
           ('Logistic Regression 2',
            LogisticRegression(solver='saga',
                               multi_class='ovr',
                               l1_ratio=0.3,
                               penalty='elasticnet',
                               max_iter=1000,
                               random_state=24)),
           ('Logistic Regression 3',
            LogisticRegression(solver='saga',
X, y = digits.data, digits.target
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)

# define the Gaussian naive Bayes model
gnb = GaussianNB().fit(Xtrain, Ytrain)
# check the accuracy score
acc_score = gnb.score(Xtest, Ytest)
# check the predictions
Y_pred = gnb.predict(Xtest)
# confusion matrix
cm = CM(Ytest, Y_pred)

# see which datasets Gaussian naive Bayes is suited to
h = .02
names = ["Multinomial", "Gaussian", "Bernoulli", "Complement"]
classifiers = [MultinomialNB(), GaussianNB(), BernoulliNB(), ComplementNB()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
figure = plt.figure(figsize=(6, 9))
i = 1
for ds_index, ds in enumerate(datasets):
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
    x1_min, x1_max = X[:, 0].min() - .5, X[:, 0].max() + .5
Example #29
class Classifier:
    def __init__(self, max_df=0.80, max_features=6500):
        self.count_vect = TfidfVectorizer(max_df=max_df,
                                          stop_words='english',
                                          max_features=max_features,
                                          use_idf=True)
        self.cnb = ComplementNB()
        np.random.seed(2222)

    def __fit(self):
        self.cnb.fit(self.x_train, self.train_set['category'])

    # Calling this method just after object creation is required in order to set up the data.
    # The test_size attribute specifies the proportion of the data held out as the test set
    def set_data(self, dataset: pd.DataFrame, labels: list, test_size=0.25):
        self.train_set, self.test_set = train_test_split(dataset,
                                                         test_size=test_size)
        self.x_train = self.count_vect.fit_transform(self.train_set['text'])
        self.labels = labels
        self.__fit()

    # This method returns the predicted label for the text provided
    def predict(self, text: str):
        txt = TextTools()
        text = txt.preprocess(text)
        feats = self.count_vect.transform([text])
        return self.cnb.predict(feats)

    # This method returns a matrix of probabilities computed by Complement Naive Bayes
    def get_predict_proba(self, text: str):
        feats = self.count_vect.transform([text])
        predictions = {
            'label': (self.cnb.predict(feats))[0],
            'features': self.cnb.predict_proba(feats)
        }
        return predictions

    # This method returns the f1-score
    def get_score(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        return f1_score(self.test_set['category'],
                        y_test_pred,
                        average=None,
                        labels=self.labels).mean()

    # This method plots the confusion matrix
    def get_cmatrix(self):
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        disp = plot_confusion_matrix(self.cnb,
                                     x_test,
                                     self.test_set['category'],
                                     display_labels=self.labels,
                                     cmap=plt.cm.Blues,
                                     normalize='true')
        plt.show()

    # This method computes the cosine similarity between item1 and item2
    # item[1,2] must be array-like
    def similarity(self, item1, item2):
        return cosine(item1, item2)
Example #30
def ensemble_all_general(X, y, fold):
    models = []
    num_trees = 150
    seed = 7

    est1 = SVC(kernel='linear', gamma='auto', C=1.0)
    est2 = SVC(kernel='rbf', gamma='auto', C=1.0)
    est3 = GaussianNB()
    est4 = BernoulliNB()
    est5 = ComplementNB()
    est6 = DecisionTreeClassifier()
    est7 = DecisionTreeClassifier(criterion="entropy")
    est8 = RandomForestClassifier(n_estimators=50)
    est9 = KNeighborsClassifier(n_neighbors=6)
    est10 = BaggingClassifier(base_estimator=est6,
                              n_estimators=num_trees,
                              random_state=seed)
    est11 = AdaBoostClassifier(n_estimators=50, random_state=seed)
    est12 = GradientBoostingClassifier(n_estimators=150, random_state=seed)

    models.append(('SVM-1', est1))
    models.append(('SVM-2', est2))
    models.append(('NB-1', est3))
    models.append(('NB-2', est4))
    models.append(('NB-3', est5))
    models.append(('DT-1', est6))
    models.append(('DT-2', est7))
    #models.append(('RF-1', est8))
    models.append(('RF-2', RandomForestClassifier()))
    #models.append(('KNN-1', est9))
    models.append(('KNN-2', KNeighborsClassifier()))
    #models.append(('LDA', LinearDiscriminantAnalysis()))
    #models.append(('bagging', est10))
    #models.append(('adaboost', est11))
    #models.append(('gradboost', est12))

    #plot_ml_model(models)

    # evaluate each model in turn
    #seed = 7
    results = []
    names = []
    scoring = 'accuracy'
    for name, model in models:
        #ld = model_selection.KFold(n_splits=10, random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     X,
                                                     y,
                                                     cv=fold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # boxplot algorithm comparison
    fig = pyplot.figure(figsize=(16, 16))
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    pyplot.boxplot(results)
    ax.set_xticklabels(names)
    pyplot.show()
Example #31
tee = config.Tee('../Results/%s/CNB_Regroup_%s%s_model.txt' % (config.args.dataset, config.args.tf_idf, config.args.remove_non), 'w')
from header_model_data import *
from sklearn.naive_bayes import ComplementNB

print("Regrouping the labels")
for i in range(len(yDF)):
    if(yDF[i] != pr.y_conversion('new-idea')):
        yDF[i] = pr.y_conversion('Non')
 
print("The Class Distribution is:")
classDist = Counter(yDF)
for k in classDist.keys():
    print("\t"+str(pr.conversion_y(k))+":"+str(classDist[k]))

print("Defining and doing a Complement Naive Bayes classifier for New-idea vs rest")

NB = ComplementNB()
scores = cross_validate(NB, xDF, yDF, cv=logo, scoring = scorer)

#print(scores)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
compute_stats(originalclass, predictedclass, True)
print("Confusion Matrix")

labelList = list(range(2))


print_cm(confusion_matrix(originalclass, predictedclass, labels = labelList), [pr.conversion_y(x) for x in labelList])

tee.close()