Example #1
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from features import make_Dictionary, extract_features

##Create the dictionary
training_dir = 'Train'
dictionary = make_Dictionary(training_dir)

## Label vectors (1 marks a SPAM message)
train_labels = np.zeros(139)
train_labels[88:139] = 1
train_matrix = extract_features(training_dir,dictionary)

# NB classifier
model1 = MultinomialNB()
model2 = GaussianNB()
model1.fit(train_matrix, train_labels)
model2.fit(train_matrix, train_labels)

# Test the unseen mails for SPAM
test_dir = 'TESTING_RESULT'
test_matrix = extract_features(test_dir,dictionary)
test = np.zeros(202)
test[180:202] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)

print("MultinomialNB | HAM | SPAM")
print(confusion_matrix(test,result1))
##Calculate the accuracy score
print(accuracy_score(test, result1))
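The snippet fits a GaussianNB model (model2) as well but only reports the MultinomialNB results; a parallel report for the second classifier could look like the following sketch (added here for completeness, not part of the original example).
print("GaussianNB | HAM | SPAM")
print(confusion_matrix(test, result2))
print(accuracy_score(test, result2))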
Example #2
# Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fit a Naive Bayes model to the training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict the test set results
y_pred = classifier.predict(X_test)

# Build the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualize the Naive Bayes results on the training set
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),  # grid step assumed; the rest of the plotting code is cut off in this snippet
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
Example #3
# imports assumed for the function below
import logging
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, normalize as sk_normalize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
def evaluate(model, data, alg = None, classifier="lr",fast=False,ratio = None,cv=10,normalize=False,random_state = None,return_y = False):
    X = model
    Y = data
    micros = []
    macros = []
#    for y,key in enumerate(data.labels.keys()):
#        for index,paper in enumerate(data.labels[key]):
#            if paper not in model.paper2id:
#                print("paper not in model: ", paper)
#                continue
#            X.append(model.paper_embeddings[model.paper2id[paper]])
#            Y.append(y)
    print("len X: ", len(X))
    print("len Y: ", len(Y))
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    clf = LogisticRegression()
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] 
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
            
        micros = []
        macros = []
        for i in range(cv):
            clf = LogisticRegression()
            if classifier.lower() == "svm":
                clf = SVC(cache_size=5000)
            elif classifier.lower() == "mlp":
                clf = MLPClassifier()
            elif classifier.lower() == "nb":
                clf = GaussianNB()

            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1-r,random_state=random_state)
            clf.fit(X_train,Y_train)
            prediction = clf.predict(X_test)
            #lpred = clf.predict_proba(X_test)
            #print("prediction shape: ", prediction[0])
            #print("y_test shape: ", Y_test[0])
            #print("Loss: ", log_loss(Y_test,lpred))
            micro = f1_score(Y_test, prediction, average='micro')
            macro = f1_score(Y_test, prediction, average='macro')
            micros.append(micro)
            macros.append(macro)

        micros = np.mean(micros)
        macros = np.mean(macros)

 
        df["ratio"].append(r)
        df["micro"].append(np.mean(micro))
        df["macro"].append(np.mean(macro))
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"] = model.total_samples
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)
        #df["L2"].append(model.l2)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r,micros,macros))

    if fast:
        if return_y:
            return micros,macros,Y_test,prediction
        return micros,macros
    else:
        return pd.DataFrame(df)
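A minimal usage sketch for evaluate(); the embedding matrix and labels below are synthetic placeholders chosen only to illustrate the expected shapes, not data from the original project.
import numpy as np

X_demo = np.random.rand(200, 16)             # 200 embeddings of dimension 16
Y_demo = np.random.randint(0, 3, size=200)   # 3 made-up classes
scores = evaluate(X_demo, Y_demo, classifier="nb", cv=3, ratio=[0.5])
print(scores)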
Example #4
def self_projection(
    X,
    cell_types,
    classifier="LR",
    penalty="l1",
    sparsity=0.5,
    fraction=0.5,
    solver="liblinear",
    n=0,
    cv=5,
    whole=False,
    n_jobs=None,
):
    # n = 100 should be good.
    """
    This is the core function for running self-projection.

    Input
    -----
    X: `numpy.array` or sparse matrix
        the expression matrix, e.g. ad.raw.X.
    cell_types: `list of String/int`
        the cell clustering assignment
    classifier: `String` optional (default: 'LR')
        a machine learning model among "LR" (Logistic Regression), \
        "RF" (Random Forest), "GNB" (Gaussian Naive Bayes), "SVM" (Support Vector Machine) and "DT" (Decision Tree).
    penalty: `String` optional (default: 'l1')
        the regularization penalty used by logistic regression. Use 'l1' or 'l2'.
    sparsity: `float` optional (default: 0.5)
        The sparsity parameter (C in sklearn.linear_model.LogisticRegression) for the logistic regression model.
    fraction: `float` optional (default: 0.5)
        Fraction of data included in the training set. 0.5 means use half of the data for training,
        if half of the data is fewer than the maximum number of cells (n).
    n: `int` optional (default: 0)
        Maximum number of cells included in the training set for each cluster of cells.
        Only fraction is used to split the dataset if n is 0.
    cv: `int` optional (default: 5)
        number of folds for cross-validation on the training set.
        0 means no cross-validation.
    whole: `bool` optional (default: False)
        whether to measure performance on the whole dataset (training and test combined).
    n_jobs: `int` optional (default: None - unlimited)
        number of threads to use with the different classifiers.

    Returns
    -----
    y_prob, y_pred, y_test, clf, cvsm, accuracy_test
    y_prob: `matrix of float`
        prediction probability
    y_pred: `list of string/int`
        predicted clustering of the test set
    y_test: `list of string/int`
        real clustering of the test set
    clf: the classifier model
    cvsm: `float`
        mean cross-validation accuracy on the training set
    accuracy_test: `float`
        accuracy on the hold-out set
    """
    # split the data into training and testing
    if n > 0:
        X_train, X_test, y_train, y_test = train_test_split_per_type(
            X, cell_types, n=n, frac=(1 - fraction))
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, cell_types, stratify=cell_types,
            test_size=fraction)  # fraction means test size
    # set the classifier
    if classifier == "LR":
        clf = LogisticRegression(
            random_state=1,
            penalty=penalty,
            C=sparsity,
            multi_class="ovr",
            solver=solver,
        )
    elif classifier == "RF":
        clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs)
    elif classifier == "GNB":
        clf = GaussianNB()
    elif classifier == "GPC":
        clf = GaussianProcessClassifier(n_jobs=n_jobs)
    elif classifier == "SVM":
        clf = SVC(probability=True)
    elif classifier == "SH":
        clf = SGDClassifier(loss="squared_hinge", n_jobs=n_jobs)
    elif classifier == "PCP":
        clf = SGDClassifier(loss="perceptron", n_jobs=n_jobs)
    elif classifier == "DT":
        clf = DecisionTreeClassifier()

    # mean cross validation score
    cvsm = 0
    if cv > 0:
        cvs = cross_val_score(clf,
                              X_train,
                              np.array(y_train),
                              cv=cv,
                              scoring="accuracy",
                              n_jobs=n_jobs)
        cvsm = cvs.mean()
        print("Mean CV accuracy: %.4f" % cvsm)
    # accuracy on cross validation and on test set
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_train, y_train)
    print("Accuracy on the training set: %.4f" % accuracy)
    accuracy_test = clf.score(X_test, y_test)
    print("Accuracy on the hold-out set: %.4f" % accuracy_test)

    # accuracy of the whole dataset
    if whole:
        accuracy = clf.score(X, cell_types)
        print("Accuracy on the whole set: %.4f" % accuracy)

    # get predicted probability on the test set
    y_prob = None
    if classifier not in ["SH", "PCP"]:
        y_prob = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)

    return y_prob, y_pred, y_test, clf, cvsm, accuracy_test
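A hypothetical call to self_projection(); X_demo and types_demo are illustrative stand-ins for an expression matrix and its cluster assignment, and the classifier imports used inside the function are assumed to already be in scope.
import numpy as np

X_demo = np.random.rand(300, 50)                            # 300 cells x 50 genes
types_demo = list(np.random.choice(["A", "B", "C"], 300))   # cluster label per cell
y_prob, y_pred, y_test, clf, cvsm, acc_test = self_projection(
    X_demo, types_demo, classifier="GNB", fraction=0.5, cv=3)
print(y_pred[:5], acc_test)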
Example #5
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from varname import nameof

sv = SVC()
RFC = RandomForestClassifier()
GaussianN = GaussianNB()
KNC = KNeighborsClassifier(n_neighbors=7)
xgboost = XGBClassifier()
gradientboost = GradientBoostingClassifier()

df = pd.read_csv(r'dataframes/full_csv', index_col=[0])

# with open(r'objects/wektor_lst', 'rb') as f:
#     res_wek = np.load(f)
res_wek = np.load(r'objects/wektors.npy', allow_pickle=True)
res_wek = [wek[0:20] for wek in res_wek]
zzz = np.stack(res_wek)
res_wek = zzz.reshape([7023, 2000])

scoring = ['precision', 'recall', 'f1', 'accuracy']

# the label vector for cross_validate is not shown in this snippet; 'labels' is a placeholder
sv_score_array = cross_validate(sv, res_wek, labels, scoring=scoring)
Example #6
## function to fit a classifier and return its predictions
## (the definition below is reconstructed; the original snippet begins mid-function)
def get_predictions(clf, X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    return y_pred, y_pred_prob

## function to get classifiers score 

def print_scores(y_test,y_pred,y_pred_prob):
    print('test-set confusion matrix:\n', confusion_matrix(y_test,y_pred)) 
    print("recall score: ", recall_score(y_test,y_pred))
    print("precision score: ", precision_score(y_test,y_pred))
    print("f1 score: ", f1_score(y_test,y_pred))
    print("accuracy score: ", accuracy_score(y_test,y_pred))
    print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred_prob[:,1])))

#%%

# training a naive bayes model for classification 
y_pred, y_pred_prob = get_predictions(GaussianNB(), X_train, y_train, X_test)

print_scores(y_test,y_pred,y_pred_prob)

# Accuracy = 96.91 %


# hence we can see that the model has correctly classified all the 135 values as frauds / shill bidders

#%%
# training a logistic regression model 
y_pred, y_pred_prob = get_predictions(LogisticRegression(C=0.01, penalty='l1', solver='liblinear'), X_train, y_train, X_test)  # liblinear solver supports the l1 penalty

print_scores(y_test,y_pred,y_pred_prob)

# Accuracy = 96.28 %
Example #7
# Import libraries
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# define data, create model and fit data
X = Variables
Y = Classes
Model = GaussianNB().fit(X, Y)  # GaussianNB has no required hyperparameters
# Score model
Model.score(X, Y)
# Predict new classes
NewY = Model.predict(NewX)
Example #8
import sklearn.preprocessing

scaled = ['age', 'balance', 'day', 'month',
          'duration', 'campaign', 'pdays', 'previous']
bank[scaled] = sklearn.preprocessing.scale(bank[scaled].astype(float))

# Training set and targets
X = bank.drop(columns='y').values
t = bank['y'].values

#experiment 1
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size = 0.2, shuffle = True)

#experiment 2
from sklearn.naive_bayes import GaussianNB
gaussian_clf = GaussianNB()
gaussian_clf.fit(X_train, t_train)

#experiment 3
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
gaussian_score = gaussian_clf.score(X_test, t_test)

gaussian_pred = gaussian_clf.predict(X_test)
cm = confusion_matrix(t_test, gaussian_pred)

gaussian_proba = gaussian_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(t_test, gaussian_proba)
auc = roc_auc_score(t_test, gaussian_proba)

print "Gausian CLF Score: " + str(gaussian_score)
print "Confusion Matrix "
Example #9
def getNaiveBayesWinsconsinAccuracy():
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    gnb = GaussianNB()
    pred_gnb = gnb.fit(data_train_feature_selected, target_train).predict(data_test_feature_selected)
    return accuracy_score(target_test, pred_gnb, normalize = True)
Example #10
sgd.score(X_train, Y_train)

acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

#Random Forest:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

#Gaussian Naive Bayes:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)

#Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

#Which is the best Model ?
results = pd.DataFrame({
    'Model': [
        'Random Forest', 'Stochastic Gradient Descent', 'Gaussian Naive Bayes',
        'Decision Tree'
    ],
    # the 'Score' column is assumed, built from the accuracy variables computed above
    'Score': [acc_random_forest, acc_sgd, acc_gaussian, acc_decision_tree]})
Example #11
feature_columns = [  # the head of this list is truncated in the original snippet
    'AST', 'BLK']

#Pandas DataFrame allows you to select columns.
#We use column selection to split the data into features and class.
nba_feature = nba[feature_columns]
nba_class = nba[class_column]

print(nba_feature[0:3])
print(list(nba_class[0:3]))

train_feature, test_feature, train_class, test_class = \
    train_test_split(nba_feature, nba_class, stratify=nba_class, \
    train_size=0.75, test_size=0.25, random_state=0)

training_accuracy = []
test_accuracy = []

nb = GaussianNB().fit(train_feature, train_class)
print("Test set score: {:.3f}".format(nb.score(test_feature, test_class)))
prediction = nb.predict(test_feature)
print("Confusion matrix:")
print(
    pd.crosstab(test_class,
                prediction,
                rownames=['True'],
                colnames=['Predicted'],
                margins=True))

scores = cross_val_score(nb, nba_feature, nba_class, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
Example #12
Y = df.Churn

#PREPARING TRAINING DATASET AND TEST DATASET
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=1)

import time

#GAUSSIAN(NB) NAIVE BAYES
start = time.time()
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gaussianNB = GaussianNB()
gaussianNB.fit(X_train, Y_train)
Y_pred_gnb = gaussianNB.predict(X_test)
acc_score_gnb = round(metrics.accuracy_score(Y_test, Y_pred_gnb) * 100)
confusion_gnb = metrics.confusion_matrix(Y_test, Y_pred_gnb)
end = time.time()
proc_time_gnb = end - start
#print("Total execution time: {}".format(proc_time_gnb), "seconds")

#K-NEAREST NEIGHBORS(KNN)
start = time.time()
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)
acc_score_knn = round(metrics.accuracy_score(Y_test, Y_pred_knn) * 100)
Example #13
from csm import SEA, StratifiedBagging, REA, LearnppCDS, LearnppNIE, OUSE, KMeanClustering
from strlearn.evaluators import TestThenTrain
from sklearn.naive_bayes import GaussianNB
from strlearn.metrics import (balanced_accuracy_score, f1_score,
                              geometric_mean_score_1, precision, recall,
                              specificity)
import sys
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from skmultiflow.trees import HoeffdingTree

# Select streams and methods
streams = h.realstreams2()
print(len(streams))

rea = REA(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                            random_state=42),
          number_of_classifiers=5)
cds = LearnppCDS(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                                   random_state=42),
                 number_of_classifiers=5)
nie = LearnppNIE(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                                   random_state=42),
                 number_of_classifiers=5)
ouse = OUSE(base_classifier=StratifiedBagging(base_estimator=GaussianNB(),
                                              random_state=42),
            number_of_classifiers=5)
kmc = KMeanClustering(base_classifier=StratifiedBagging(
    base_estimator=GaussianNB(), random_state=42),
                      number_of_classifiers=5)
ros_knorau2 = SEA(base_estimator=StratifiedBagging(base_estimator=GaussianNB(),
Example #14
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

# needs refactoring here
method_dict = {
    "LG": LogisticRegression(),
    "KN": KNeighborsClassifier(),
    "SV": SVC(),
    "GB": GradientBoostingClassifier(n_estimators=1000),
    "DT": tree.DecisionTreeClassifier(),
    "RF": RandomForestClassifier(n_estimators=1000),
    "MP": MLPClassifier(alpha=1),
    "NB": GaussianNB(),
}

dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(),
    "Gradient Boosting Classifier":
    GradientBoostingClassifier(n_estimators=1000),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=1000),
    "Neural Net": MLPClassifier(alpha=1),
    "Naive Bayes": GaussianNB(),
    # "AdaBoost": AdaBoostClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
    # "Gaussian Process": GaussianProcessClassifier()
Example #15
from sklearn.naive_bayes import GaussianNB


def classify(features_train, labels_train):
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf
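A toy call to classify(); the feature and label lists are made up purely for illustration.
features_demo = [[0.0, 1.1], [1.0, 0.2], [0.9, 0.1], [0.1, 1.2]]
labels_demo = [0, 1, 1, 0]
clf = classify(features_demo, labels_demo)
print(clf.predict([[0.8, 0.3]]))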
Example #16
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

array = dataset.values
X = array[:, 0:4]
y = array[:, 4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X,
                                                                y,
                                                                test_size=0.20,
                                                                random_state=1,
                                                                shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear',
                                        multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
Example #17
a = pd.Series()
a_index = list(range(1, 11))
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for i in list(range(1, 11)):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    a = a.append(pd.Series(metrics.accuracy_score(prediction, test_Y)))
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracies for different values of n are:', a.values,
      'with the max value as ', a.values.max())

model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print('The accuracy of the NaiveBayes is',
      metrics.accuracy_score(prediction6, test_Y))

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(prediction7, test_Y))

from sklearn.model_selection import KFold  #for K-fold cross validation
from sklearn.model_selection import cross_val_score  #score evaluation
from sklearn.model_selection import cross_val_predict  #prediction
kfold = KFold(n_splits=10)  # remaining arguments are cut off in the original snippet
Example #18
n_non_POI = int(len(labels)) - n_POI
file.write('Cleaned dataset has {0:d} POI and {1:d} non POI\n'.format(
    n_POI, n_non_POI))
file.write('\n')

for classifier_name in classifier_list:

    print(classifier_name)
    file.write('Classifier name: {0}\n'.format(classifier_name))

    # hyperparameter setting
    parameters = {}

    if classifier_name == 'NB':
        # Naive Bayes --> no hyperparameter to tune
        classifier = GaussianNB()

    if classifier_name == 'SVC':
        # Support vector machine--> kernel and margin are tuned
        classifier = SVC()
        parameters['classifier__kernel'] = ['linear', 'poly', 'rbf']
        parameters['classifier__C'] = [10, 100, 1000]
        parameters['classifier__gamma'] = [0.01, 0.1, 1, 'scale']

    if classifier_name == 'KNN':
        # KNN --> number of neighbors, weight function and power parameters are tuned
        classifier = KNeighborsClassifier(algorithm='auto')
        parameters['classifier__n_neighbors'] = [5, 10, 15]
        parameters['classifier__weights'] = ['distance', 'uniform']
        parameters['classifier__p'] = [1, 2]
Example #19
    if not pca and estimator_name not in ['GaussianNB', 'NeuralNetwork']:
        process_feature_importances(model, estimator_name, pca, fine_tune)

gridsearch_param = {'scoring': 'roc_auc', 'verbose': 2 , 'n_jobs': -1, 'cv': 3}
estimators_params_grid = {
    'LogisticRegression': {'C' : [10**i for i in range(-3,4)], 'penalty': ['l2', 'l1']},
    'DecisionTreeClassifier': {'min_samples_split': [1600, 1800, 2000, 2200, 2400]},
    'RandomForestClassifier': {'n_estimators' : [50,100,200,300,400], 'min_samples_split': [50, 100, 150, 200]},
    'LGBMClassifier': {'num_leaves': [500, 1000, 1500, 2000, 2500], 'n_estimators': [200, 400, 600, 800, 1000]},
    }

print_info('Start experiments')

experiment(LogisticRegression(random_state=SEED, n_jobs=-1, solver='saga', max_iter=500), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(DecisionTreeClassifier(random_state=SEED), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(GaussianNB(), train_x, train_y, test_x, test_y, pca = False, fine_tune = False)
experiment(RandomForestClassifier(random_state=SEED, n_jobs=-1), train_x, train_y, test_x, test_y, pca = False, fine_tune = True)

lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state = SEED,
                          feature_fraction=0.7,
                          learning_rate=0.05,
                          n_jobs=-1,
                          silent = False,
                          )
experiment(lgbm, train_x, train_y, test_x, test_y, pca = False, fine_tune = True)

""" Bagging with Lightgbm (Combine boosting and bagging)"""
print_info('Start Bagging with Lightgbm')
lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state = SEED,
Example #20
additional_features.remove('salary')
additional_features.remove('poi')
additional_features.remove('email_address')
initial_features = ['poi', 'salary']

#  initiating automatic feature search
'''
final_features_SVC = auto_feature(SVC(),
                                 my_dataset, additional_features, initial_features, iterate=2)
final_features_NB = auto_feature(GaussianNB(),
                                 my_dataset, additional_features, initial_features, iterate=2)
final_features_FR = auto_feature(RandomForestClassifier(),
                                 my_dataset, additional_features, initial_features, iterate=1)
                                 '''

final_features_NB = auto_feature(GaussianNB(),
                                 my_dataset,
                                 additional_features,
                                 initial_features,
                                 iterate=2)

# OPTIMIZING SELECTED CLASSIFIER
# ******************************
#   optimizing features in classifier using default parameters

clf_def = DecisionTreeClassifier()
optimal_features = auto_feature(clf_def,
                                my_dataset,
                                additional_features,
                                initial_features,
                                iterate=5)
Example #21
from sklearn import datasets, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data[:, 1:3], iris.target

################## define classifier #####################
clf1 = KNeighborsClassifier(n_neighbors=1)

clf2 = RandomForestClassifier(random_state=1)

clf3 = GaussianNB()

lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

################## class result #####################
for clf, label in zip(
    [clf1, clf2, clf3, sclf],
    ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):

    scores = model_selection.cross_val_score(clf,
                                             x,
                                             y,
                                             cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Example #22
                    X_train = np.concatenate(
                        (X_train, np.array(weight[idx[doc]]).reshape(1, d)),
                        axis=0).reshape(cnt + 1, d)
                    Y_train = np.concatenate(
                        (Y_train, np.array([score]).reshape(1, 1)),
                        axis=0).reshape(cnt + 1, 1)
                cnt += 1
                line = next(f)

            # X_train_PCA =

            print('training...')
            train_start_t = time.time()
            #lasso.train(X_train, Y_train)                                                        #train
            #call sklearn.Lasso()
            clfRand = GaussianNB()
            clfRand.fit(X_train, Y_train.ravel())  # flatten the (n, 1) label column for fitting
            train_end_t = time.time()
            print('training finish, cost time: %d' %
                  (train_end_t - train_start_t))

            test_set = dict()
            f = line_reader('F:/ECNU/Course/KnowledgeAna/tag_data_set/' + QID +
                            '/test_set')
            line = next(f)
            print('loading testing set...')
            cnt = 0
            while line:
                content = re.split('\s+', line)
                doc = content[0]
                score = float(content[1])
Example #23
scoring = 'accuracy'

# Classifiers
names = ["Nearest Neighbors", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "SVM Linear", "SVM RBF", "SVM Sigmoid"]

classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10,
    max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel = 'linear'),
    SVC(kernel = 'rbf'),
    SVC(kernel = 'sigmoid')
]
models = zip(names, classifiers)

# evaluation classifiers
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle must be enabled when random_state is set
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
Example #24
 def __init__(self, **kwargs):
     super(NaiveBayes, self).__init__()
     super(NaiveBayes, self).SetModel(GaussianNB(**kwargs))
Example #25
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(scaled_X_train, Y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(scaled_X_test, Y_test)))

pred_lda = lda.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_lda))
print(classification_report(Y_test, pred_lda))


# In[47]:


#fit a naive bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(scaled_X_train, Y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(scaled_X_train, Y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(scaled_X_test, Y_test)))

pred_gnb = gnb.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_gnb))
print(classification_report(Y_test, pred_gnb))


# In[48]:


#fit a svm classifier
Example #26
 def __init__(self):
     super(GaussianNaiveBayes, self).__init__(name="Gaussian Bayes")
     self.model = GaussianNB()
Example #27
from time import time
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"

t0 = time()
pred = clf.predict(features_test)
print "testing time:", round(time()-t0, 3), "s"

accuracy = clf.score(features_test, labels_test)
print "Accuracy: "
print accuracy
#########################################################

Example #28
training_L = []
testing_L=[]
for i in range(len(training)):
    training_L.append([training[i]])

for i in range(len(testing)):
    testing_L.append([testing[i]])

classes = []
for i in cmu_sum:
    if (i in less_than):
        classes.append(0)
    else:
        classes.append(1)

class_train= classes[0:int(len(classes) * split_percentage)]
class_test=classes[len(class_train):len(classes)]

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(training_L, class_train)

predicts = model.predict(testing_L)

# Calculate Accuracy Rate by using accuracy_score()
print ("Accuracy Rate: %f" % accuracy_score(class_test, predicts))

Example #29
def evaluate_multilabel(model, data, alg = None, classifier="lr",fast=False,ratio = None, cv = 10, random_state = None,normalize=False):
    X = []
    Y = []
    for pid in range(len(model.word2id)):
        X.append(model.word_embeddings[pid])
        
    Y = np.zeros((len(X),len(data.labels)))
    
    for y,key in enumerate(data.labels.keys()):
        for index,paper in enumerate(data.labels[key]):
            pid = model.word2id[paper]
            Y[pid][y] = 1
    if normalize:
        X = sk_normalize(X)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] 
        
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break

        if classifier.lower() == 'lr':
            clf = LogisticRegression()
        elif classifier.lower() == "svm":
            clf = SVC(cache_size=5000)
        elif classifier.lower() == "mlp":
            clf = MLPClassifier()
        elif classifier.lower() == "nb":
            clf = GaussianNB()
            
        micros = []
        macros = []
        for i in range(cv):
            micro,macro = evaluateNodeClassification(X,Y,1-r,clf=clf,random_state = random_state)
            micros.append(micro)
            macros.append(macro)
        micros = np.mean(micros)
        macros = np.mean(macros)
     
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        #df["alg"].append(alg)
        #df["data"].append(str(data))
        #df["total_samples"].append(model.total_samples)
        #df["negative"].append(model.negative)
        #df["walk_window"].append(model.walk_window)
        #df["walk_probability"].append(model.walk_probability)   
        #df["L2"].append(model.l2)   
       
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r,micros,macros))
        
        
    if fast:
        return micros,macros
    else:
        return df
Example #30
import numpy as np
import sys

sys.path.append("..")
from utile.Processor import processor  # noqa
from utile.Timer import timer  # noqa

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

data_frame = load_iris()
input_data = data_frame.data
data_targets = data_frame.target
X_train, X_test, y_train, y_test = train_test_split(input_data, data_targets, test_size=0.2)

model_knc = KNeighborsClassifier(n_neighbors=5)
model_svc = SVC()
model_rfc = RandomForestClassifier(n_estimators=10)
model_gnb = GaussianNB()
model_mnb = MultinomialNB()


def knc_modeler():
    model_knc.fit(X_train, y_train)
    value = model_knc.score(X_test, y_test)
    return value


def svc_modeler():
    model_svc.fit(X_train, y_train)
    value = model_svc.score(X_test, y_test)
    return value