Example #1
from sklearn.naive_bayes import GaussianNB


def GaussianBayes(X, Y):
    # Fit a Gaussian Naive Bayes model and report predictions for one sample.
    clf = GaussianNB()
    clf.fit(X, Y)
    data = [[6.9, 3.1, 5.1, 2.3]]
    print(clf.predict(data))
    print(clf.predict_proba(data))
    print(clf.predict_log_proba(data))
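A minimal way to exercise the function above, on made-up four-feature rows (the hard-coded point inside looks like an iris sample, so iris-shaped inputs are assumed here):

import numpy as np

X_demo = np.array([[5.1, 3.5, 1.4, 0.2],
                   [4.9, 3.0, 1.4, 0.2],
                   [6.7, 3.1, 4.7, 1.5],
                   [6.9, 3.1, 4.9, 1.5]])
Y_demo = np.array([0, 0, 1, 1])  # made-up labels for illustration

GaussianBayes(X_demo, Y_demo)  # prints class, probabilities, log-probabilities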
Example #2
def result(request):
    data = pd.read_csv(
        r'C:\Users\User\Desktop\DiabetesPrediction\diabetes.csv')

    from sklearn.model_selection import train_test_split
    x = data.drop('Outcome', axis=1)
    y = data['Outcome']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42, stratify=data['Outcome'])

    from sklearn.naive_bayes import GaussianNB
    nb = GaussianNB()
    nb.fit(x_train, y_train)

    val1 = float(request.GET['n1'])
    val2 = float(request.GET['n2'])
    val3 = float(request.GET['n3'])
    val4 = float(request.GET['n4'])
    val5 = float(request.GET['n5'])
    val6 = float(request.GET['n6'])
    val7 = float(request.GET['n7'])
    val8 = float(request.GET['n8'])

    pred = nb.predict([[val1, val2, val3, val4, val5, val6, val7, val8]])
    pred_prob = nb.predict_log_proba(
        [[val1, val2, val3, val4, val5, val6, val7, val8]])
    print("Log-probability of diabetes = {}".format(pred_prob))
    if pred[0] == 1:
        result1 = "POSITIVE"
    else:
        result1 = "NEGATIVE"

    return render(request, 'predict.html', {"result2": result1})
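Refitting on every request, as the view above does, re-reads the CSV and retrains each time. A sketch of the same view with the model fitted once at import time (same path, columns and template as above; result_cached is a hypothetical name):

import pandas as pd
from django.shortcuts import render
from sklearn.naive_bayes import GaussianNB

_data = pd.read_csv(r'C:\Users\User\Desktop\DiabetesPrediction\diabetes.csv')
_nb = GaussianNB().fit(_data.drop('Outcome', axis=1), _data['Outcome'])

def result_cached(request):
    # Same eight GET parameters as the view above.
    vals = [float(request.GET['n%d' % i]) for i in range(1, 9)]
    label = "POSITIVE" if _nb.predict([vals])[0] == 1 else "NEGATIVE"
    return render(request, 'predict.html', {"result2": label})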
Example #3
def classify_audio(request):
    # Step 0: get the raw bytes to classify.
    data = request.body
    # Step 1: buf_to_float expects an even number of bytes (16-bit samples),
    # so drop the trailing byte if the length is odd.
    if len(data) % 2 == 0:
        data_float_raw = librosa.util.buf_to_float(data)
    else:
        data_float_raw = librosa.util.buf_to_float(data[:-1])

    # Step 2: trim leading and trailing silence.
    data_float, index = librosa.effects.trim(data_float_raw)
    # Step 3: extract the MFCC features for the incoming audio.
    prediction_mfcc = get_mfcc_feature_data(data_float)
    # Step 4: flatten the MFCC matrix into a single feature vector.
    prediction_mfcc_fl = prediction_mfcc.flatten()
    # Step 5: load all training rows from the Memory table.
    df = pd.DataFrame(list(Memory.objects.all().values()))
    # Step 6: build train_data and train_label.
    train_data_list = []
    for i in range(0, len(df)):
        train_data_list.append(
            bytes_numpy(df.loc[i, "blob_data_mfcc"]).flatten())
    train_data = np.array(train_data_list)

    train_label = df["ground_truth"].values
    # Step 7: fit the Bayes classifier.
    clf = GaussianNB()
    clf.fit(train_data, train_label)
    # Step 8: make the prediction.
    prediction = clf.predict([prediction_mfcc_fl])
    print(prediction)
    # Step 9: compute relative scores by dividing each log-probability
    # by the sum of the log-probabilities.
    relative_predict = clf.predict_log_proba([
        prediction_mfcc_fl
    ]) / clf.predict_log_proba([prediction_mfcc_fl]).sum()
    relative_predict_round_flat = np.around(relative_predict * 100,
                                            4).flatten()

    # Step 10: pair each class with its relative score.
    result_dict = {}
    for el_cl, el_pre in zip(clf.classes_, relative_predict_round_flat):
        result_dict[el_cl] = el_pre

    # Step 11: sort the dict by score.
    d_sorted = dict(sorted(result_dict.items(), key=lambda kv: kv[1]))
    print(d_sorted)
    return JsonResponse({"prediction": d_sorted})
Example #4
def nb_predict(clf, Xtest, cc):
    # Classes come back in order 0, 1 (clf.classes_ == array([0, 1])), so
    # log_odds measures how strongly the model favours class 1 over class 0.
    log0, log1 = clf.predict_log_proba(Xtest)[0]
    log_odds = log1 - log0
    if log_odds > cc:
        return 1
    return 0
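A usage sketch for the cutoff helper above, on made-up data; cc=0 reproduces the usual argmax decision in the binary case, while a larger cc demands stronger evidence before predicting class 1:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-2.0, -1.0], [-1.5, -1.2], [1.0, 1.1], [2.0, 1.5]])
y = np.array([0, 0, 1, 1])
clf = GaussianNB().fit(X, y)

print(nb_predict(clf, [[0.2, 0.1]], cc=0.0))  # matches clf.predict
print(nb_predict(clf, [[0.2, 0.1]], cc=5.0))  # needs log-odds > 5 for class 1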
Example #6
class NBEnsembler(Ensembler):
    def __init__(self) -> None:
        self.model = GaussianNB()

    def _fit(self, x, y):
        self.model.fit(x, y)

    def _predict(self, x):
        return self.model.predict_log_proba(x)
Example #7
def GaussianNBLocalModel(localTrainFeature, localTestFeature, localTrainLabel,
                         config):
    print('train...')
    model = GaussianNB()
    model.fit(X=localTrainFeature.toarray(), y=localTrainLabel)
    print('predict...')
    if not config['prob']:
        return model.predict(localTestFeature.toarray())
    else:
        return model.predict_log_proba(localTestFeature.toarray())
Example #8
class NaiveBayes:
    __theta = 0
    __sigma = 0

    def __init__(self):
        pass 
        #self.__new_data = 0

    def learning(self, x_data, y_data):
        self.rssi = np.loadtxt(x_data, delimiter=',')
        print(self.rssi)

        self.position = np.loadtxt(y_data, delimiter=',')
        print(self.position)

        self.gaussian_nb = GaussianNB()

        # sklearn.cross_validation was removed; use sklearn.model_selection.
        from sklearn.model_selection import train_test_split
        rssi_train, rssi_test, position_train, position_test = train_test_split(
            self.rssi, self.position, random_state=0)

        self.gaussian_nb.fit(rssi_train, position_train)
        print("theta", self.gaussian_nb.theta_)
        print("sigma", self.gaussian_nb.sigma_)

        predicted = self.gaussian_nb.predict(rssi_test)

        print(metrics.accuracy_score(position_test, predicted))
    '''
    def set_params(self, theta, sigma):
        __theta = theta
        __sigma = sigma
        print(__theta)
        print(__sigma)
    '''

    def inference(self,r_data):
        self.predicted_class = self.gaussian_nb.predict(r_data)

        post_prob = self.gaussian_nb.predict_proba(r_data)
        log_prob = self.gaussian_nb.predict_log_proba(r_data)
        self.post_prob_float16 = post_prob.astype(np.float16)
        #E = 1*self.post_prob_float16[0][0]+2*self.post_prob_float16[0][1]+3*self.post_prob_float16[0][2]
        #var = (1*self.post_prob_float16[0][0]+4*self.post_prob_float16[0][1]+9*self.post_prob_float16[0][2])-E**2
        #print(self.post_prob_float16)
        #print(self.post_prob_float16[0])
        #print(var)
        print(self.predicted_class)
        #print(self.gaussian_nb.class_prior_)
        #print(log_prob)

        return self.predicted_class

    def output(self):
        output = graph.Graph()
        output.bar_graph(self.post_prob_float16[0])
Example #9
def demoOne():
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])

    clf = GaussianNB(priors=None)
    clf.fit(X, y)
    print(clf.predict([[-0.8, -1]]))
    print('predict_prob: ', clf.predict_proba([[-0.8, -1]]))
    print('predict_log_prob: ', clf.predict_log_proba([[-0.8, -1]]))
    # Scoring against the model's own predictions always returns 1.0.
    print(clf.score([[-0.8, -1]], clf.predict([[-0.8, -1]])))
    print(clf.partial_fit(X, y, classes=np.unique(y)))
    print(clf.set_params())
    return X, y
Example #10
class ClassifierNavi:
    """Naive Bayes classifier to decide whether pixels are navigable"""

    # pylint: disable=too-few-public-methods

    def __init__(self):
        """Construct the navigable-pixels classifier"""
        self.__cls = GaussianNB()
        self.__cls.fit(TRAINING_X, TRAINING_NAVI)

    def predict(self, img):
        """Return ln p(navigable | color) - ln p(obstacle | color) per pixel"""
        scores = self.__cls.predict_log_proba(img.reshape(-1, 3))
        return (scores[:, 1] - scores[:, 0]).reshape(img.shape[:2])
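TRAINING_X and TRAINING_NAVI live elsewhere in that project, so here is a self-contained sketch of the same per-pixel log-odds idea with synthetic colors (all names and values below are made up):

import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.default_rng(0)
ground = rng.normal(120, 20, size=(200, 3))  # fake "navigable" colors
rocks = rng.normal(60, 20, size=(200, 3))    # fake "obstacle" colors
clf = GaussianNB().fit(np.vstack([rocks, ground]), [0] * 200 + [1] * 200)

img = rng.integers(0, 256, size=(4, 5, 3)).astype(float)
scores = clf.predict_log_proba(img.reshape(-1, 3))
log_odds = (scores[:, 1] - scores[:, 0]).reshape(img.shape[:2])
print(log_odds.shape)  # (4, 5): one log-odds value per pixel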
Example #11
def test_gnb():
    """
    Gaussian Naive Bayes classification.

    This checks that GaussianNB implements fit and predict and returns
    correct values for a simple toy dataset.
    """

    clf = GaussianNB()
    y_pred = clf.fit(X, y).predict(X)
    assert_array_equal(y_pred, y)

    y_pred_proba = clf.predict_proba(X)
    y_pred_log_proba = clf.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
Example #13
def test_gnb():
    # Gaussian Naive Bayes classification.
    # This checks that GaussianNB implements fit and predict and returns
    # correct values for a simple toy dataset.

    clf = GaussianNB()
    y_pred = clf.fit(X, y).predict(X)
    assert_array_equal(y_pred, y)

    y_pred_proba = clf.predict_proba(X)
    y_pred_log_proba = clf.predict_log_proba(X)
    assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)

    # Test whether label mismatch between target y and classes raises
    # an Error
    # FIXME Remove this test once the more general partial_fit tests are merged
    assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1])
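For contrast with the ValueError asserted above, a small sketch of partial_fit used correctly: the full class list is declared on the first call, after which further batches can be fed in freely:

import numpy as np
from sklearn.naive_bayes import GaussianNB

Xb = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
yb = np.array([1, 1, 2, 2])

clf = GaussianNB()
clf.partial_fit(Xb[:2], yb[:2], classes=np.unique(yb))  # classes on first call only
clf.partial_fit(Xb[2:], yb[2:])
print(clf.predict([[-1.5, -1.0]]))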
Example #15
import pickle

from sklearn.naive_bayes import GaussianNB


class NBEnsembler(Ensembler):  # Ensembler is the project's own base class
    def __init__(self, model=None) -> None:
        if model is None:
            self.model = GaussianNB()
        else:
            self.model = model

    def _fit(self, x, y):
        self.model.fit(x, y)

    def _predict(self, x):
        return self.model.predict_log_proba(x)

    def serialize(self):
        return pickle.dumps(self.model)
    
    @classmethod
    def construct(cls, content):
        return cls(pickle.loads(content))
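A round-trip sketch for the serialize/construct pair above, with toy data standing in for real inputs; since Ensembler's public interface isn't shown, the protected methods are called directly, purely for illustration:

import numpy as np

ens = NBEnsembler()
x = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
ens._fit(x, y)

restored = NBEnsembler.construct(ens.serialize())  # pickle round-trip
print(np.allclose(restored._predict(x), ens._predict(x)))  # True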
Example #16
def GaussianNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header=0, sep=",")
    test = pd.read_csv('../feature/testQlist.csv', header=0, sep=",")
    print("Train tf-idf vector Model...")
    encode = TfidfVectorizer(decode_error='ignore',
                             norm="l2",
                             binary=False,
                             sublinear_tf=True,
                             min_df=50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    # Transform the *test* questions (the original passed train here by mistake).
    localTestFeature = encode.transform(test['qlist'].values)

    print(localTrainFeature.shape, localTestFeature.shape)

    print('train...')
    model = GaussianNB()
    model.fit(X=localTrainFeature.toarray(), y=localTrainLabel)
    print('predict...')
    if not config['prob']:
        return model.predict(localTestFeature.toarray()), test['uid'].values
    else:
        return model.predict_log_proba(
            localTestFeature.toarray()), test['uid'].values
Example #17
def main():

    #Training Data
    s1_batch, s1_label = load_data('./data/part2/Subject_1.csv',
                                   './data/part2/list_1.csv')
    s4_batch, s4_label = load_data('./data/part2/Subject_4.csv',
                                   './data/part2/list_4.csv')
    s6_batch, s6_label = load_data('./data/part2/Subject_6.csv',
                                   './data/part2/list_6.csv')
    s9_batch, s9_label = load_data('./data/part2/Subject_9.csv',
                                   './data/part2/list_9.csv')

    in_1, in_1_label = load_data('./data/part1/Subject_2_part1.csv',
                                 './data/part1/list2_part1.csv')
    in_2, in_2_label = load_data('./data/part1/Subject_7_part1.csv',
                                 './data/part1/list_7_part1.csv')

    #Training Features
    a = np.concatenate((s1_batch, s4_batch), axis=0)
    b = np.concatenate((s6_batch, s9_batch), axis=0)
    big_batch = np.concatenate((a, b), axis=0)

    #Training Labels
    c = np.concatenate((s1_label, s4_label), axis=0)
    d = np.concatenate((s6_label, s9_label), axis=0)
    big_label = np.concatenate((c, d), axis=0)

    #Testing Data
    test_batch = load_test_data(
        "data/final_test/general/general_test_instances.csv")
    in_1_test_ft = load_test_data(
        "data/final_test/subject2/subject2_instances.csv")
    in_2_test_ft = load_test_data(
        "data/final_test/subject7/subject7_instances.csv")
    #train = sum_data(big_batch, window_size)
    #test = sum_data(test_batch, 7)

    if (sys.argv[1] == "group"):
        print "Starting kFold validation on group of individuals"
        norm_bigbatch = preprocessing.normalize(big_batch, norm='l2')
        gptrain, gptrainl, gptest, gptestl = kFold(norm_bigbatch, big_label)
        group_model = GaussianNB()
        group_model.fit(gptrain, gptrainl)
        group_predict = group_model.predict(gptest)
        group_scores = group_model.predict_log_proba(gptest)
        group_pred_int = np.array(group_predict.astype(float))[np.newaxis]
        trans_goal = np.array(gptestl.astype(int))[np.newaxis]
        trans_goal = trans_goal.T
        alldata = np.append(group_scores, group_pred_int.T, axis=1)
        alldata = alldata[:, [0, 2]]
        np.savetxt('pred.csv', alldata, delimiter=',', fmt=['%f', '%d'])
        np.savetxt('gold.csv', trans_goal, delimiter=',', fmt='%d')
        print "Generated pred.csv and gold.csv on current directory!"
        sys.exit(0)

    elif (sys.argv[1] == "individual"):
        if (len(sys.argv) < 3):
            print "Please specify the individual's number (2 or 7) in the third argument"
            sys.exit(0)
        if (int(sys.argv[2]) == 2):
            train_data = in_1
            train_label = in_1_label
        elif (int(sys.argv[2]) == 7):
            train_data = in_2
            train_label = in_2_label
        else:
            print "Please specify the individual's number (2 or 7)"
            sys.exit(0)
        print "Starting kFold validation on individual"
        norm_data = preprocessing.normalize(train_data, norm='l2')
        gptrain, gptrainl, gptest, gptestl = kFold(norm_data, train_label)
        group_model = GaussianNB()
        group_model.fit(gptrain, gptrainl)
        group_predict = group_model.predict(gptest)
        group_scores = group_model.predict_log_proba(gptest)
        group_pred_int = np.array(group_predict.astype(float))[np.newaxis]
        trans_goal = np.array(gptestl.astype(int))[np.newaxis]
        trans_goal = trans_goal.T
        alldata = np.append(group_scores, group_pred_int.T, axis=1)
        alldata = alldata[:, [0, 2]]
        np.savetxt('pred.csv', alldata, delimiter=',', fmt=['%f', '%d'])
        np.savetxt('gold.csv', trans_goal, delimiter=',', fmt='%d')
        print "Generated pred.csv and gold.csv on current directory!"
        sys.exit(0)

    normal_train_group = preprocessing.normalize(big_batch, norm='l2')
    normal_test_group = preprocessing.normalize(test_batch, norm='l2')
    #print test_batch[0]
    normal_train_in_1 = preprocessing.normalize(in_1, norm='l2')
    normal_test_in_1 = preprocessing.normalize(in_1_test_ft, norm='l2')

    normal_train_in_2 = preprocessing.normalize(in_2, norm='l2')
    normal_test_in_2 = preprocessing.normalize(in_2_test_ft, norm='l2')

    model = GaussianNB()
    model_1 = GaussianNB()
    model_2 = GaussianNB()

    model.fit(normal_train_group, big_label)
    model_1.fit(normal_train_in_1, in_1_label)
    model_2.fit(normal_train_in_2, in_2_label)

    predicted = model.predict(normal_test_group)
    predict_1 = model_1.predict(normal_test_in_1)
    predict_2 = model_2.predict(normal_test_in_2)

    #scores = model.predict_proba(normal_test)
    scores = model.predict_log_proba(normal_test_group)
    scores_1 = model_1.predict_log_proba(normal_test_in_1)
    scores_2 = model_2.predict_log_proba(normal_test_in_2)
    #scores = model.score(train_data,train_label)
    #print(scores)

    predicted_int = np.array(predicted.astype(float))[np.newaxis]
    predicted_int_1 = np.array(predict_1.astype(float))[np.newaxis]
    predicted_int_2 = np.array(predict_2.astype(float))[np.newaxis]

    #Changing 'test_label' to change the gold
    #trans_goal = np.array(in_2_test_label.astype(int))[np.newaxis]
    #trans_goal = trans_goal.T

    alldata = np.append(scores, predicted_int.T, axis=1)
    #alldata[:,0] = alldata[:,0] / 100.00
    alldata = alldata[:, [0, 2]]

    in_1_data = np.append(scores_1, predicted_int_1.T, axis=1)
    #in_1_data[:,0] = in_1_data[:,0] / 100.00
    in_1_data = in_1_data[:, [0, 2]]

    in_2_data = np.append(scores_2, predicted_int_2.T, axis=1)
    #in_2_data[:,0] = in_2_data[:,0] / 100.00
    in_2_data = in_2_data[:, [0, 2]]

    #print alldata
    np.savetxt('results/general_pred2.csv',
               alldata,
               delimiter=',',
               fmt=['%f', '%d'])
    np.savetxt('results/individual1_pred2.csv',
               in_1_data,
               delimiter=',',
               fmt=['%f', '%d'])
    np.savetxt('results/individual2_pred2.csv',
               in_2_data,
               delimiter=',',
               fmt=['%f', '%d'])
Example #18
# Excerpt from a larger cross-validation loop: nb, scans_testing,
# labels_testing, correct, total_correct and total_scans are defined
# earlier in that loop.
            for i in range(len(scans_testing)):
                prediction = nb.predict(scans_testing[i])
                if prediction == labels_testing[i]:
                    correct += 1
            # print((correct / len(scans_testing)) * 100, "%")
            total_correct += correct
            total_scans += len(scans_testing)
        else:
            # 6: Classify per group
            # print("Classifying using test data")
            correct = 0
            count = 0
            sum_p = 0
            sum_s = 0
            for i in range(len(scans_testing)):
                prediction = nb.predict_log_proba(scans_testing[i])
                sum_p += prediction[0][0]
                sum_s += prediction[0][1]

                if i % 10 == 9:
                    group_prediction = 'P' if sum_p > sum_s else 'S'
                    sum_p = 0
                    sum_s = 0
                    count += 1
                    if group_prediction == labels_testing[i]:
                        correct += 1

            # print((correct / count) * 100, "%")
            total_correct += correct
            total_scans += count
Example #19
# Excerpt from a larger evaluation routine: model_bayes, X_test_flatten,
# X_random, X_heuristic and the prob_results lists are defined earlier.
    # state prediction
    y_pred_bayes = model_bayes.predict(X_test_flatten)

    # print output time remarks
    outputSteps(y_pred_bayes)

    # probability calculation
    def get_prob(y_prob):
        prob_actions = []
        for i in y_prob:
            prob_actions.append(max(i))
        prob_seq = sum(prob_actions)
        return prob_seq

    # Probability to model sequence
    y_prob = model_bayes.predict_log_proba(X_test_flatten)
    prob_results.append(math.exp(get_prob(y_prob)))

    # Probability to model random sequence
    y_prob_random_seq = model_bayes.predict_log_proba(X_random)
    prob_results_random.append(math.exp(get_prob(y_prob_random_seq)))

    # Probability to model heuristic sequence
    y_prob_heuristic_seq = model_bayes.predict_log_proba(X_heuristic)
    prob_results_heuristic.append(math.exp(get_prob(y_prob_heuristic_seq)))

    target_names = list(step_set)
    target_names = sorted(target_names)

    # ward metrics for event-based scoring and segment scoring
    #ward_segment_scoring(y_test_ground_truth, y_pred_bayes)
Example #20
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

# Gaussian distribution
clf = GaussianNB()

# Fit the dataset
clf.fit(X, Y)

# Fit the dataset incrementally (in batches)
clf.partial_fit(X, Y)

# Return the predicted class
clf.predict([[-0.8, -1]])

# Return the predicted probability of each class
clf.predict_proba([[-0.8, -1]])

# Return a log transform of the predicted probability of each class
clf.predict_log_proba([[-0.8, -1]])
Example #21
    normal_error = np.c_[normal_error, np.zeros(len(normal_error))]
    abno_error = np.c_[abno_error, np.ones(len(abno_error))]


    dataset = np.r_[normal_error, abno_error]
    np.random.shuffle(dataset)

    train_x, test_x, train_y, test_y = train_test_split(dataset[:,:-1], dataset[:,-1], test_size=0.3, random_state=42)


    clf = GaussianNB()
    clf.fit(train_x, train_y)
    y_hat = clf.predict(train_x)
    y_score = clf.predict_proba(train_x)
    y_log_score = clf.predict_log_proba(train_x)
    y_test_hat = clf.predict(test_x)
    y_test_score = clf.predict_proba(test_x)
    print(accuracy_score(train_y, y_hat))
    print(metrics.recall_score(train_y, y_hat))
    print(metrics.classification_report(train_y, y_hat))
    print(metrics.classification_report(test_y, y_test_hat))
    print(y_score)
    print(y_test_score)
    print(y_test_hat)
    print(clf.classes_)

    # fpr, tpr, thresholds = metrics.roc_curve(train_y, y_hat)
    fpr, tpr, thresholds = metrics.roc_curve(test_y, y_test_score[:,-1])
    print(fpr, tpr, thresholds)
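With fpr and tpr in hand, the area under the ROC curve is one line; a sketch assuming the same metrics import used above:

    print(metrics.auc(fpr, tpr))                                # AUC from the curve above
    print(metrics.roc_auc_score(test_y, y_test_score[:, -1]))   # same value, computed directly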
Example #22
# Excerpt from a model-comparison script: logreg, X_train/X_test and
# y_train/y_test are fitted and split earlier in that script.
y_train_pred_logreg = logreg.predict(X_train)
cm_logreg_test = metrics.confusion_matrix(y_test, y_test_pred_logreg)
cm_logreg_train = metrics.confusion_matrix(y_train, y_train_pred_logreg)
yprobab_logreg = logreg.predict_proba(X_test)

# Classification with Naive Bayes
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)
score_train_naive_bayes = naive_bayes_classifier.score(X_train, y_train)
score_test_naive_bayes = naive_bayes_classifier.score(X_test, y_test)
y_test_pred_naive_bayes = naive_bayes_classifier.predict(X_test)
y_train_pred_naive_bayes = naive_bayes_classifier.predict(X_train)
cm_naive_bayes_train = metrics.confusion_matrix(y_train,
                                                y_train_pred_naive_bayes)
cm_naive_bayes_test = metrics.confusion_matrix(y_test, y_test_pred_naive_bayes)
# Note: unlike the predict_proba calls for the other models, this stores
# *log*-probabilities despite the yprobab_ name.
yprobab_naive_bayes = naive_bayes_classifier.predict_log_proba(X_test)

# Classification with Nearest Neighbor
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
score_train_knn = knn_classifier.score(X_train, y_train)
score_test_knn = knn_classifier.score(X_test, y_test)
y_test_pred_knn = knn_classifier.predict(X_test)
y_train_pred_knn = knn_classifier.predict(X_train)
cm_knn_train = metrics.confusion_matrix(y_train, y_train_pred_knn)
cm_knn_test = metrics.confusion_matrix(y_test, y_test_pred_knn)
yprobab_knn = knn_classifier.predict_proba(X_test)

# Classification with Neural Network
neural_classifier = MLPClassifier(hidden_layer_sizes=(16,),
                                  activation='logistic',
Example #23
# Excerpt: clf is a GaussianNB already fitted on the iris data.
clf.partial_fit(iris.data, iris.target, classes=[0, 1, 2])

# Some parameters of the model after learning
clf.set_params(
    priors=[0.333, 0.333,
            0.333])  # Set the prior probability of each class label here; if
                     # unset, clf.priors just returns None (not sure why?)
print(clf.priors)  # Get the prior probability of each class label
print(clf.class_prior_
      )  # Same as priors, the prior probability of each class label; the
         # difference is that priors returns a list, class_prior_ an array
print(clf.get_params(deep=True))  # Return a dict of priors and their values

print(clf.class_count_)  # Number of training samples observed in each class
print(clf.theta_)  # Mean of each feature per class
print(clf.sigma_)  # Variance of each feature per class

# Test data
data_test = np.array([6, 4, 6, 2])
data = data_test.reshape(1, -1)
Result_predict = clf.predict(data)
Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1],
                  sample_weight=[0.3, 0.5, 0.2])

Result_predict_proba = clf.predict_proba(data)
Result_predict_log_proba = clf.predict_log_proba(data)
print(Result_predict)  # Predicted class
print(Result_predict_proba)  # Predicted probability of each class for the test sample
print(Result_predict_log_proba)  # Log of the predicted probability of each class
print(Score)  # Mean accuracy on the given test data and labels
Example #24
# Excerpt: the statement building X from n_samples points per class is
# truncated in the source; y below labels three equal-sized clusters.
y = np.zeros(3 * n_samples)
y[n_samples:2 * n_samples] = 1
y[2 * n_samples:3 * n_samples] = 2


# Gaussian Naive Bayes

clf = GaussianNB()
clf.fit(X, y)

display_1 = [2, 2]
display_2 = [3, 1]
display_3 = [2.5, 2.5]

# predict_log_proba expects a 2D array, hence the extra brackets.
values_proba_gnb_1 = np.exp(clf.predict_log_proba([display_1]))[0]
values_proba_gnb_2 = np.exp(clf.predict_log_proba([display_2]))[0]
values_proba_gnb_3 = np.exp(clf.predict_log_proba([display_3]))[0]

fig1_bis = plt.figure()
plot_2d(X, y)

resolution_param = 50  # 500 for nice plotting, 50 for fast version
color_text = '#ff8101'

frontiere(lambda xx: clf.predict(xx), X, step=resolution_param)
plt.annotate(r'' + '(%.2f' % values_proba_gnb_1[0] + ', %.2f'
             % values_proba_gnb_1[1] + ', %.2f)' % values_proba_gnb_1[2],
             xy=(display_1[0], display_1[1]), xycoords='data',
             color =color_text, xytext=(-150, +100),
             textcoords='offset points', fontsize=12,
Example #25
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

clf.fit(X, Y)
print "==Predict result by predict=="
print(clf.predict([[-0.8, -1]]))
print "==Predict result by predict_proba=="
print(clf.predict_proba([[-0.8, -1]]))
print "==Predict result by predict_log_proba=="
print(clf.predict_log_proba([[-0.8, -1]]))
Example #26
import numpy
from sklearn.naive_bayes import GaussianNB

__author__ = 'mkk'

if __name__ == "__main__":
    nb = GaussianNB()
    ctx1 = numpy.array([1,2,3,4,5,6,7,8,9,100])
    ctx2 = numpy.array([1,2,3,4,5,16,7,18,9,100])
    obs = numpy.array([0.1,5,5,5,5,15,5,15,5,115])

    nb.fit(numpy.array([ctx1, ctx2]), [1,2])
    print(nb.predict_log_proba(obs))
Example #27
from sklearn.naive_bayes import GaussianNB


def NB(X_test, X_train, y_train):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    prob = gnb.predict_log_proba(X_test)
    # Return predictions plus the winning class's log-probability per sample.
    return y_pred, [max(prob[i]) for i in range(len(y_pred))]
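A usage sketch for NB above, on made-up data; the second return value is the winning class's log-probability per test row, so values closer to 0 mean higher confidence:

import numpy as np

X_train = np.array([[-2, -1], [-1, -1], [1, 1], [2, 1]])
y_train = np.array([0, 0, 1, 1])
X_test = np.array([[-1.5, -1.0], [1.5, 1.0]])

labels, confidences = NB(X_test, X_train, y_train)
print(labels)       # predicted classes
print(confidences)  # max log-probability per prediction (always <= 0)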
Example #28
'''
predict, predict_log_proba and predict_proba
* predict is the most commonly used prediction method: it directly outputs the
  predicted class for the test set.

* predict_proba is different: it gives the predicted probability of each class
  for the test samples. Intuitively, the class with the largest probability
  from predict_proba is exactly the class that predict returns.

* predict_log_proba is similar to predict_proba, but gives a log transform of
  the predicted class probabilities. After the transform, the class with the
  largest log-probability is again the class that predict returns.
'''
import numpy as np

from sklearn.naive_bayes import GaussianNB

# Generate the dataset
X_train = np.array([-1, -1, -2, -1, -3, -2, 1, 1, 2, 1, 3, 2]).reshape(6, 2)
y_train = np.array([1, 1, 1, 2, 2, 2])

X_test = np.array([-0.8, -1]).reshape(1, 2)

# Build the model
naive_bayes_Gaussian = GaussianNB()

naive_bayes_Gaussian.fit(X_train, y_train)

y_predict = naive_bayes_Gaussian.predict(X_test)
y_predict_proba = naive_bayes_Gaussian.predict_proba(X_test)
y_predict_log_proba = naive_bayes_Gaussian.predict_log_proba(X_test)

print('predict: \n', y_predict)
print('predict_proba: \n', y_predict_proba)
print('predict_log_proba: \n', y_predict_log_proba)
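As a quick check of the relationship described above, exponentiating the log-probabilities recovers predict_proba up to floating-point error:

print(np.allclose(np.exp(y_predict_log_proba), y_predict_proba))  # True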
Example #29
import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 1, 2, 2])
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB(priors=None)
clf.fit(X, Y)
# GaussianNB(priors=None)
print(clf.predict([[-0.8, -1]]))

print(clf.class_prior_)  # probability of each class.
print(clf.class_count_)  # number of training samples observed in each class.
print(clf.theta_)  # mean of each feature per class.
print(clf.sigma_)  # variance of each feature per class
print(clf.predict_proba(
    [[0.8, 1]]))  # Return probability estimates for the test vector X.
print(clf.predict_log_proba(
    [[0.8, 1]]))  # Return log-probability estimates for the test vector X.
print(clf.score(
    [[0.8, 1]],
    [1]))  # Returns the mean accuracy on the given test data and labels.
print(clf.score(
    [[0.8, 1]],
    [2]))  # Returns the mean accuracy on the given test data and labels.
"""
========================================================================================================================
======================= Classification applications on the handwritten digits data =====================================
========================================================================================================================
In this example, you will see two different applications of the Naive Bayes algorithm on the digits data set.
"""
print(__doc__)
import pylab as pl
from sklearn.datasets import load_digits
#
# for i in range(G_x.shape[0]):
#     for c in labels:
#         G_x[i, c-1] = -0.5 * np.matmul(np.matmul(np.atleast_2d(selectedArray[i, :] - Means[c-1, :]),
#                                                  np.atleast_2d(InvCov[:, :, c-1])),
#                                        np.atleast_2d(selectedArray[i, :] - Means[c-1, :]).T) - 0.5 * np.log(Det[c-1])
#
# print(time.time() - t0)
# plt.figure(), plt.imshow(np.reshape(np.argmax(G_x, axis=1)+1, (349, 1050)), cmap='jet'), plt.show()


t0 = time.time()
GNB = GaussianNB()
GNB.fit(selectedArray[np.where(train != 0)[0]], train[train != 0])
G_x_skl1 = GNB.predict(selectedArray)
G_x_skl2 = GNB.predict_log_proba(selectedArray)
print(time.time() - t0)

plt.figure()
plt.subplot(211), plt.title('My result')
plt.imshow(np.reshape(np.argmax(G_x, axis=1)+1, (349, 1050)), cmap='jet')
plt.subplot(212), plt.title('SKLearn result')
plt.imshow(np.reshape(G_x_skl1, (349, 1050)), cmap='jet')
plt.show()

plt.figure()
for l in labels:
    plt.subplot(3, 3, l)
    plt.imshow(np.reshape(G_x_skl2[:, l-1], (349, 1050))**.55)
plt.show()
Example #31
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 10 01:13:05 2018

@author: jenny
"""

import numpy as np
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
# Fit the data
clf.fit(X, Y)
print("==Result of predicting with predict==")
print(clf.predict([[-0.8, -1]]))
print("==Result of predicting with predict_proba==")
print(clf.predict_proba([[-0.8, -1]]))
print("==Result of predicting with predict_log_proba==")
print(clf.predict_log_proba([[-0.8, -1]]))
Example #32
from sklearn.naive_bayes import GaussianNB


def Classifier(train, trainlabel, test):
    clf = GaussianNB()
    clf.fit(train, trainlabel)
    predicted = clf.predict(test)
    posterior = clf.predict_log_proba(test)
    return predicted, posterior
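A usage sketch for Classifier above, on made-up data; posterior holds one row of per-class log-probabilities for each test sample:

import numpy as np

train = np.array([[0.0, 0.0], [0.2, 0.1], [1.0, 1.0], [1.2, 0.9]])
trainlabel = np.array([0, 0, 1, 1])
test = np.array([[0.1, 0.0], [1.1, 1.0]])

predicted, posterior = Classifier(train, trainlabel, test)
print(predicted)  # predicted class per test sample
print(posterior)  # log posterior per class, one row per sample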
Example #33
def takeInput(data):
    corpus = []
    # Predicting the Test set results
    prediction_list = []

    for i in range(0, dataset_total_rows):
        print(dataset['sentance'][i])
        review = re.sub('[^a-zA-Z]', ' ', dataset['sentance'][i])
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        stop_words = set(stopwords.words('english'))
        review = [
            ps.stem(word) for word in review
            if word not in stop_words
        ]
        review = ' '.join(review)
        corpus.append(review)
        print(review)

    new_text_by_user = data
    review2 = re.sub('[^a-zA-Z]', ' ', new_text_by_user)
    review2 = review2.lower()
    review2 = review2.split()
    ps2 = PorterStemmer()
    stop_words2 = set(stopwords.words('english'))
    review2 = [
        ps2.stem(word) for word in review2
        if word not in stop_words2
    ]
    review2 = ' '.join(review2)
    corpus.append(review2)

    Logger("Corpus length " + str(len(corpus)))

    # Creating the Bag of Words model
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer(max_features=1500)
    X = cv.fit_transform(corpus).toarray()

    X_formatted = np.array([X[-1, :]])

    y = dataset.iloc[:, 1].values

    #categorical data
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    le.fit(y)
    y = le.transform(y)

    X_except_last_row = X[:-1, :]
    # Splitting the dataset into the Training set and Test set
    #from sklearn.model_selection import train_test_split
    #X_train, X_test, y_train, y_test = train_test_split(X_except_last_row, y, test_size = 0.20, random_state = 0)
    # Fitting Naive Bayes to the Training set

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_except_last_row, y)

    import pickle
    # save the classifier
    with open('my_dumped_classifier.pkl', 'wb') as fid:
        pickle.dump(classifier, fid)

    # load it again
    with open('my_dumped_classifier.pkl', 'rb') as fid:
        classifier_loaded = pickle.load(fid)

    y_pred1 = classifier.predict(X_formatted)
    y_pred2 = classifier.predict_log_proba(X_formatted)
    y_pred3 = classifier.predict_proba(X_formatted)
    prediction_list.append(y_pred1)
    prediction_list.append(y_pred2)
    prediction_list.append(y_pred3)

    # Making the Confusion Matrix
    #from sklearn.metrics import confusion_matrix
    #cm = confusion_matrix(y_test, y_pred1)

    name_entity = ""
    Logger("ypred 1 0 === " + str(y_pred1[0]))
    if y_pred1[0] == 2:

        def preprocess(sent):
            sent = nltk.word_tokenize(sent)
            sent = nltk.pos_tag(sent)
            return sent

        sent2 = preprocess(data)
        for x in range(len(sent2)):
            if sent2[x][1] == "NNP":
                if sent2[x][0] != " ":
                    name_entity = sent2[x][0]

        if name_entity == " ":
            prediction_list.append(
                "<br>NER couldn't recognize the entity. :( <br>Traceback Array<br>"
                + str(sent2) + "<br>" + str(name_entity))
        else:
            prediction_list.append(str(name_entity))

        Logger("prediction list if " + str(sent2))

        Logger("nltk entity ----> " + str(sent2))
    else:
        prediction_list.append("<!_name>")
        Logger("prediction list else " + str(prediction_list))
    Logger("prediction list count " + str(len(prediction_list)))

    return prediction_list
Example #34
class GNB(object):
  '''This class does the actual work in the following steps:
     * define smaller data frames: database, man_add, transform
     * split the data into training and test set
     * set up and run a randomized search for the best parameters of a GNB
     * create a new GNB with the best parameters
     * predict with this new GNB on test data and cross-validated training data
     * analyse the predictions with graphs and stats
  '''
  def __init__(self, metrix, newdata_minusEP, bbbb):
    self.metrix=metrix
    self.newdata_minusEP=newdata_minusEP
    self.prepare_metrix_data()
    self.split_data()
    self.grid_search()
    self.gnb_best_params()
    self.predict()
    self.analysis()

  ###############################################################################
  #
  #  creating 3 data frames specific to the three development milestones I had
  #  1--> directly from data processing
  #  2--> after adding protein information
  #  3--> carrying out some further column transformations
  #
  ###############################################################################

  def prepare_metrix_data(self):
    '''Function to create smaller dataframes for directly after data processing, after
       adding some protein information and after carrying out some custom column
       transformations.
    ******
    Input: large data frame
    Output: smaller dataframes; database, man_add, transform
    '''
    print('*' *80)
    print('*    Preparing input dataframes metrix_database, metrix_man_add, metrix_transform')
    print('*' *80)

    #database plus manually added data
    attr_newdata_initial = ['IoverSigma', 'cchalf', 'RmergediffI', 'RmergeI', 'RmeasI',
                      'RmeasdiffI', 'RpimdiffI', 'RpimI', 'totalobservations',
                      'totalunique', 'multiplicity', 'completeness', 'lowreslimit',
                      'highreslimit', 'wilsonbfactor', 'anomalousslope',
                      'anomalousCC', 'anomalousmulti', 'anomalouscompl', 'diffI',
                      'diffF', 'wavelength', 'sg_number', 'cell_a', 'cell_b', 'cell_c',
                      'cell_alpha', 'cell_beta', 'cell_gamma', 'Vcell', 'solvent_content',
                      'Matth_coeff', 'No_atom_chain', 'No_mol_ASU',
                      'MW_chain', 'sites_ASU']

    attr_newdata_transform = ['IoverSigma', 'cchalf', 'RmergediffI', 'RmergeI', 'RmeasI',
                      'RmeasdiffI', 'RpimdiffI', 'RpimI', 'totalobservations',
                      'totalunique', 'multiplicity', 'completeness', 'lowreslimit',
                      'highreslimit', 'wilsonbfactor', 'anomalousslope',
                      'anomalousCC', 'anomalousmulti', 'anomalouscompl', 'diffI',
                      'diffF', 'wavelength', 'wavelength**3', 'wavelength**3/Vcell',
                      'sg_number', 'cell_a', 'cell_b', 'cell_c', 'cell_alpha',
                      'cell_beta', 'cell_gamma','Vcell', 'solvent_content',
                      'Vcell/Vm<Ma>', 'Matth_coeff', 'MW_ASU/sites_ASU/solvent_content',
                      'MW_chain', 'No_atom_chain', 'No_mol_ASU', 'MW_ASU', 'sites_ASU',
                      'MW_ASU/sites_ASU', 'MW_chain/No_atom_chain', 'wilson', 'bragg',
                      'volume_wilsonB_highres', 'IoverSigma/MW_ASU']
                      

    metrix_newdata_initial = self.metrix[attr_newdata_initial]
    self.X_newdata_initial = metrix_newdata_initial

    metrix_newdata_transform = metrix_newdata_initial.copy()
    
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Preparing input data as metrix_transform with following attributes %s \n' %(attr_newdata_initial))
    
    #column transformation
    #MW_ASU
    metrix_newdata_transform['MW_ASU'] = metrix_newdata_transform['MW_chain'] * metrix_newdata_transform['No_mol_ASU']

    #MW_ASU/sites_ASU
    metrix_newdata_transform['MW_ASU/sites_ASU'] = metrix_newdata_transform['MW_ASU'] / metrix_newdata_transform['sites_ASU']
    
    #IoverSigma/MW_ASU
    metrix_newdata_transform['IoverSigma/MW_ASU'] = metrix_newdata_transform['IoverSigma'] / metrix_newdata_transform['MW_ASU']

    #MW_chain/No_atom_chain
    metrix_newdata_transform['MW_chain/No_atom_chain'] = metrix_newdata_transform['MW_chain'] / metrix_newdata_transform['No_atom_chain']

    #MW_ASU/sites_ASU/solvent_content
    metrix_newdata_transform['MW_ASU/sites_ASU/solvent_content'] = metrix_newdata_transform['MW_ASU/sites_ASU'] / metrix_newdata_transform['solvent_content']

    #wavelength**3
    metrix_newdata_transform['wavelength**3'] = metrix_newdata_transform['wavelength'] ** 3

    #wavelength**3/Vcell
    metrix_newdata_transform['wavelength**3/Vcell'] = metrix_newdata_transform['wavelength**3'] / metrix_newdata_transform['Vcell']

    #Vcell/Vm<Ma>
    metrix_newdata_transform['Vcell/Vm<Ma>'] = metrix_newdata_transform['Vcell'] / (metrix_newdata_transform['Matth_coeff'] * metrix_newdata_transform['MW_chain/No_atom_chain'])

    #wilson
    metrix_newdata_transform['wilson'] = -2 * metrix_newdata_transform['wilsonbfactor']

    #bragg
    metrix_newdata_transform['bragg'] = (1 / metrix_newdata_transform['highreslimit'])**2

    #use np.exp to work with series object
    metrix_newdata_transform['volume_wilsonB_highres'] = metrix_newdata_transform['Vcell/Vm<Ma>'] * np.exp(metrix_newdata_transform['wilson'] * metrix_newdata_transform['bragg'])
    
    self.X_newdata_transform = metrix_newdata_transform
    
    #self.X_newdata_transform.to_csv(os.path.join(self.newdata, 'transformed_dataframe.csv'))
    
    #np.isnan(self.X_newdata_transform)
    #print(np.where(np.isnan(self.X_newdata_transform)))
    #self.X_newdata_transform = np.nan_to_num(self.X_newdata_transform)
    self.X_newdata_transform = self.X_newdata_transform.fillna(0)

    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Created the following dataframes: metrix_transform \n')
      text_file.write(str(self.X_newdata_transform.columns)+'\n')

    ###############################################################################
    #
    #  creating training and test set for each of the 3 dataframes
    #
    ###############################################################################

  def split_data(self):
    '''Function which splits the input data into training set and test set.
    ******
    Input: a dataframe that contains the features and labels in columns and the samples
          in rows
    Output: sets of training and test data with an 80/20 split; X_train, X_test, y_train,
            y_test
    '''
    print('*' *80)
    print('*    Splitting data into test and training set with test=20%')
    print('*' *80)

    y = self.metrix['EP_success']

#normal split of samples    
#    X_transform_train, X_transform_test, y_train, y_test = train_test_split(self.X_transform, y, test_size=0.2, random_state=42)

#stratified split of samples
    X_newdata_transform_train, X_newdata_transform_test, y_train, y_test = train_test_split(self.X_newdata_transform, y, test_size=0.2, random_state=42, stratify=y)
    
    assert self.X_newdata_transform.columns.all() == X_newdata_transform_train.columns.all()
    
#    scaler = StandardScaler()
#    scaler.fit(X_newdata_transform_train)
#    X_newdata_transform_train_scaled = scaler.transform(X_newdata_transform_train)
#    X_newdata_transform_test_scaled = scaler.transform(X_newdata_transform_test)
    
    self.X_newdata_transform_train = X_newdata_transform_train
    self.X_newdata_transform_test = X_newdata_transform_test
    self.y_train = y_train
    self.y_test = y_test
    
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Spliting into training and test set 80-20 \n')
      text_file.write('metrix_transform: X_transform_train, X_transform_test \n')
      text_file.write('y(EP_success): y_train, y_test \n')

    ###############################################################################
    #
    #  grid search for best parameter combination
    #
    ###############################################################################

  def grid_search(self):
    '''running a randomized search to find the parameter combination for a GNB
     which gives the best accuracy score'''
    print('*' *80)
    print('*    Running RandomizedSearchCV for best parameter combination for GNB')
    print('*' *80)

    gnb = GaussianNB(priors=None, var_smoothing=1e-09)
    
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Created Gaussian Naive Bayes: gnb \n')

    #set up grid search
    param_rand = {'var_smoothing': uniform(0.000000000001, 10.0)}


    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Running grid search for the following parameters: %s \n' %param_rand)
      text_file.write('use cv=3, scoring=accuracy \n')

    #building and running the grid search
    rand_search = RandomizedSearchCV(gnb, param_rand, cv=3, scoring='accuracy', random_state=5, n_iter=500)

    rand_search_transform = rand_search.fit(self.X_newdata_transform_train, self.y_train)
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Best parameters: ' +str(rand_search_transform.best_params_)+'\n')
      text_file.write('Best score: ' +str(rand_search_transform.best_score_)+'\n')
    
    self.best_params_transform = rand_search_transform.best_params_
       
    ###############################################################################
    #
    #  creating new SVM with best parameter combination
    #
    ###############################################################################

  def gnb_best_params(self):
    '''create a new GNB using the best parameter combination found above'''
    print('*' *80)
    print('*    Building new GNB based on best parameter combination and save as pickle')
    print('*' *80)

    self.gnb_best = GaussianNB(**self.best_params_transform)
        
    self.gnb_best.fit(self.X_newdata_transform_train, self.y_train)

    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Probability for each class is: \n' )
      text_file.write(str(self.gnb_best.class_prior_)+'\n')
      text_file.write('Mean for each feature for class 0: \n')
      text_file.write(str(self.gnb_best.theta_[0])+'\n')
      text_file.write('\n')
    class0_feature_ls = self.gnb_best.theta_[0]
    df_0 = pd.DataFrame(class0_feature_ls.reshape(-1, len(class0_feature_ls)), columns=self.X_newdata_transform.columns)
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Feature importances for class 0 \n')
      text_file.write(str(df_0.to_dict(orient='records'))+'\n')
    
    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    feature = self.X_newdata_transform.columns
    score_0 = class0_feature_ls 
    x_pos_0 = np.arange(len(class0_feature_ls))
    plt.figure(figsize=(20,10))
    plt.bar(x_pos_0, score_0, align='center')
    plt.xticks(x_pos_0, feature, rotation=90, fontsize=12)
    plt.title('Histogram of Feature Importances for class 0')
    plt.xlabel('Features')
    plt.tight_layout()
    plt.savefig(os.path.join(self.newdata_minusEP, 'feature_importances_overall_bar_plot_class0_'+datestring+'.png'))
    plt.close()
    

    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Probability for each class is: \n' )
      text_file.write(str(self.gnb_best.class_prior_)+'\n')
      text_file.write('Mean for each feature for class 1: \n')
      text_file.write(str(self.gnb_best.theta_[1])+'\n')
      text_file.write('\n')
    class1_feature_ls = self.gnb_best.theta_[1]
    df_1 = pd.DataFrame(class1_feature_ls.reshape(-1, len(class1_feature_ls)), columns=self.X_newdata_transform.columns)
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Feature importances for class 1 \n')
      text_file.write(str(df_1.to_dict(orient='records'))+'\n')
    
    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    feature = self.X_newdata_transform.columns
    score_1 = class1_feature_ls 
    x_pos_1 = np.arange(len(class1_feature_ls))
    plt.figure(figsize=(20,10))
    plt.bar(x_pos_1, score_1, align='center')
    plt.xticks(x_pos_1, feature, rotation=90, fontsize=12)
    plt.title('Histogram of Feature Importances for class 1')
    plt.xlabel('Features')
    plt.tight_layout()
    plt.savefig(os.path.join(self.newdata_minusEP, 'feature_importances_overall_bar_plot_class1_'+datestring+'.png'))
    plt.close()
    
    print('Number of training samples observed in each class: ', self.gnb_best.class_count_)
    
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Created Gaussian Naive Bayes: gnb \n')
      
    def write_pickle(gnb, directory, name):
      datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
      joblib.dump(gnb, os.path.join(directory,'best_gnb_'+name+datestring+'.pkl'))
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Created new GNB "%s" using best parameters \n' %name)
        text_file.write('Creating pickle file for best GNB as best_gnb_%s.pkl \n' %name)
    
    write_pickle(self.gnb_best, self.newdata_minusEP, 'newdata_minusEP')
  
    
    ###############################################################################
    #
    #  Predicting with test set and cross-validation set using the best SVM
    #
    ###############################################################################

  def predict(self):
    '''do predictions using the best GNB on the test set as well as the training set
       with 3-fold cross-validation, and do some initial analysis of the output'''
    print('*' *80)
    print('*    Predict using new GNB and test/train_CV set')
    print('*' *80)

    #try out how well the classifier works to predict from the test set
    #self.y_pred_transform = self.svc_clf_grid_new_transform.predict(self.X_newdata_transform_test)
    self.y_pred_transform = self.gnb_best.predict(self.X_newdata_transform_test)
    self.y_pred_proba_transform = self.gnb_best.predict_log_proba(self.X_newdata_transform_test)
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Saving predictions and probabilities for X_transform_test in y_pred_transform \n')


    #alternative way to not have to use the test set
    self.y_train_CV_pred_transform = cross_val_predict(self.gnb_best, self.X_newdata_transform_train, self.y_train, cv=3)
    self.y_train_CV_pred_proba_transform = cross_val_predict(self.gnb_best, self.X_newdata_transform_train, self.y_train, cv=3, method='predict_log_proba')
    with open(os.path.join(self.newdata_minusEP, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
      text_file.write('Saving predictions and probabilities for X_transform_train with 3-fold CV in y_train_CV_pred_transform \n')

    print('*' *80)
    print('*    Calculate prediction stats')
    print('*' *80)

    def prediction_stats(y_test, y_pred, directory):
      # calculate accuracy
      y_accuracy = metrics.accuracy_score(self.y_test, y_pred)

      # examine the class distribution of the testing set (using a Pandas Series method)
      class_dist = self.y_test.value_counts()

      # calculate the percentage of ones
      # because y_test only contains ones and zeros, we can simply calculate the mean = percentage of ones
      ones = self.y_test.mean()

      # calculate the percentage of zeros
      zeros = 1 - self.y_test.mean()

      # calculate null accuracy in a single line of code
      # only for binary classification problems coded as 0/1
      null_acc = max(self.y_test.mean(), 1 - self.y_test.mean())

      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Accuracy score or agreement between y_test and y_pred: %s \n' %y_accuracy)
        text_file.write('Class distribution for y_test: %s \n' %class_dist)
        text_file.write('Percent 1s in y_test: %s \n' %ones)
        text_file.write('Percent 0s in y_test: %s \n' %zeros)
        text_file.write('Null accuracy in y_test: %s \n' %null_acc)
    
    prediction_stats(self.y_test, self.y_pred_transform, self.newdata_minusEP)

    ###############################################################################
    #
    #  detailed analysis and stats
    #
    ###############################################################################

  def analysis(self):
    '''detailed analysis of the output:
       * create a confusion matrix
       * split the data into TP, TN, FP, FN for test and train_CV
       * determine accuracy score
       * determine classification error
       * determine sensitivity
       * determine specificity
       * determine false-positive rate
       * determine precision
       * determine F1 score
       calculate prediction probabilities and draw plots
       * histogram for probability to be class 1
       * precision-recall curve
       * look for adjustments in classification thresholds
       * ROC curve
       * determine ROC_AUC
       * try different scoring functions for comparison'''
    print('*' *80)
    print('*    Detailed analysis and plotting')
    print('*' *80)

    def conf_mat(y_test, y_train, y_pred, y_train_pred, directory):
      # IMPORTANT: first argument is true values, second argument is predicted values
      # this produces a 2x2 numpy array (matrix)
      conf_mat_test = metrics.confusion_matrix(y_test, y_pred)
      conf_mat_10CV = metrics.confusion_matrix(y_train, y_train_pred)
      def draw_conf_mat(matrix, directory, name):
        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        labels = ['0', '1']      
        ax = plt.subplot()
        sns.heatmap(matrix, annot=True, ax=ax)
        plt.title('Confusion matrix of the classifier')
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.savefig(os.path.join(directory, 'confusion_matrix_svm_grid_'+name+datestring+'.png'))
        plt.close()

      draw_conf_mat(conf_mat_test, directory, 'test_')
      draw_conf_mat(conf_mat_10CV, directory, 'train_CV_')
      
      TP = conf_mat_test[1, 1]
      TN = conf_mat_test[0, 0]
      FP = conf_mat_test[0, 1]
      FN = conf_mat_test[1, 0]
      
      TP_CV = conf_mat_10CV[1, 1]
      TN_CV = conf_mat_10CV[0, 0]
      FP_CV = conf_mat_10CV[0, 1]
      FN_CV = conf_mat_10CV[1, 0]

      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('confusion matrix using test set: %s \n' %conf_mat_test)
        text_file.write('confusion matrix using 3-fold CV: %s \n' %conf_mat_10CV)
        text_file.write('Slicing confusion matrix for test set into: TP, TN, FP, FN \n')
        text_file.write('Slicing confusion matrix for 3-fold CV into: TP_CV, TN_CV, FP_CV, FN_CV \n')
      
      #calculate accuracy
      acc_score_man_test = (TP + TN) / float(TP + TN + FP + FN)
      acc_score_sklearn_test = metrics.accuracy_score(y_test, y_pred)
      acc_score_man_CV = (TP_CV + TN_CV) / float(TP_CV + TN_CV + FP_CV + FN_CV)
      acc_score_sklearn_CV = metrics.accuracy_score(y_train, y_train_pred)  
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Accuracy score: \n')
        text_file.write('accuracy score manual test: %s \n' %acc_score_man_test)
        text_file.write('accuracy score sklearn test: %s \n' %acc_score_sklearn_test)
        text_file.write('accuracy score manual CV: %s \n' %acc_score_man_CV)
        text_file.write('accuracy score sklearn CV: %s \n' %acc_score_sklearn_CV)
        
      #classification error
      class_err_man_test = (FP + FN) / float(TP + TN + FP + FN)
      class_err_sklearn_test = 1 - metrics.accuracy_score(y_test, y_pred)
      class_err_man_CV = (FP_CV + FN_CV) / float(TP_CV + TN_CV + FP_CV + FN_CV)
      class_err_sklearn_CV = 1 - metrics.accuracy_score(y_train, y_train_pred)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Classification error: \n')  
        text_file.write('classification error manual test: %s \n' %class_err_man_test)
        text_file.write('classification error sklearn test: %s \n' %class_err_sklearn_test)
        text_file.write('classification error manual CV: %s \n' %class_err_man_CV)
        text_file.write('classification error sklearn CV: %s \n' %class_err_sklearn_CV)
        
      #sensitivity/recall/true positive rate; correctly placed positive cases  
      sensitivity_man_test = TP / float(FN + TP)
      sensitivity_sklearn_test = metrics.recall_score(y_test, y_pred)
      sensitivity_man_CV = TP_CV / float(FN_CV + TP_CV)
      sensitivity_sklearn_CV = metrics.recall_score(y_train, y_train_pred)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Sensitivity/Recall/True positives: \n')
        text_file.write('sensitivity manual test: %s \n' %sensitivity_man_test)
        text_file.write('sensitivity sklearn test: %s \n' %sensitivity_sklearn_test)
        text_file.write('sensitivity manual CV: %s \n' %sensitivity_man_CV)
        text_file.write('sensitivity sklearn CV: %s \n' %sensitivity_sklearn_CV)
      
      #specificity  
      specificity_man_test = TN / (TN + FP)
      specificity_man_CV = TN_CV / (TN_CV + FP_CV)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Specificity: \n')
        text_file.write('specificity manual test: %s \n' %specificity_man_test)
        text_file.write('specificity manual CV: %s \n' %specificity_man_CV)
      
      #false positive rate  
      false_positive_rate_man_test = FP / float(TN + FP)
      false_positive_rate_man_CV = FP_CV / float(TN_CV + FP_CV)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('False positive rate or 1-specificity: \n')
        text_file.write('false positive rate manual test: %s \n' %false_positive_rate_man_test)
        text_file.write('1 - specificity test: %s \n' %(1 - specificity_man_test))
        text_file.write('false positive rate manual CV: %s \n' %false_positive_rate_man_CV)
        text_file.write('1 - specificity CV: %s \n' %(1 - specificity_man_CV))
      
      #precision/confidence of placement  
      precision_man_test = TP / float(TP + FP)
      precision_sklearn_test = metrics.precision_score(y_test, y_pred)
      precision_man_CV = TP_CV / float(TP_CV + FP_CV)
      precision_sklearn_CV = metrics.precision_score(y_train, y_train_pred)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Precision or confidence of classification: \n')
        text_file.write('precision manual test: %s \n' %precision_man_test)
        text_file.write('precision sklearn test: %s \n' %precision_sklearn_test)
        text_file.write('precision manual CV: %s \n' %precision_man_CV)
        text_file.write('precision sklearn CV: %s \n' %precision_sklearn_CV)
      
      #F1 score; uses precision and recall  
      f1_score_sklearn_test = f1_score(y_test, y_pred)
      f1_score_sklearn_CV = f1_score(y_train, y_train_pred)
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('F1 score: \n')
        text_file.write('F1 score sklearn test: %s \n' %f1_score_sklearn_test)
        text_file.write('F1 score sklearn CV: %s \n' %f1_score_sklearn_CV)
        
    conf_mat(self.y_test, self.y_train, self.y_pred_transform, self.y_train_CV_pred_transform, self.newdata_minusEP)
   
    def prediction_probas(clf, X_train, y_train, X_test, y_test, y_pred_proba, y_train_CV_pred_proba, directory, kind):
      datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')      
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Plotting histogram for y_pred_proba_train_CV \n')
        text_file.write('Plotting histogram for y_pred_proba_test \n')
   
      #plot histograms of probabilities  
      def plot_hist_pred_proba(y_pred_proba, name, directory):
        plt.hist(y_pred_proba, bins=20)
        plt.xlim(0,1)
        plt.title('Histogram of predicted probabilities for y_pred_proba_%s to be class 1' %name)
        plt.xlabel('Predicted probability of EP_success')
        plt.ylabel('Frequency')
        plt.savefig(os.path.join(directory, 'hist_pred_proba_gnb_rand_'+name+datestring+'.png'))
        plt.close()

      plot_hist_pred_proba(y_train_CV_pred_proba[:, 1], 'train_CV_', directory)
      plot_hist_pred_proba(y_pred_proba[:, 1], 'test_', directory)
      
      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Getting y_scores for y_pred_proba_train_CV and y_pred_proba_test as y_scores_train_CV and y_scores_test for class 0 and 1\n')

      self.y_scores_ones = y_pred_proba[:, 1]#test data to be class 1
      self.y_scores_CV_ones = y_train_CV_pred_proba[:, 1]#training data to be class 1

      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Plotting Precision-Recall for y_test and y_scores_test \n')
        text_file.write('Plotting Precision-Recall for y_train and y_scores_train_CV \n')
      
      #plot precision and recall curve
      def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, name, classes, directory):
        plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
        plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
        plt.title('Precision-Recall plot for EP_success classifier using %s set to be class %s' %(name, classes))
        plt.xlabel("Threshold")
        plt.legend(loc="upper left")
        plt.ylim([0,1])
        plt.savefig(os.path.join(directory, 'Precision_Recall_gnb_rand_'+name+datestring+classes+'.png'))
        plt.close()

      #plot Precision-Recall-Threshold curve for the test set
      precisions, recalls, thresholds = precision_recall_curve(self.y_test, self.y_scores_ones)
      plot_precision_recall_vs_threshold(precisions, recalls, thresholds, 'test_', '1', directory)
      #plot Precision-Recall-Threshold curve for the CV train set
      precisions, recalls, thresholds = precision_recall_curve(self.y_train, self.y_scores_CV_ones)
      plot_precision_recall_vs_threshold(precisions, recalls, thresholds, 'train_CV_', '1', directory)

      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('Plotting ROC curve for y_test and y_scores_test \n')
        text_file.write('Plotting ROC curve for y_train and y_scores_train_CV \n')

      #IMPORTANT: first argument is true values, second argument is predicted probabilities
      #we pass y_test and y_pred_prob
      #we do not use y_pred, because it will give incorrect results without generating an error
      #roc_curve returns 3 objects fpr, tpr, thresholds
      #fpr: false positive rate
      #tpr: true positive rate
    
      #plot ROC curves
      def plot_roc_curve(y_test, y_proba, name, directory):
        skplt.metrics.plot_roc(y_test, y_proba, title='ROC curve %s' %name)
        plt.savefig(os.path.join(directory, 'ROC_curve_skplt_gnb_rand_'+name+datestring+'.png'))
        plt.close()
        
      plot_roc_curve(self.y_train, y_train_CV_pred_proba, 'train_CV_', directory)  
      plot_roc_curve(self.y_test, y_pred_proba, 'test_', directory)  
    
      #second, manual ROC plot; renamed so it does not shadow the skplt helper above
      def plot_roc_curve_manual(fpr, tpr, name, classes, directory):
        plt.plot(fpr, tpr, linewidth=2)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.title('ROC curve for EP_success classifier using %s set for class %s' %(name, classes))
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.savefig(os.path.join(directory, 'ROC_curve_gnb_rand_'+name+datestring+classes+'.png'))
        plt.close()

      #ROC curve for test set
      fpr_1, tpr_1, thresholds_1 = roc_curve(self.y_test, self.y_scores_ones)
      plot_roc_curve_manual(fpr_1, tpr_1, 'test_', '1', directory)
      #ROC curve for 10-fold CV train set
      fpr_CV_1, tpr_CV_1, thresholds_CV_1 = roc_curve(self.y_train, self.y_scores_CV_ones)
      plot_roc_curve_manual(fpr_CV_1, tpr_CV_1, 'train_CV_', '1', directory)
      
      #calculate the area under the curve to get the performance for a classifier
      # IMPORTANT: first argument is true values, second argument is predicted probabilities
      AUC_test_class1 = metrics.roc_auc_score(self.y_test, self.y_scores_ones)
      AUC_train_class1 = metrics.roc_auc_score(self.y_train, self.y_scores_CV_ones)

      with open(os.path.join(directory, 'gaussianNB_randomsearch.txt'), 'a') as text_file:
        text_file.write('AUC for test set class 1: %s \n' %AUC_test_class1)
        text_file.write('AUC for CV train set class 1: %s \n' %AUC_train_class1)

    prediction_probas(self.gnb_best, self.X_newdata_transform_train, self.y_train, self.X_newdata_transform_test, self.y_test, self.y_pred_proba_transform, self.y_train_CV_pred_proba_transform, self.newdata_minusEP, 'newdata_minusEP')    
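The manual metric formulas in the example above can be cross-checked against scikit-learn directly. Below is a minimal, self-contained sketch (the toy label arrays are invented for illustration, not taken from the pipeline above): it derives TP/TN/FP/FN from confusion_matrix and asserts that the hand-computed scores agree with the metrics functions.

import numpy as np
from sklearn import metrics

# toy labels, purely illustrative
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0, 1, 0])

# for binary labels {0, 1}, confusion_matrix returns [[TN, FP], [FN, TP]]
TN, FP, FN, TP = metrics.confusion_matrix(y_true, y_pred).ravel()

accuracy = (TP + TN) / float(TP + TN + FP + FN)
sensitivity = TP / float(FN + TP)   # recall / true positive rate
specificity = TN / float(TN + FP)
fpr = FP / float(TN + FP)           # equals 1 - specificity
precision = TP / float(TP + FP)

assert np.isclose(accuracy, metrics.accuracy_score(y_true, y_pred))
assert np.isclose(sensitivity, metrics.recall_score(y_true, y_pred))
assert np.isclose(precision, metrics.precision_score(y_true, y_pred))
assert np.isclose(fpr, 1 - specificity)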
Example #35
0
### Prediction by Naive Bayes ###
#################################

test_data = ["A very close game"]

vectorizer = CountVectorizer()
# NOTE: this fits a fresh vectorizer on the test sentence alone; the column
# mismatch with the training vocabulary is patched up via the concat below
vec_test = vectorizer.fit_transform(test_data).toarray()
vec_test
vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0

df_tmp = df_X.drop(range(0, len(df_X)))  # empty frame that keeps the training columns
df_test = pd.DataFrame(data=vec_test, columns=vectorizer.get_feature_names(), index=["test"])
df_test = pd.concat([df_tmp, df_test]).fillna(0).astype('int')  # vocabulary missing from test -> 0
df_test

# Prediction
clf.predict(df_test)
clf.predict_log_proba(df_test)
#clf.predict_proba(df_test)

#%%
clf.predict_log_proba(X)

#%%
clf.predict_proba(X)

#%%
clf.predict(X)

#%%
#clf.get_params()
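The example above refits the vectorizer on the test sentence and then repairs the column mismatch with a DataFrame concat. The following is a sketch of the more conventional pattern, with an invented toy corpus (df_X and clf in the example come from earlier, unshown cells): fit the vectorizer once on the training corpus and reuse its transform() so test vectors automatically share the training vocabulary.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

# invented toy corpus; GaussianNB mirrors the surrounding examples, though
# MultinomialNB is the more usual choice for raw term counts
train_texts = ["a great game", "the election was over",
               "very clean match", "a clean but forgettable game"]
train_labels = ["sports", "not sports", "sports", "sports"]

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts).toarray()  # fit only on train

clf = GaussianNB()
clf.fit(X_train, train_labels)

# transform() reuses the training vocabulary, so no column patching is needed
X_test = vectorizer.transform(["A very close game"]).toarray()
print(clf.predict(X_test))
print(clf.predict_log_proba(X_test))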
Example #36
0
clf = GaussianNB()      # priors defaults to None
temp = clf.partial_fit(X, y, classes=[1, 2],
                       sample_weight=np.array([0.05, 0.05, 0.1, 0.1,
                                               0.1, 0.2, 0.2, 0.2]))
print('clf.partial_fit: ', temp)
print('clf.class_prior_: ', clf.class_prior_)

# predict(X): return the predicted class labels for the test samples
temp = clf.predict([[-6,-6],[4,5]])
print('clf.predict([[-6,-6],[4,5]]) is ', temp)
print('')

# predict_proba(X): return the predicted probability of each class for the test samples
temp = clf.predict_proba([[-6,-6],[4,5]])
print('clf.predict_proba([[-6,-6],[4,5]]) is ', '\n', temp)

# predict_log_proba(X): return the log of the predicted class probabilities
temp = clf.predict_log_proba([[-6,-6],[4,5]])
print('clf.predict_log_proba([[-6,-6],[4,5]]) is ', '\n', temp)

print('')

# score(X, y, sample_weight=None): return the mean accuracy on the given test data and labels
temp = clf.score([[-6, -6], [-4, -2], [-3, -4], [4, 5]], [1, 1, 2, 2])
print('clf.score is ', temp)

temp = clf.score([[-6, -6], [-4, -2], [-3, -4], [4, 5]], [1, 1, 2, 2],
                 sample_weight=[0.3, 0.2, 0.4, 0.1])
print('clf.score with weight is ', temp)

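What partial_fit is for, as a self-contained sketch (toy mini-batches invented for illustration): the model is trained incrementally, one batch at a time, and classes must be supplied on the first call so the full label set is known up front.

import numpy as np
from sklearn.naive_bayes import GaussianNB

rng = np.random.RandomState(0)
clf = GaussianNB()

for i in range(3):  # three mini-batches of 20 samples each
    X_batch = np.vstack([rng.normal(-3, 1, (10, 2)),
                         rng.normal(2, 1, (10, 2))])
    y_batch = np.array([1] * 10 + [2] * 10)
    # classes is only required on the first call
    clf.partial_fit(X_batch, y_batch, classes=[1, 2] if i == 0 else None)

print(clf.predict([[-6, -6], [4, 5]]))  # expected: [1 2]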
Example #37
0
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4],
              [-5, -5], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB(priors=[0.625, 0.375])  # priors defaults to None
clf.fit(X, y, sample_weight=None)  # fit on X (feature vectors) and y (class labels); sample_weight holds per-sample weights
print(clf.class_prior_)  # class_prior_: prior probability of each class
print(clf.priors)  # priors: the priors passed in at construction, same values as class_prior_ here
print(clf.class_count_)  # class_count_: number of training samples in each class
print(clf.theta_)  # theta_: mean of each feature per class
print(clf.sigma_)  # sigma_: variance of each feature per class (var_ in scikit-learn >= 1.0)
print(clf.get_params(deep=True))  # get_params(deep=True): return the parameters (priors) as a dict
clf.set_params(priors=[0.6, 0.4])  # set_params(**params): set the estimator's priors
print(clf.get_params(deep=True))
print(clf.predict([[-6, -6], [4, 5]]))  # predict the class of each sample
print(clf.predict_proba([[-6, -6], [4, 5]]))  # predict_proba(X): probability of each class per sample
print(clf.predict_log_proba([[-6, -6], [4, 5]]))  # predict_log_proba(X): log of the class probabilities
print(clf.score([[-6, -6], [-4, -2], [-3, -4], [4, 5]],
                [1, 1, 2, 2]))  # score(X, y, sample_weight=None): mean accuracy on the given test data

# output:
# [0.625 0.375]
# [0.625, 0.375]
# [5. 3.]
# [[-3. -3.]
#  [ 2.  2.]]
# [[2.00000001 2.00000001]
#  [0.66666667 0.66666667]]
# {'priors': [0.625, 0.375]}
# {'priors': [0.6, 0.4]}
# [1 2]
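The fitted attributes printed above are enough to reproduce predict_log_proba by hand, which makes their meaning concrete. A sketch on the same toy data (note the variance attribute is var_ in scikit-learn >= 1.0 and sigma_ before that): theta_ and the per-class variances parameterise one Gaussian per class and feature, and normalising the joint log likelihood gives the log posterior.

import numpy as np
from scipy.special import logsumexp
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4],
              [-5, -5], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB().fit(X, y)
var = getattr(clf, 'var_', None)
if var is None:
    var = clf.sigma_  # older scikit-learn releases

x = np.array([-6.0, -6.0])
# joint log likelihood per class: ln P(c) + sum_j ln N(x_j | theta_[c, j], var[c, j])
jll = (np.log(clf.class_prior_)
       - 0.5 * np.sum(np.log(2.0 * np.pi * var), axis=1)
       - 0.5 * np.sum((x - clf.theta_) ** 2 / var, axis=1))
print(jll - logsumexp(jll))           # manual log posterior
print(clf.predict_log_proba([x])[0])  # matches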
Example #38
0
class MixedNB(BaseNB):
    """
    Implementation of Naive Bayes for datasets with continuous and
    categorical attributes

    1. separate categorical and continuous attributes into individual datasets
    2. process categorical attributes accordingly to MultinomialNB
    3. process continuous attributes accordingly to GaussianNB
    """
    def __init__(self):
        self.mNB = CategoricalNB()  # swapped in for MultinomialNB()
        self.gNB = GaussianNB()

    def _joint_log_likelihood(self, X):
        """
        predict_log_proba for both GaussianNB and CategoricalNB returns a numpy array of
        shape (n_samples, n_classes) representing the log posterior, calculated as:

        ln(posterior_categorical) = ln(class_prior) + ln(likelihood_categorical)

        ln(posterior_Gaussian) = ln(class_prior) + ln(likelihood_continuous)

        *** ASSUMPTION ***
        Assuming ln(class_prior) is the same for the categorical and the continuous
        sub-model, the posterior for a dataset of mixed continuous and categorical
        attributes can be combined as derived below.

        *** JUSTIFYING THE ASSUMPTION ***
        Is the class prior of CategoricalNB the same as the class prior of GaussianNB?

        GaussianNB's class prior is simply the relative frequency of each class:

        self.class_prior_ = self.class_count_ / self.class_count_.sum()

        For CategoricalNB the value of class_log_prior_ depends on the input parameters:

        - if class_prior is given, class_log_prior_ is log(class_prior)
        - if fit_prior is True (the default), class_log_prior_ is the same prior as
          GaussianNB's but with log applied
        - otherwise, class_log_prior_ is uniform

        So with the default settings the two priors agree, and:

        posterior_mixed = class_prior * likelihood_mixed

                        = class_prior * likelihood_categorical * likelihood_continuous

                        = class_prior * likelihood_categorical * class_prior * likelihood_continuous / class_prior

                        = posterior_categorical * posterior_Gaussian / class_prior

        ln(posterior_mixed) = ln(posterior_categorical) + ln(posterior_Gaussian) - ln(class_prior)

        (predict_log_proba returns normalised posteriors, so each term carries an extra
        per-sample constant; these cancel when BaseNB renormalises the result.)

        :param X: DataFrame with categorical (object dtype) and numeric columns
        :return: array of shape (n_samples, n_classes) of combined log posteriors
        """

        categorical_x = cat2cont(X.select_dtypes(include=['object']))
        continuous_x = X.select_dtypes(include=np.number)

        joint_log_prob_categorical = self.mNB.predict_log_proba(categorical_x)
        joint_log_prob_continuous = self.gNB.predict_log_proba(continuous_x)

        return joint_log_prob_categorical + joint_log_prob_continuous - self.mNB.class_log_prior_

    def fit(self, X, y, sample_weight=None):
        # split X into categorical and continuous
        X_categorical = X.select_dtypes(include=['object'])
        X_continuous = X.select_dtypes(include=np.number)

        X_categorical = cat2cont(X_categorical)
        self.classes_ = np.unique(y)
        self.mNB.fit(X_categorical, y)
        self.gNB.fit(X_continuous, y)