/
naive_bayes.py
136 lines (90 loc) · 3.76 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding: utf-8 -*-
#author: ANDY
import numpy as np
import random
from sklearn import cross_validation
class naive_bayes:
def __init__(self,trainx,trainy,testx,testy,n,a,c):
self.n=n#the number of x
self.c=c#c==[0,1]
self.a=a#the features of x
self.len_a=len(a)
self.len_c=len(c)
#define x and y
self.trainx,self.trainy,self.testx,self.testy=trainx,trainy,testx,testy
#read data from input
def read(self,filename):
with open(filename,'r')as f:
x,y=f.readlines()
trainx,trainy,testx,testy=cross_validation(x,y)
return trainx,trainy,testx,testy
#calculate the probability
def cal_probab(m,n):
return m/n
#calculate the occurrence numbers of all possible probability events
def cal_occur(self,x,y):
#define a matrix to store the occurrence numbers
countedlist=np.zeros(self.len_c,self.len_a)
count=0
for feature in self.a:
for i in range(self.len_a):
if feature==x[i] and y[i]==0:
countedlist[0][count]+=1
else:
if feature==x[i] and y[i]==1:
countedlist[1][count]+=1
count+=1
return countedlist
# train the data
def train(self):
posteriori_event=self.cal_occur(self.trainx,self.trainy)
y_event=np.zeros(self.len_c)
#calculate the number of each y events
sum_event=0
for i in range(self.len_c):
for j in range(self.len_a):
y_event[i]+=posteriori_event[i][j]
#calculate the sum of the number of the posteriori events
for i in range(self.len_c):
sum_event+=y_event[i]
#calculate the probability of each y event
y_probab=np.zeros(self.len_c)
for i in range(self.len_c):
y_probab[i]=self.cal_probab(y_event,sum_event)
#calculate the probability of each posteriori event
posteriori_probab=np.zeros(self.len_c,self.len_a)
for i in range(self.len_a):
for j in range(self.len_c):
posteriori_probab[i][j]=self.cal_probab(posteriori_event[i][j],sum_event)
return posteriori_probab,y_probab
def predict(self,testx):
posteriori_probab,y_probab=self.train()
test_probab=np.zeros(self.len_c)
test_probab+=1
test_predict=0
#predix
len_testx=len(testx)
for i in range(len_testx):
for j in range(self.len_a):
if self.testx[i] == self.a[j]:
test_probab[0]=test_probab[0]*posteriori_probab[0][j]*y_probab[0]
test_probab[1]=test_probab[1]*posteriori_probab[1][j]*y_probab[1]
#rank the most probable y and select it as predicted class
if test_probab[0]>test_probab[1]:
test_predict=test_probab[0]
else:
test_predict=test_probab[1]
return test_predict
def score(self):
score=0
len_testx=len(self.testx)
for i in range(len_testx):
testy_predict=self.predict(self.testx[i])
if testy_predict==self.testy[i]:
score+=1
score/=len_testx
return score
def generate_data(n,nc):
pass
if __name__=="__main__":
pass