Code example #1
File: naive_bayes.py  Project: bopopescu/Data-Science
    def fit(self, X, y):
        #if issparse(X):
        #    X = X.todense()
        #if issparse(y):
        #    y = y.todense()
        #X, y = utils.check_fit_input(X, y, binary=True)
        m, n = X.shape

        Y = np.array([1 - y, y])

        # Calculate the number of occurrences of each feature for y={0, 1}.
        # X_count is an array of shape (2, n) where entry (i, j) is the number
        # of occurrences of feature j given y=i.
        X_count = np_dot(Y, X)

        # Calculate the total number of feature occurrences for y={0, 1}
        # Y_count.shape = (2, )
        Y_count = Y.sum(axis=1).astype(float)

        X_count_smoothed = X_count + self.alpha
        Y_count_smoothed = Y_count + self.alpha

        Phi_X = np_dot(np.diag(X_count_smoothed.sum(axis=1)**-1),
                       X_count_smoothed)
        print "X=%s" % X
        print "X_count=%s" % X_count
        print "Y=%s" % Y

        print "Phi_X=%s" % Phi_X
        print np.where(Phi_X <= 0)
        self.Phi_X_log = np.log(Phi_X)
        self.Phi_Y_log = np.log(Y_count_smoothed / Y_count_smoothed.sum())
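A minimal standalone sketch of the smoothed multinomial estimate that fit computes above, on a toy count matrix. This is not from the project: np_dot is assumed to be an alias for np.dot defined elsewhere in naive_bayes.py, so plain NumPy calls are used here.

import numpy as np

# Toy word-count matrix: 4 documents, 3 features, binary labels.
X = np.array([[2, 0, 1],
              [1, 1, 0],
              [0, 3, 1],
              [0, 2, 2]])
y = np.array([0, 0, 1, 1])
alpha = 1.0  # Laplace smoothing constant, playing the role of self.alpha

Y = np.array([1 - y, y])                  # shape (2, m) class-indicator matrix
X_count = Y.dot(X)                        # shape (2, n): count of feature j given y=i
X_count_smoothed = X_count + alpha

# Row-normalize to get p(feature j | y=i); stored as logs for prediction time.
Phi_X = X_count_smoothed / X_count_smoothed.sum(axis=1, keepdims=True)
Phi_X_log = np.log(Phi_X)

Y_count = Y.sum(axis=1).astype(float)
Phi_Y_log = np.log((Y_count + alpha) / (Y_count + alpha).sum())

print("Phi_X_log =", Phi_X_log)
print("Phi_Y_log =", Phi_Y_log)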
Code example #2
File: naive_bayes.py  Project: bopopescu/Data-Science
    def predict_proba(self, X):
        m = X.shape[0]
        metric = np.zeros((m, 2))
        # p(X|y=0) p(y=0), p(X|y=1) p(y=1)
        metric[:, 0] = np_dot(1 - X, self.Phi_X_y0_log[:, 0]) + np_dot(
            X, self.Phi_X_y0_log[:, 1]) + self.Phi_Y_log[0]
        metric[:, 1] = np_dot(1 - X, self.Phi_X_y1_log[:, 0]) + np_dot(
            X, self.Phi_X_y1_log[:, 1]) + self.Phi_Y_log[1]
        return metric
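Per row, the score above is the Bernoulli joint log-likelihood: log p(x|y=i) + log p(y=i) = sum_j [ x_j * log(phi_ij) + (1 - x_j) * log(1 - phi_ij) ] + log p(y=i). A small self-contained sketch with made-up parameters, again calling np.dot directly on the assumption that np_dot is such an alias:

import numpy as np

# Made-up per-feature Bernoulli parameters phi_ij = p(x_j = 1 | y = i).
phi_y0 = np.array([0.8, 0.3, 0.5])
phi_y1 = np.array([0.2, 0.6, 0.9])

# Same column layout predict_proba expects: col 0 = log(1 - phi), col 1 = log(phi).
Phi_X_y0_log = np.log(np.c_[1 - phi_y0, phi_y0])
Phi_X_y1_log = np.log(np.c_[1 - phi_y1, phi_y1])
Phi_Y_log = np.log([0.5, 0.5])            # uniform class prior

X = np.array([[1., 0., 1.],
              [0., 1., 1.]])              # two binary feature vectors

metric = np.zeros((X.shape[0], 2))
metric[:, 0] = (1 - X).dot(Phi_X_y0_log[:, 0]) + X.dot(Phi_X_y0_log[:, 1]) + Phi_Y_log[0]
metric[:, 1] = (1 - X).dot(Phi_X_y1_log[:, 0]) + X.dot(Phi_X_y1_log[:, 1]) + Phi_Y_log[1]

# Unnormalized joint log-probabilities; argmax along axis 1 gives the predicted class.
print(metric, metric.argmax(axis=1))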
Code example #3
File: naive_bayes.py  Project: bopopescu/Data-Science
    def fit(self, X, y):
        X, y = utils.check_fit_input(X, y, binary=True)
        m, n = X.shape

        Y = np.array([1 - y, y])
        # Calculate the number of occurrences of each feature for y={0, 1}.
        # X_count is an array of shape (2, n) where entry (i, j) is the number
        # of occurrences of feature j given y=i.
        X_count = np_dot(Y, X)
        Y_count = Y.sum(axis=1).astype(float)

        X_count_smoothed = X_count + self.alpha
        Y_count_smoothed = Y_count + 2 * self.alpha

        Phi_X = np_dot(np.diag(Y_count_smoothed**-1), X_count_smoothed)

        self.Phi_X_y0_log = np.log(np.c_[1 - Phi_X[0, :], Phi_X[0, :]])
        self.Phi_X_y1_log = np.log(np.c_[1 - Phi_X[1, :], Phi_X[1, :]])
        self.Phi_Y_log = np.log((Y_count + 1) / (Y_count.sum() + 2))
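As a sanity check of the smoothing above, here is a minimal sketch (not from the project) of the same add-alpha estimate phi_ij = (count of feature j on in class i + alpha) / (class-i sample count + 2*alpha) on a tiny binary dataset:

import numpy as np

X = np.array([[1, 0, 1],
              [1, 1, 0],
              [0, 1, 1],
              [0, 0, 1]], dtype=float)    # binary design matrix, 4 samples x 3 features
y = np.array([0, 0, 1, 1])
alpha = 1.0

Y = np.array([1 - y, y])                  # class-indicator matrix, shape (2, m)
X_count = Y.dot(X)                        # times feature j is "on" within class i
Y_count = Y.sum(axis=1).astype(float)     # samples per class

# phi_ij = (X_count[i, j] + alpha) / (Y_count[i] + 2 * alpha)
Phi_X = (X_count + alpha) / (Y_count + 2 * alpha)[:, None]
print(Phi_X)                              # every entry strictly inside (0, 1)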
Code example #4
File: naive_bayes.py  Project: bopopescu/Data-Science
    def predict_proba(self, X):
        # X.shape = (m, n), Phi_X_log.shape = (2, n)
        log_prob = np_dot(X, self.Phi_X_log.T) + self.Phi_Y_log

        # The log probabilities above are unnormalized, which is enough
        # to decide the class of y. For consumers that need calibrated
        # probabilities (e.g. multi-class schemes built on top of binary
        # classifiers), we also return normalized values. Directly
        # exponentiating log_prob can underflow, so we only exponentiate
        # the difference between the two columns.
        prob_diff = np.exp(log_prob[:, 0] - log_prob[:, 1])
        prob = np.zeros(log_prob.shape)
        prob[:, 0] = 1 / (1 + prob_diff**-1)
        prob[:, 1] = 1 / (1 + prob_diff)
        return prob
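The normalization trick above follows from p(y=0|x) = exp(l0) / (exp(l0) + exp(l1)) = 1 / (1 + exp(l1 - l0)): only the difference of the two log scores is exponentiated, so large negative magnitudes do not underflow. A small sketch illustrating it (the log-score values are made up):

import numpy as np

# Unnormalized joint log-probabilities so negative that np.exp(log_prob)
# would underflow to 0.0 in float64 and naive row-normalization would give nan.
log_prob = np.array([[-800.0, -802.0],
                     [-1000.0, -999.0]])

prob_diff = np.exp(log_prob[:, 0] - log_prob[:, 1])   # exp(l0 - l1), well within range
prob = np.zeros(log_prob.shape)
prob[:, 0] = 1 / (1 + prob_diff**-1)                  # = exp(l0) / (exp(l0) + exp(l1))
prob[:, 1] = 1 / (1 + prob_diff)                      # = exp(l1) / (exp(l0) + exp(l1))
print(prob)   # rows sum to 1: approximately [[0.881, 0.119], [0.269, 0.731]]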