# Example #1
# 0
	def train(self, x_supervised, x_unsupervised, y_supervised):
		"""
		Train the semi-supervised Naive Bayes classifier with EM-style
		iteration, using both labelled and unlabelled data.

		Positional arguments:

		    -- x_supervised: [N_sup, in_features] labelled inputs
		    -- x_unsupervised: [N_unsup, in_features] unlabelled inputs
		    -- y_supervised: [N_sup] labels for x_supervised

		The fitted classifier is stored on self.clf.
		"""

		# clf = GaussianNB()
		clf = BernoulliNB()
		# Initial fit on the labelled data only.
		clf.fit(x_supervised, y_supervised)

		old_likelihood = 1

		# Bug fix: the original decremented self.max_rounds in place, so a
		# second call to train() would never iterate. Use a local counter
		# and leave self.max_rounds untouched.
		rounds_left = self.max_rounds
		while rounds_left > 0:

			rounds_left -= 1
			# E-step: label the unlabelled data with the current model.
			predi = clf.predict(x_unsupervised)
			# M-step: refit using the predicted labels.
			# NOTE(review): this refits on the unlabelled data only,
			# discarding the labelled set — confirm this matches the
			# intended EM variant.
			clf.fit(x_unsupervised, predi)
			# Recompute the total log-likelihood over both data sets.
			# (The original also called clf.predict(x_supervised) here and
			# before the loop; both results were unused and are removed.)
			unsupervised_log_matrix = clf._joint_log_likelihood(x_unsupervised)
			supervised_log_matrix = clf._joint_log_likelihood(x_supervised)

			total_likelihood = self.get_log_likelihood(
				unsupervised_log_matrix, supervised_log_matrix, y_supervised)

			# Stop once the likelihood change satisfies the stopping rule.
			if self._stopping_time(old_likelihood, total_likelihood):

				break

			old_likelihood = total_likelihood.copy()
		self.clf = clf
ys = pickle.load(open('binarized_ys.pkl', 'rb'))
print("Done.")
# NOTE(review): each iteration overwrites the previous split, so only the
# split for i == 1 survives this loop — confirm that is intentional.
for i in range(0, 2):
    x_train, x_test, y_train, y_test = train_test_split(
        Xs[i], ys[i], test_size=1. / 3, random_state=3330)


features = len(x_train)

objects = len(y_train)


clf = BernoulliNB(alpha=0, binarize=0.0, class_prior=None, fit_prior=True)

clf = clf.fit(x_train, y_train)

# Joint log likelihood matrix: one row per sample, one column per class.
a = clf._joint_log_likelihood(x_train)

print("joint log likelyhood train")

print(a)

# Normalise each row's two class scores into relative weights.
res = []
for i in range(0, objects):
    denom = a[i][0] + a[i][1]
    res.append([a[i][0] / denom, a[i][1] / denom])

# Sum the weight of each sample's TRUE class.
# Bug fix: the original tested y_train[1] (a fixed index) on every
# iteration instead of y_train[i]; also renamed `sum` -> `total` to stop
# shadowing the builtin, and replaced `.__eq__(False)` with a truth test.
total = 0
for i in range(0, objects):
    if not y_train[i]:
        total += res[i][0]
    else:
        total += res[i][1]
#print("Alpha LIST : " ,alphaVal )
# For 10 datasets
# For each of the 10 datasets, fit a BernoulliNB per alpha value and sum
# the joint log likelihood of each sample's true class.
for e in range(0, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        Xs[e], ys[e], test_size=1. / 3, random_state=6099)  # A20396099
    # For 15 alpha values
    for alp in range(0, 15):
        # BernoulliNB classifier
        clf = BernoulliNB(alpha=alphaVal[alp],
                          binarize=0.0,
                          fit_prior=True,
                          class_prior=None)
        # fitting model on train data
        clf.fit(X_train, y_train)
        # prediction for train data using jll
        predict_train = clf._joint_log_likelihood(X_train)
        # prediction for test data using jll
        predict_test = clf._joint_log_likelihood(X_test)
        log_train, log_test = 0, 0
        #         print("Train : ", log_train)
        #         print("Train : ", log_train)
        #         print("Predict Train jll: ", predict_train)
        #         print("Predict Test jll: ", predict_test)
        #         summing test predections
        # Column 1 is the jll for the True class, column 0 for False.
        for test in range(len(predict_test)):
            if y_test[test] == True:
                log_test += predict_test[test][1]
            else:
                log_test += predict_test[test][0]
#         summing train predections
        # NOTE(review): the source snippet is truncated here — the body of
        # this loop (presumably mirroring the test summation above) and any
        # code storing log_train/log_test are missing.
        for train in range(len(predict_train)):
test_jll = np.zeros((10, 15))

# For each dataset, fit one BernoulliNB per alpha and record the summed
# joint log likelihood of the true classes for the train and test splits.
for i in range(0, 10):
    # Split datasets
    x_train, x_test, y_train, y_test = train_test_split(Xs[i],
                                                        ys[i],
                                                        test_size=1. / 3,
                                                        random_state=7000)
    # Bug fix: the original set idx = 0 once per dataset and never
    # incremented it, so every alpha overwrote column 0 of the result
    # tables; enumerate gives each alpha its own column.
    for idx, j in enumerate(alphas):
        # 1. Create new Bernoulli Naive Bayes model using alpha value
        mod = BernoulliNB(alpha=j)
        # Fit the model to the training set
        mod.fit(x_train, y_train)
        # Compute the joint log likelihood for the training set, store it
        # in the train_jll 2d array.
        total_res = mod._joint_log_likelihood(x_train)
        # Booleans -> 0/1 column indices into the jll matrix.
        y_train_binary = y_train * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_train)):
            entry_val += total_res[k][y_train_binary[k]]
        # Store result
        train_jll[i][idx] = entry_val
        # 2. Compute the joint log likelihood for the testing set, store it
        # in the test_jll 2d array.
        total_res = mod._joint_log_likelihood(x_test)
        y_test_binary = y_test * 1
        entry_val = 0
        # Sum-up by matching true labels
        for k in range(0, len(y_test)):
            entry_val += total_res[k][y_test_binary[k]]
        test_jll[i][idx] = entry_val
# Smoothing values 10**-7 .. 10**7 (15 values).
distribution = []
for i in range(-7, 8):
    distribution.append(10**i)

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(Xs[i],
                                                        ys[i],
                                                        test_size=1. / 3,
                                                        random_state=4435)

    for j in range(15):
        classifier = BernoulliNB(alpha=distribution[j])
        classifier.fit(X_train, y_train)

        # Sum the joint log likelihood of each training sample's true
        # class: column 1 when the label is True, column 0 otherwise.
        train_Y_score = classifier._joint_log_likelihood(X_train)
        individual_joint_likelihood = 0.0
        for k in range(0, len(y_train)):
            if y_train[k] == True:
                individual_joint_likelihood += train_Y_score[k][1]
            else:
                individual_joint_likelihood += train_Y_score[k][0]

        train_joint_likelihood[i][j] = individual_joint_likelihood

        # Same summation for the test split.
        test_Y_score = classifier._joint_log_likelihood(X_test)
        individual_joint_likelihood = 0.0
        for k in range(0, len(y_test)):
            if y_test[k] == True:
                individual_joint_likelihood += test_Y_score[k][1]
            else:
                # NOTE(review): the source snippet is truncated here — the
                # else-branch body and the store into the test results
                # table are missing.
# Example #6
# 0
from sklearn.naive_bayes import BernoulliNB

# Security note: pickle.load executes arbitrary code from untrusted files —
# only load pickles you generated yourself.
# Fix: open files via `with` so the handles are closed deterministically
# (the original leaked them).
with open('binarized_xs.pkl', 'rb') as f:
    Xs = pickle.load(f)
with open('binarized_ys.pkl', 'rb') as f:
    ys = pickle.load(f)

train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))

for i_dataset in range(10):
    X, y = Xs[i_dataset], ys[i_dataset]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1. / 3,
                                                        random_state=1527)
    # Boolean labels -> 0/1 column indices into the jll matrices.
    y_train_indices = [0 if i == False else 1 for i in y_train]
    y_test_indices = [0 if i == False else 1 for i in y_test]

    for i_alpha in range(-7, 8):
        clf = BernoulliNB(alpha=10**i_alpha)
        clf.fit(X_train, y_train)
        # Perf fix: compute each jll matrix once per fitted model; the
        # original recomputed it inside the per-sample loops, i.e. one full
        # model evaluation per sample instead of one per split.
        jll_train = clf._joint_log_likelihood(X_train)
        jll_test = clf._joint_log_likelihood(X_test)
        sum_train_jll, sum_test_jll = 0, 0
        for i in range(len(y_train)):
            sum_train_jll += jll_train[i][y_train_indices[i]]
        for i in range(len(y_test)):
            sum_test_jll += jll_test[i][y_test_indices[i]]
        # Columns run alpha = 1e-7 .. 1e7, left to right.
        train_jll[i_dataset][i_alpha + 7] = sum_train_jll
        test_jll[i_dataset][i_alpha + 7] = sum_test_jll

with open('result.pkl', 'wb') as f:
    pickle.dump((train_jll, test_jll), f)
# Example #7
# 0
train_jll = np.zeros((10, 15))
test_jll = np.zeros((10, 15))
# Anumber A20406657
for i in range(len(Xs)):
    # One train/test split per dataset; the seed encodes the id above.
    X_train, X_test, y_train, y_test = train_test_split(
        Xs[i], ys[i], test_size=1. / 3, random_state=int("6657"))
    #print(X_train)
    for j in range(len(alpha_list)):
        # Fit a Bernoulli naive Bayes model with this smoothing value.
        clf = BernoulliNB(alpha=alpha_list[j],
                          binarize=0.0,
                          class_prior=None,
                          fit_prior=True)
        clf.fit(X_train, y_train)
        # Joint log likelihood matrices: one row per sample, one column
        # per class (column 1 = True, column 0 = False).
        joint_X_train = clf._joint_log_likelihood(X_train)
        joint_X_test = clf._joint_log_likelihood(X_test)
        # Accumulate the jll of each sample's true class.
        sum_1 = 0
        for k in range(0, len(joint_X_train)):
            sum_1 += joint_X_train[k][1] if y_train[k] == True else joint_X_train[k][0]
        sum_2 = 0
        for m in range(0, len(joint_X_test)):
            sum_2 += joint_X_test[m][1] if y_test[m] == True else joint_X_test[m][0]

        train_jll[i][j] = sum_1
        test_jll[i][j] = sum_2