def model_gaussian_rand_var_train(self, data, truth):
    """Collect per-label statistics and the label-1 prior.

    For each label in (0, 1) the subset selected by
    hw3.get_sub_at_value(data, truth, label) is summarised with
    hw3.get_mus and hw3.get_std_dev.  The prior is sum(truth)/len(truth),
    which presumes truth holds 0/1 labels -- confirm with the caller.

    Side effect: stores the prior on self.y_prob.

    Returns [mus, std_dev, prior], where mus and std_dev are dicts keyed
    by label (0 and 1).
    """
    means_by_label = {}
    spread_by_label = {}
    for cls in (0, 1):
        rows_for_cls = hw3.get_sub_at_value(data, truth, cls)
        means_by_label[cls] = hw3.get_mus(rows_for_cls)
        spread_by_label[cls] = hw3.get_std_dev(rows_for_cls)

    # Computed once and reused for both the attribute and the return value.
    positive_rate = float(sum(truth)) / len(truth)
    self.y_prob = positive_rate
    return [means_by_label, spread_by_label, positive_rate]
def model_average_predict(self, data_row, theta=.5):
    """Label each row 0 or 1 using the "above column average" Bayes model.

    Each feature value is compared with the matching entry of
    hw3.get_mus(data_row) (presumably the per-column averages).  For a
    row X the class scores are

        score_1 = P(Y=1) * product over columns of P(x_c | Y=1)
        score_0 = P(Y=0) * product over columns of P(x_c | Y=0)

    where P(x_c | Y=k) is the stored "above average" probability when the
    value is above the average and its complement otherwise.  The row is
    labelled 1 when score_1 / (score_0 + score_1) > theta.  P(X) is common
    to both classes and cancels in that ratio, so it is not computed.

    self.model is read as
    [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
    (the layout model_average_train returns); index 2 is not needed here.

    NOTE(review): the averages are recomputed from data_row, i.e. from the
    rows being classified rather than from training data -- confirm this
    is intended.

    data_row -- sequence of feature rows to classify.
    theta    -- decision threshold on the normalised score for class 1.

    Returns a list with one 0/1 prediction per row.
    """
    mus = hw3.get_mus(data_row)
    prob_over_given_1 = self.model[0]
    prob_over_given_0 = self.model[1]
    prob_y1 = self.model[3]

    predict = []
    for row in data_row:
        prob_1 = 1
        prob_0 = 1
        for c, value in enumerate(row):
            if value > mus[c]:
                prob_x1 = prob_over_given_1[c]
                prob_x0 = prob_over_given_0[c]
            else:
                prob_x1 = 1 - prob_over_given_1[c]
                prob_x0 = 1 - prob_over_given_0[c]
            prob_1 = prob_1 * prob_x1
            prob_0 = prob_0 * prob_x0

        # Apply the class priors after the likelihood product.
        prob_1 = prob_1 * prob_y1
        prob_0 = prob_0 * (1 - prob_y1)

        total = prob_0 + prob_1
        if total == 0:
            # Both scores vanished (some column probability was exactly 0
            # or 1 for each class).  The features carry no usable evidence,
            # so fall back to the prior instead of dividing by zero.
            prob_norm = prob_y1
        else:
            prob_norm = float(prob_1) / total

        predict.append(1 if prob_norm > theta else 0)
    return predict
def model_average_train(self, data_row, truth):
    """Fit the "above column average" Bayes model.

    get_prob_over(rows, mus) is evaluated on all rows, on the label-1
    subset and on the label-0 subset (subsets come from
    hw3.get_sub_at_value), always against mus = hw3.get_mus(data_row).
    The prior is sum(truth)/len(truth), which presumes 0/1 labels.

    If the two per-class lists differ in length (presumably because one
    class has no rows -- confirm against get_prob_over), the shorter one
    is padded with zeros so both can be indexed by the same column
    positions.

    Side effect: stores the prior on self.y_prob.

    Returns [prob_over_given_1, prob_over_given_0, prob_over, prob_y1].
    """
    mus = hw3.get_mus(data_row)
    is_not_spam = hw3.get_sub_at_value(data_row, truth, 0)
    is_spam = hw3.get_sub_at_value(data_row, truth, 1)

    prob_over = get_prob_over(data_row, mus)
    prob_over_given_1 = get_prob_over(is_spam, mus)
    prob_over_given_0 = get_prob_over(is_not_spam, mus)

    # Pad (rather than replace) the shorter list: the result keeps any
    # estimates it already had and ends up as long as the other list.
    missing = len(prob_over_given_1) - len(prob_over_given_0)
    if missing > 0:
        prob_over_given_0 = list(prob_over_given_0) + [0] * missing
    elif missing < 0:
        prob_over_given_1 = list(prob_over_given_1) + [0] * (-missing)

    prob_y1 = float(sum(truth)) / len(truth)
    self.y_prob = prob_y1
    return [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
def test_GDA():
    """Print the sample data, then the result of hw3.GDA on it.

    The covariance passed to hw3.GDA is hw3.get_covar of the sample data
    with itself; the means come from hw3.get_mus.
    """
    samples = get_test_data()
    print(samples)
    covariance = hw3.get_covar(samples, samples)
    means = hw3.get_mus(samples)
    print(hw3.GDA(samples, means, covariance))
def test_get_mus():
    """Print the sample data, then what hw3.get_mus returns for it."""
    samples = get_test_data()
    print(samples)
    print(hw3.get_mus(samples))