def model_bin_train(self, data_row, truth, num_bins=2):
    """Build per-label bin models using per-column cutoffs.

    For every column the mean (mu), the mean of the values below mu
    (low_mu) and the mean of the values above mu (high_mu) are computed and
    turned into that column's cutoff list. The rows are then split by label
    (0 and 1), transposed, and handed to hw3.bins_per_column_by_col together
    with the cutoffs.

    Args:
        data_row: data as a list of rows; data_row[0] fixes the column count.
        truth: one label per row. It is summed to obtain self.y_prob, so it
            is presumably 0/1 valued -- TODO confirm.
        num_bins: 4 selects the cutoffs [min, low_mu, mu, high_mu]; any other
            value (including the default) selects the six-cutoff layout
            [min, mid(min, low_mu), mu, mid(mu, high_mu), high_mu,
            mid(high_mu, max)].

    Returns:
        dict mapping each label (0 and 1) to the value returned by
        hw3.bins_per_column_by_col for the rows carrying that label.

    Side effects:
        Sets self.y_prob to sum(truth) / len(truth) and self.cutoffs to the
        list of per-column cutoff lists.
    """
    # TODO add epsilon
    model = {}
    cutoffsc = [[] for _ in range(len(data_row[0]))]
    data_col = hw3.transpose_array(data_row)

    for j, column in enumerate(data_col):
        # Use this column's own extremes. min()/max() over the list of
        # columns would compare whole columns lexicographically instead.
        col_min = min(column)
        col_max = max(column)
        mu = np.mean(column)
        below = [value for value in column if value < mu]
        above = [value for value in column if value > mu]
        # A constant column has nothing strictly below/above its mean; fall
        # back to mu rather than taking the mean of an empty list (NaN).
        low_mu = np.mean(below) if below else mu
        high_mu = np.mean(above) if above else mu

        if num_bins == 4:
            cutoffsc[j] = [col_min, low_mu, mu, high_mu]
        else:
            # Midpoints are positions between two cutoffs, so they are
            # averages, not half-differences; this keeps the list ordered.
            # NOTE(review): low_mu and mid(low_mu, mu) are not part of this
            # layout; the six-entry shape is kept as it was -- confirm that
            # this is the intended binning.
            cutoffsc[j] = [
                col_min,
                (col_min + low_mu) / 2.0,
                mu,
                (mu + high_mu) / 2.0,
                high_mu,
                (high_mu + col_max) / 2.0,
            ]

    # A global equal-width binning used to be computed here as well, but its
    # result was overwritten right away by the per-column binning below.
    for label in [0, 1]:
        # transpose to go by column
        sub_data = hw3.transpose_array(
            hw3.get_sub_at_value(data_row, truth, label))
        # probability of bin given label
        model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)

    self.y_prob = float(sum(truth)) / len(truth)
    self.cutoffs = cutoffsc
    return model
def testTransposeArray():
    """Print a data sample before and after hw3.transpose_array.

    The sample is built from hw3.load_and_normalize_spambase(), restricted
    to its first three columns through utils.train_subset (called with 5 as
    the last argument) and converted with hw3.pandas_to_data.
    """
    frame = hw3.load_and_normalize_spambase()
    first_columns = frame.columns[0:3]
    sample = utils.train_subset(frame, first_columns, 5)
    rows = hw3.pandas_to_data(sample)
    # Single-argument parenthesised print: same output as the print statement.
    print(rows)
    transposed = hw3.transpose_array(rows)
    print(transposed)
def get_prob_over(data_by_row, mus):
    """Return, per column, the fraction of rows whose value exceeds mus[col].

    Args:
        data_by_row: data as a list of rows; transposed with
            hw3.transpose_array to walk it column by column.
        mus: one threshold per column, indexed like the columns.

    Returns:
        A flat list with one float per column: the number of values strictly
        greater than that column's threshold divided by len(data_by_row).
    """
    size = len(data_by_row)
    by_col = hw3.transpose_array(data_by_row)
    probability_above_mu = []
    for col, column in enumerate(by_col):
        mu_col = mus[col]
        total_over = sum(1 for value in column if value > mu_col)
        probability_above_mu.append(float(total_over) / size)
    return probability_above_mu
def model_average_predict(self, data_row, theta=.5):
    """Predict a 0/1 label for each row from above/below-average features.

    For each row calculate the probability that y is 1 and the probability
    that y is 0:

        P(Y|X) = ( P(X|Y) * P(Y) ) / ( P(X) )
        P(X|Y) = prob_over_given_c (probability that x is above average
                 when y = c, per column)
        P(Y)   = prob_y1 (probability that y is 1)

    P(X) is not evaluated; the two class scores are normalised against each
    other instead.

    Args:
        data_row: data as a list of rows. The per-column thresholds come from
            hw3.get_mus(data_row), i.e. from the data being predicted.
        theta: a row is labelled 1 when its normalised score for class 1 is
            strictly greater than theta.

    Returns:
        list with one 0/1 prediction per row.

    Reads self.model[0], self.model[1] and self.model[3].
    """
    mus = hw3.get_mus(data_row)
    # NOTE(review): index 0 is read as the "given y = 1" table and index 1 as
    # the "given y = 0" table -- confirm against the code that fills
    # self.model.
    prob_over_given_1 = self.model[0]
    prob_over_given_0 = self.model[1]
    prob_y1 = self.model[3]

    predict = []
    for row in data_row:
        prob_1 = 1
        prob_0 = 1
        for c, value in enumerate(row):
            if value > mus[c]:
                prob_x1 = prob_over_given_1[c]
                prob_x0 = prob_over_given_0[c]
            else:
                prob_x1 = 1 - prob_over_given_1[c]
                prob_x0 = 1 - prob_over_given_0[c]
            prob_1 = prob_1 * prob_x1
            prob_0 = prob_0 * prob_x0

        # The class prior is applied once per row, after the feature product.
        prob_1 = prob_1 * prob_y1
        prob_0 = prob_0 * (1 - prob_y1)

        total = prob_0 + prob_1
        # Both scores can be zero (a zero likelihood on each side, or
        # underflow over many columns); label such rows 0 instead of
        # dividing by zero.
        if total != 0 and float(prob_1) / total > theta:
            predict.append(1)
        else:
            predict.append(0)
    return predict
def initialize(self, data, k=2):
    """Create and seed the k mixture components.

    Steps, in order:
      1. build k EMModel objects and set each mean from
         mu_cheat(hw3.transpose_array(data), k);
      2. assign labels with self.assign_labels and call self.prevent_empty;
      3. give every component the covariance of the whole data set
         (hw3.get_covar(data)), a uniform weight of 1/k, and the likelihood
         returned by self.expectation.

    Args:
        data: data as a list of rows.
        k: number of components (default 2).

    Side effects:
        Sets self.k, self.labels and self.models.
    """
    self.k = k
    self.labels = list(range(self.k))
    models = [EMModel() for _ in range(self.k)]

    mucheat = mu_cheat(hw3.transpose_array(data), k)
    for ki in range(self.k):
        models[ki].mu = mucheat[ki]

    self.labels = self.assign_labels(data, models)
    self.prevent_empty(data)

    for ki in range(self.k):
        # get_covar is called once per component so that no two components
        # share one sigma object.
        models[ki].sigma = hw3.get_covar(data)
        # Uniform prior over components; equals .5 for the default k = 2 and
        # keeps the weights summing to 1 for any other k.
        models[ki].weight = 1.0 / self.k
        models[ki].likelihood = self.expectation(data, models[ki])

    self.models = models
def test_covar_matrix():
    """Print hw3.get_covar and np.cov results for the shared test data.

    Prints the data from get_test_data(), then hw3.get_covar(data, labels),
    then np.cov(hw3.transpose_array(data), labels), with labels fixed to
    [1, 0, 0, 0].
    """
    data = get_test_data()
    # Single-argument parenthesised print: same output as the print statement.
    print(data)
    labels = [1, 0, 0, 0]
    own_result = hw3.get_covar(data, labels)
    print(own_result)
    numpy_result = np.cov(hw3.transpose_array(data), labels)
    print(numpy_result)
def set_mus(self, data):
    """Store the mean of every column of data in self.mu (as a list)."""
    columns = hw3.transpose_array(data)  # to go by column
    self.mu = [np.mean(column) for column in columns]