def model_bin_train(self, data_row, truth, num_bins=2):
    """Train the binned model for labels 0 and 1 using per-column cutoffs.

    For every column, cutoffs are derived from the column mean ``mu`` and
    from the means of the values strictly below (``low_mu``) and strictly
    above (``high_mu``) that mean.  The rows of each label are then
    selected with ``hw3.get_sub_at_value``, transposed, and handed to
    ``hw3.bins_per_column_by_col`` together with those cutoffs.

    Side effects:
        ``self.y_prob`` is set to ``sum(truth) / len(truth)`` and
        ``self.cutoffs`` to the list of per-column cutoffs.

    Args:
        data_row: data as a sequence of rows; ``data_row[0]`` is used to
            size the cutoff list.
        truth: one label per row (presumably 0/1 -- confirm with caller).
        num_bins: ``4`` selects the four-cutoff layout; every other value
            (including the default ``2``) selects the six-cutoff layout.

    Returns:
        dict mapping each label (0 and 1) to the result of
        ``hw3.bins_per_column_by_col`` for that label's rows.
    """
    # TODO add epsilon
    #   (earlier draft: epsilon = float(alpha * 1) / len(covar_matrix))
    data_col = hw3.transpose_array(data_row)
    cutoffsc = [[] for _ in range(len(data_row[0]))]
    four_cutoffs = num_bins == 4

    # These bounds do not depend on the column, so compute them once
    # instead of several times per loop iteration.
    # NOTE(review): min()/max() over a list of columns compares the columns
    # lexicographically, so this is the first element of the "smallest" /
    # "largest" column -- not the minimum of column j and not the global
    # minimum.  Kept as-is because the intended bound is unclear.
    lowest = highest = None
    if len(data_col) > 0:
        lowest = min(data_col)[0]
        if not four_cutoffs:
            highest = max(data_col)[0]

    for j, column in enumerate(data_col):
        values = np.asarray(column)
        mu = values.mean()
        # An empty selection (e.g. a constant column) yields nan here.
        low_mu = values[values < mu].mean()
        high_mu = values[values > mu].mean()
        if four_cutoffs:
            cutoffsc[j] = [lowest, low_mu, mu, high_mu]
        else:
            # NOTE(review): the "/ 2" entries are half-differences, not
            # midpoints, so the list is not necessarily increasing --
            # confirm against hw3.bins_per_column_by_col.
            cutoffsc[j] = [
                lowest,
                (low_mu - lowest) / 2,
                mu,
                (high_mu - mu) / 2,
                high_mu,
                (highest - high_mu) / 2,
            ]

    model = {}
    for label in (0, 1):
        # Transpose so the helper walks the label's rows column by column.
        sub_data = hw3.transpose_array(
            hw3.get_sub_at_value(data_row, truth, label))
        model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)

    self.y_prob = float(sum(truth)) / len(truth)
    self.cutoffs = cutoffsc
    return model
def model_gaussian_rand_var_train(self, data, truth):
    """Collect per-label statistics for the Gaussian random-variable model.

    For each label in (0, 1) the matching rows are selected with
    ``hw3.get_sub_at_value`` and summarised with ``hw3.get_mus`` and
    ``hw3.get_std_dev`` (presumably per-column means and standard
    deviations -- confirm in hw3).

    Side effects:
        ``self.y_prob`` is set to ``sum(truth) / len(truth)``.

    Args:
        data: the training rows.
        truth: one label per row.

    Returns:
        ``[mus, std_dev, y_prob]`` where ``mus`` and ``std_dev`` are dicts
        keyed by label and ``y_prob`` is the value stored on ``self``.
    """
    means_by_label = {}
    spread_by_label = {}
    for cls in (0, 1):
        rows = hw3.get_sub_at_value(data, truth, cls)
        means_by_label[cls] = hw3.get_mus(rows)
        spread_by_label[cls] = hw3.get_std_dev(rows)

    prior = float(sum(truth)) / len(truth)
    self.y_prob = prior
    return [means_by_label, spread_by_label, prior]
def model_average_train(self, data_row, truth):
    """Train the "over the average" model for labels 0 and 1.

    ``hw3.get_mus`` is evaluated on the full data set and those values are
    passed to ``get_prob_over`` three times: for all rows, for the label-1
    rows and for the label-0 rows.  The layout of each ``get_prob_over``
    result is defined by that helper (the original note describes it as
    one ``[mu, var, prob]`` entry per column -- confirm there).

    If the two per-label results differ in length, the shorter one is
    padded with zeros up to the length of the longer one so both can be
    indexed in lockstep.

    Side effects:
        ``self.y_prob`` is set to ``sum(truth) / len(truth)``.

    Args:
        data_row: the training data as a sequence of rows.
        truth: one label per row.

    Returns:
        ``[prob_over_given_1, prob_over_given_0, prob_over, prob_y1]``.
    """
    mus = hw3.get_mus(data_row)
    is_not_spam = hw3.get_sub_at_value(data_row, truth, 0)
    is_spam = hw3.get_sub_at_value(data_row, truth, 1)

    prob_over = get_prob_over(data_row, mus)
    prob_over_given_1 = get_prob_over(is_spam, mus)
    prob_over_given_0 = get_prob_over(is_not_spam, mus)

    l0 = len(prob_over_given_0)
    l1 = len(prob_over_given_1)
    # Pad (rather than replace) the shorter result: replacing it with
    # abs(l1 - l0) zeros only produces matching lengths when the shorter
    # list is empty.  A new list is built so the helper's return value is
    # never mutated.
    if l1 > l0:
        prob_over_given_0 = list(prob_over_given_0) + [0] * (l1 - l0)
    elif l0 > l1:
        prob_over_given_1 = list(prob_over_given_1) + [0] * (l0 - l1)

    prob_y1 = float(sum(truth)) / len(truth)
    self.y_prob = prob_y1
    return [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
def initialize(self, data, k=2):
    """Build the ``k`` starting models and store them on ``self``.

    Steps, in order:
      1. one ``EMModel`` per component, with ``mu`` taken from
         ``mu_cheat(hw3.transpose_array(data), k)``;
      2. ``self.labels`` assigned via ``self.assign_labels`` and then
         passed through ``self.prevent_empty``;
      3. each model gets ``sigma = hw3.get_covar(data)`` (computed on the
         full data set, not on the component's own rows), a uniform
         ``weight`` of ``1 / k`` and a ``likelihood`` from
         ``self.expectation``.

    Side effects:
        sets ``self.k``, ``self.labels`` and ``self.models``.

    Args:
        data: the data set the models are initialised from.
        k: number of components (default 2).
    """
    self.k = k
    # Placeholder labels; replaced by assign_labels() below.
    self.labels = list(range(self.k))
    models = [EMModel() for _ in range(self.k)]

    # Starting means come from mu_cheat instead of a random draw.
    mucheat = mu_cheat(hw3.transpose_array(data), k)
    for ki, model in enumerate(models):
        model.mu = mucheat[ki]

    self.labels = self.assign_labels(data, models)
    self.prevent_empty(data)

    for model in models:
        # get_covar is deliberately called once per model so every model
        # owns its own sigma object and later updates cannot alias.
        model.sigma = hw3.get_covar(data)
        # Uniform starting weight; equals the former constant .5 for k == 2
        # and keeps the weights summing to 1 for any other k.
        model.weight = 1.0 / len(models)
        model.likelihood = self.expectation(data, model)

    self.models = models