def model_bin_train(self, data_row, truth, num_bins=2):
    """Train a binned (histogram) model with per-feature cutoffs.

    For every feature column the cutoffs are derived from that column's
    own statistics: its minimum, its mean, the mean of the values below
    the mean (``low_mu``) and the mean of the values above it
    (``high_mu``).

    Args:
        data_row: Training data as a sequence of rows (one row per
            sample). It is transposed with ``hw3.transpose_array`` to
            iterate by feature.
        truth: Sequence of 0/1 labels, one per row of ``data_row``.
        num_bins: ``4`` selects the four-cutoff layout
            ``[min, low_mu, mu, high_mu]``. Any other value (including
            the default) selects the six-cutoff layout that adds the
            midpoints between min/low_mu, mu/high_mu and high_mu/max.

    Returns:
        dict mapping each label (0 and 1) to the result of
        ``hw3.bins_per_column_by_col`` for the rows carrying that label.

    Side effects:
        Sets ``self.y_prob`` to the fraction of positive labels and
        ``self.cutoffs`` to the per-feature cutoff lists.
    """
    # TODO add epsilon
    data_col = hw3.transpose_array(data_row)

    cutoffsc = []
    for column in data_col:
        values = np.asarray(column)
        # Bounds must come from this feature only; features can live on
        # very different scales.
        col_min = values.min()
        col_max = values.max()
        mu = values.mean()

        below = values[values < mu]
        above = values[values > mu]
        # A constant column has nothing strictly below/above its mean;
        # fall back to the mean instead of producing NaN cutoffs.
        low_mu = below.mean() if below.size else mu
        high_mu = above.mean() if above.size else mu

        if num_bins == 4:
            cutoffsc.append([col_min, low_mu, mu, high_mu])
        else:
            # True midpoints ((a + b) / 2) keep the cutoffs in
            # increasing order: min <= mid <= mu <= mid <= high_mu <= mid.
            cutoffsc.append([
                col_min,
                (col_min + low_mu) / 2.0,
                mu,
                (mu + high_mu) / 2.0,
                high_mu,
                (high_mu + col_max) / 2.0,
            ])

    #epsilon = float(alpha * 1) / len(covar_matrix)
    model = {}
    for label in [0, 1]:
        # transpose to go by column
        sub_data = hw3.transpose_array(hw3.get_sub_at_value(data_row, truth, label))
        # probability of bin given label
        model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)

    self.y_prob = float(sum(truth)) / len(truth)
    self.cutoffs = cutoffsc
    return model
 def model_gaussian_rand_var_train(self, data, truth):
     """Fit per-label Gaussian parameters for each feature.

     For label 0 and label 1 the matching rows are selected with
     ``hw3.get_sub_at_value`` and summarised with ``hw3.get_mus`` and
     ``hw3.get_std_dev``.

     Returns:
         ``[mus, std_dev, prior]`` where ``mus`` and ``std_dev`` are
         dicts keyed by label and ``prior`` is ``sum(truth) / len(truth)``.

     Side effects:
         Stores the same prior on ``self.y_prob``.
     """
     means, deviations = {}, {}
     for cls in (0, 1):
         rows = hw3.get_sub_at_value(data, truth, cls)
         means[cls] = hw3.get_mus(rows)
         deviations[cls] = hw3.get_std_dev(rows)

     prior = float(sum(truth)) / len(truth)
     self.y_prob = prior
     return [means, deviations, prior]
    def model_average_train(self, data_row, truth):
        """Train the "above the column mean" model.

        The column means of the full data set (``hw3.get_mus``) are the
        thresholds; ``get_prob_over`` is evaluated against them for all
        rows, for the label-1 rows and for the label-0 rows.

        Returns:
            ``[prob_over_given_1, prob_over_given_0, prob_over, prob_y1]``.
            Per the original notes each ``prob_over*`` value presumably
            holds one ``[mu, var, probability]`` entry per column --
            verify against ``get_prob_over``. ``prob_y1`` is
            ``sum(truth) / len(truth)``.

        Side effects:
            Stores ``prob_y1`` on ``self.y_prob``.
        """
        column_means = hw3.get_mus(data_row)
        negatives = hw3.get_sub_at_value(data_row, truth, 0)
        positives = hw3.get_sub_at_value(data_row, truth, 1)

        prob_over = get_prob_over(data_row, column_means)
        prob_over_given_1 = get_prob_over(positives, column_means)
        prob_over_given_0 = get_prob_over(negatives, column_means)

        # When the two per-label results differ in length, the shorter
        # one is replaced outright (not padded) by a list of zeros whose
        # length is the difference between the two.
        gap = len(prob_over_given_1) - len(prob_over_given_0)
        if gap > 0:
            prob_over_given_0 = [0] * gap
        elif gap < 0:
            prob_over_given_1 = [0] * (-gap)

        prob_y1 = float(sum(truth)) / len(truth)
        self.y_prob = prob_y1

        return [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
Ejemplo n.º 4
0
    def initialize(self, data, k=2):
        """Set up ``k`` mixture components before the first EM iteration.

        Steps, in order:
          1. Create ``k`` ``EMModel`` objects and seed each mean from
             ``mu_cheat`` (called on the transposed data).
          2. Assign every sample a label with ``self.assign_labels`` and
             call ``self.prevent_empty``.
          3. Give every component the covariance of the full data set,
             a uniform mixing weight and its initial likelihood from
             ``self.expectation``.

        Args:
            data: Training samples, one row per sample.
            k: Number of mixture components (defaults to 2).

        Side effects:
            Sets ``self.k``, ``self.labels`` and ``self.models``.
        """
        self.k = k
        self.labels = list(range(self.k))
        models = [EMModel() for _ in range(self.k)]

        mucheat = mu_cheat(hw3.transpose_array(data), k)
        for ki, model in enumerate(models):
            #models[ki].random_mus(data)
            model.mu = mucheat[ki]

        self.labels = self.assign_labels(data, models)
        #self.labels = self.assign_labels2(data, model)

        self.prevent_empty(data)

        for ki in range(self.k):
            # Every component starts from the covariance of the whole
            # data set rather than of its own cluster. get_covar is
            # called per component so the models do not share one
            # sigma object.
            models[ki].sigma = hw3.get_covar(data)
            # Uniform prior over components: the weights sum to 1 for
            # any k (this is 0.5 for the default k=2).
            models[ki].weight = 1.0 / self.k
            models[ki].likelihood = self.expectation(data, models[ki])  # multivarate_normal
        self.models = models