def test_determine_nr_restricted_dims(self):
    """Check the restricted-dimension count for equal and differing boxes."""
    # ``np.float`` was merely an alias of the builtin ``float`` and has been
    # removed from NumPy, so the builtin is used for both fields.
    dtype = [("a", float), ("b", float)]

    x = np.random.rand(10)
    x = np.asarray(x, dtype=dtype)

    # all dimensions the same: comparing a box with itself yields 0
    box_init = sdutil._make_box(x)
    n = sdutil._determine_nr_restricted_dims(box_init, box_init)
    self.assertEqual(n, 0)

    # hand-made box limits compared against the initial box
    # NOTE(review): the original comment read "dimensions 1 different and
    # dimension 2 the same", yet the asserted count is 2 -- confirm which
    # of the two is intended.
    b = np.array([(1, 1), (0, 1)], dtype=dtype)
    n = sdutil._determine_nr_restricted_dims(b, box_init)
    self.assertEqual(n, 2)
def test_determine_nr_restricted_dims(self):
    '''Check the restricted-dimension count for equal and differing boxes.'''
    # ``np.float`` was merely an alias of the builtin ``float`` and has been
    # removed from NumPy, so the builtin is used for both fields.
    dtype = [('a', float), ('b', float)]

    x = np.random.rand(10, )
    x = np.asarray(x, dtype=dtype)

    # all dimensions the same: comparing a box with itself yields 0
    box_init = sdutil._make_box(x)
    n = sdutil._determine_nr_restricted_dims(box_init, box_init)
    self.assertEqual(n, 0)

    # hand-made box limits compared against the initial box
    # NOTE(review): the original comment read "dimensions 1 different and
    # dimension 2 the same", yet the asserted count is 2 -- confirm which
    # of the two is intended.
    b = np.array([(1, 1), (0, 1)], dtype=dtype)
    n = sdutil._determine_nr_restricted_dims(b, box_init)
    self.assertEqual(n, 2)
def update(self, box_lims, indices):
    '''
    update the box to the provided box limits.

    Stores the new limits and indices, appends one row of statistics
    (coverage, density, mean, res dim, mass) to the peeling trajectory
    and makes that newest row the current box.

    Parameters
    ----------
    box_lims: structured numpy array
              the new box_lims
    indices: numpy array
             the indices of y that are inside the box

    '''
    self.yi = indices

    y = self.prim.y[self.yi]

    self.box_lims.append(box_lims)

    coi = self.prim.determine_coi(self.yi)

    data = {'coverage': coi/self.prim.t_coi,
            'density': coi/y.shape[0],
            'mean': np.mean(y),
            'res dim': sdutil._determine_nr_restricted_dims(self.box_lims[-1],
                                                            self.prim.box_init),
            'mass': y.shape[0]/self.prim.n}
    new_row = pd.DataFrame([data])

    # DataFrame.append no longer exists in pandas >= 2.0; pd.concat with
    # ignore_index=True produces the same re-indexed result.
    self.peeling_trajectory = pd.concat([self.peeling_trajectory, new_row],
                                        ignore_index=True)

    # the row that was just added becomes the current box
    self._cur_box = len(self.peeling_trajectory)-1
def _regression_stats(self, box, box_init):
    '''
    Compute summary statistics for the outcomes that fall inside ``box``.

    Parameters
    ----------
    box : the box limits handed to ``sdutil._in_box``
    box_init : the box limits ``box`` is compared against when counting
               restricted dimensions

    Returns
    -------
    dict
        ``'mean'`` of the selected outcomes, ``'mass'`` (selected count
        divided by total count) and ``'res dim'`` (number of restricted
        dimensions as reported by sdutil).

    '''
    inside = sdutil._in_box(self.x, box)
    selected = self.y[inside]

    stats = {}
    stats['mean'] = np.mean(selected)
    stats['mass'] = selected.shape[0]/self.y.shape[0]
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    return stats
def _regression_stats(self, box, box_init):
    '''
    Describe the outcomes inside ``box`` with a few summary numbers.

    Parameters
    ----------
    box : the box limits used to select rows of ``self.x``
    box_init : the reference box limits for counting restricted dimensions

    Returns
    -------
    dict
        keys ``'mean'``, ``'mass'`` and ``'res dim'``

    '''
    in_box = sdutil._in_box(self.x, box)
    y_selected = self.y[in_box]

    # average outcome inside the box
    mean = np.mean(y_selected)
    # share of all observations that ended up inside the box
    mass = y_selected.shape[0] / self.y.shape[0]
    # number of dimensions on which box deviates from box_init
    res_dim = sdutil._determine_nr_restricted_dims(box, box_init)

    return {'mean': mean, 'mass': mass, 'res dim': res_dim}
def _binary_stats(self, box, box_init):
    '''
    Compute coverage, density, restricted dimensions and mass for ``box``.

    Parameters
    ----------
    box : the box limits handed to ``sdutil._in_box``
    box_init : the box limits ``box`` is compared against when counting
               restricted dimensions

    Returns
    -------
    dict
        ``'coverage'`` (sum of y inside the box over the sum of all y),
        ``'density'`` (sum of y inside the box over the number of points
        inside), ``'res dim'`` and ``'mass'`` (points inside over all
        points).

    '''
    inside = sdutil._in_box(self.x, box)
    selected = self.y[inside]

    # presumably y is 0/1 coded so the sum counts cases of interest
    # -- TODO confirm
    hits = np.sum(selected)

    stats = {}
    stats['coverage'] = hits/np.sum(self.y)
    stats['density'] = hits/selected.shape[0]
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    stats['mass'] = selected.shape[0]/self.y.shape[0]
    return stats
def _peel(self, box):
    '''

    Executes the peeling phase of the PRIM algorithm. Delegates peeling
    to data type specific helper methods.

    Every candidate peel is scored with ``self.obj_func``; the best one is
    applied when it keeps the mass at or above ``self.mass_min``, strictly
    reduces the mass and has a positive objective score. Peeling then
    continues recursively on the updated box.

    Parameters
    ----------
    box : the box to peel; it is updated in place through ``box.update``

    Returns
    -------
    the box that was passed in, after all accepted peels were applied

    '''
    mass_old = box.yi.shape[0]/self.n

    x = self.x[box.yi]

    # identify all possible peels
    possible_peels = []
    for entry in x.dtype.descr:
        u = entry[0]
        dtype = x.dtype.fields.get(u)[0].name
        peels = self._peels[dtype](self, box, u, x)
        # extend instead of a throwaway list comprehension of appends
        possible_peels.extend(peels)

    if not possible_peels:
        # there is no peel identified, so return box
        return box

    # determine the scores for each peel in order
    # to identify the next candidate box
    nr_dims = len(x.dtype.descr)
    scores = []
    for i, box_lim in possible_peels:
        obj = self.obj_func(self, self.y[box.yi], self.y[i])
        non_res_dim = nr_dims - \
            sdutil._determine_nr_restricted_dims(box_lim, self.box_init)
        scores.append((obj, non_res_dim, box_lim, i))

    # best objective first; ties are broken in favour of the candidate
    # that leaves more dimensions unrestricted
    scores.sort(key=itemgetter(0, 1), reverse=True)
    obj_score, _, box_new, indices = scores[0]

    mass_new = self.y[indices].shape[0]/self.n

    if mass_new >= self.mass_min and mass_new < mass_old and obj_score > 0:
        box.update(box_new, indices)
        return self._peel(box)

    # no acceptable peel, so return the received box
    return box
def _binary_stats(self, box, box_init):
    '''
    Summarise how well ``box`` captures the outcomes in ``self.y``.

    Parameters
    ----------
    box : the box limits used to select rows of ``self.x``
    box_init : the reference box limits for counting restricted dimensions

    Returns
    -------
    dict
        keys ``'coverage'``, ``'density'``, ``'res dim'`` and ``'mass'``

    '''
    in_box = sdutil._in_box(self.x, box)
    y_selected = self.y[in_box]
    coi_in_box = np.sum(y_selected)

    # share of the overall sum of y that lies inside the box
    coverage = coi_in_box / np.sum(self.y)
    # sum of y inside the box per point inside the box
    density = coi_in_box / y_selected.shape[0]
    # number of dimensions on which box deviates from box_init
    res_dim = sdutil._determine_nr_restricted_dims(box, box_init)
    # share of all points that lie inside the box
    mass = y_selected.shape[0] / self.y.shape[0]

    return {
        'coverage': coverage,
        'density': density,
        'res dim': res_dim,
        'mass': mass,
    }
def _paste(self, box):
    ''' Executes the pasting phase of the PRIM. Delegates pasting to data
    type specific helper methods.

    Only dimensions that are already restricted are considered. The best
    scoring paste is applied when it keeps the mass at or above
    ``self.mass_min``, strictly increases the mass and does not lower the
    mean of y. Pasting then continues recursively on the updated box.

    Parameters
    ----------
    box : the box to paste; it is updated in place through ``box.update``

    Returns
    -------
    the box that was passed in, after all accepted pastes were applied

    '''
    x = self.x[self.yi_remaining]

    mass_old = box.yi.shape[0]/self.n

    res_dim = sdutil._determine_restricted_dims(box.box_lims[-1],
                                                self.box_init)

    # identify all possible pastes
    possible_pastes = []
    for u in res_dim:
        debug("pasting "+u)
        dtype = self.x.dtype.fields.get(u)[0].name
        pastes = self._pastes[dtype](self, box, u)
        # extend instead of a throwaway list comprehension of appends
        possible_pastes.extend(pastes)

    if not possible_pastes:
        # there is no paste identified, so return box
        return box

    # determine the scores for each paste in order
    # to identify the next candidate box
    nr_dims = len(x.dtype.descr)
    scores = []
    for i, box_lim in possible_pastes:
        obj = self.obj_func(self, self.y[box.yi], self.y[i])
        non_res_dim = nr_dims - \
            sdutil._determine_nr_restricted_dims(box_lim, self.box_init)
        scores.append((obj, non_res_dim, box_lim, i))

    # best objective first; ties are broken in favour of the candidate
    # that leaves more dimensions unrestricted
    scores.sort(key=itemgetter(0, 1), reverse=True)
    box_new, indices = scores[0][2:]

    mass_new = self.y[indices].shape[0]/self.n

    mean_old = np.mean(self.y[box.yi])
    mean_new = np.mean(self.y[indices])

    if mass_new >= self.mass_min and mass_new > mass_old and \
            mean_old <= mean_new:
        box.update(box_new, indices)
        return self._paste(box)

    # no acceptable paste, so return the received box
    return box
def _classification_stats(self, box, box_init):
    '''
    Compute the gini impurity and class composition of ``box``.

    Parameters
    ----------
    box : the box limits handed to ``sdutil._in_box``
    box_init : the box limits ``box`` is compared against when counting
               restricted dimensions

    Returns
    -------
    dict
        ``'gini'`` (1 minus the sum of squared class shares inside the
        box), ``'mass'`` (points inside over all points),
        ``'box_composition'`` (per-class counts inside the box, ordered by
        sorted class label) and ``'res dim'``.

    '''
    inside = sdutil._in_box(self.x, box)
    selected = self.y[inside]

    # the classes are taken from all of y, not only from the box, so the
    # composition always has one entry per class
    labels = sorted(set(self.y))
    counts = [selected[selected == label].shape[0] for label in labels]

    gini = 1 - sum((count/selected.shape[0])**2 for count in counts)

    stats = {}
    stats['gini'] = gini
    stats['mass'] = selected.shape[0]/self.y.shape[0]
    stats['box_composition'] = counts
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    return stats
def _classification_stats(self, box, box_init):
    '''
    Describe the class mix of the outcomes that fall inside ``box``.

    Parameters
    ----------
    box : the box limits used to select rows of ``self.x``
    box_init : the reference box limits for counting restricted dimensions

    Returns
    -------
    dict
        keys ``'gini'``, ``'mass'``, ``'box_composition'`` and ``'res dim'``

    '''
    in_box = sdutil._in_box(self.x, box)
    y_selected = self.y[in_box]

    # one count per class present anywhere in y, in sorted label order
    counts = []
    for label in sorted(set(self.y)):
        members = y_selected[y_selected == label]
        counts.append(members.shape[0])

    # gini impurity: 1 - sum of squared class shares inside the box
    squared_shares = 0
    for count in counts:
        share = count / y_selected.shape[0]
        squared_shares += share**2
    gini = 1 - squared_shares

    mass = y_selected.shape[0] / self.y.shape[0]
    res_dim = sdutil._determine_nr_restricted_dims(box, box_init)

    return {
        'gini': gini,
        'mass': mass,
        'box_composition': counts,
        'res dim': res_dim,
    }