def test_in_box(self):
    """Check the indices returned by ``sdutil._in_box``.

    Exercises a single integer column, two integer columns, and a mixed
    float / integer / categorical (object) record array.
    """
    # builtin int / float / object are used as dtypes: the np.int,
    # np.float and np.object aliases were removed in NumPy 1.24
    dtype = [("a", int)]
    x = np.array([(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)], dtype=dtype)
    boxlim = np.array([(1,), (8,)], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # the tighter upper limit on "b" excludes row 8
    dtype = [("a", int), ("b", int)]
    x = np.array(
        [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)],
        dtype=dtype,
    )
    boxlim = np.array([(1, 0), (8, 7)], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # mixed dtypes: the object column holds a set of allowed categories
    dtype = [("a", float), ("b", int), ("c", object)]
    x = np.array(
        [
            (0.1, 0, "a"),
            (1.1, 1, "a"),
            (2.1, 2, "b"),
            (3.1, 3, "b"),
            (4.1, 4, "c"),
            (5.1, 5, "c"),
            (6.1, 6, "d"),
            (7.1, 7, "d"),
            (8.1, 8, "e"),
            (9.1, 9, "e"),
        ],
        dtype=dtype,
    )
    boxlim = np.array([(1.2, 0, {"a", "b"}), (8.0, 7, {"a", "b"})], dtype=dtype)
    correct_result = np.array([2, 3], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # a box spanning the full data range must select every row
    boxlim = np.array(
        [(0.1, 0, {"a", "b", "c", "d", "e"}), (9.1, 9, {"a", "b", "c", "d", "e"})],
        dtype=dtype,
    )
    correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))
def _regression_stats(self, box, box_init):
    '''Summarize the outcomes of the data points that fall inside a box.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        with the keys 'mean' (mean of y inside the box), 'mass'
        (fraction of all data points inside the box) and 'res dim'
        (result of ``sdutil._determine_nr_restricted_dims``)

    '''
    in_box = sdutil._in_box(self.x, box)
    selected_y = self.y[in_box]

    mean = np.mean(selected_y)
    mass = selected_y.shape[0]/self.y.shape[0]
    res_dim = sdutil._determine_nr_restricted_dims(box, box_init)

    return {'mean': mean, 'mass': mass, 'res dim': res_dim}
def _regression_stats(self, box, box_init):
    '''Compute the box statistics used for a regression type outcome.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        'mean' of the outcomes inside the box, 'mass' as the fraction
        of data points inside the box, and 'res dim' as returned by
        ``sdutil._determine_nr_restricted_dims``

    '''
    y_selected = self.y[sdutil._in_box(self.x, box)]

    stats = {}
    stats['mean'] = np.mean(y_selected)
    stats['mass'] = y_selected.shape[0] / self.y.shape[0]
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    return stats
def _binary_stats(self, box, box_init):
    '''Summarize a box for an outcome that is summed to count cases of
    interest.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        'coverage' (sum of y inside the box over the sum of all y),
        'density' (sum of y inside the box over the number of points
        inside the box), 'res dim' (result of
        ``sdutil._determine_nr_restricted_dims``) and 'mass' (fraction
        of all data points inside the box)

    '''
    in_box = sdutil._in_box(self.x, box)
    selected_y = self.y[in_box]
    coi_in_box = np.sum(selected_y)

    coverage = coi_in_box/np.sum(self.y)
    density = coi_in_box/selected_y.shape[0]
    res_dim = sdutil._determine_nr_restricted_dims(box, box_init)
    mass = selected_y.shape[0]/self.y.shape[0]

    return {'coverage': coverage,
            'density': density,
            'res dim': res_dim,
            'mass': mass}
def test_in_box(self):
    '''Check the indices returned by ``sdutil._in_box``.

    Exercises a single integer column, two integer columns, and a mixed
    float / integer / categorical (object) record array.
    '''
    # builtin int / float / object are used as dtypes: the np.int,
    # np.float and np.object aliases were removed in NumPy 1.24
    dtype = [('a', int)]
    x = np.array([(0, ), (1, ), (2, ), (3, ), (4, ), (5, ), (6, ), (7, ),
                  (8, ), (9, )],
                 dtype=dtype)
    boxlim = np.array([(1, ), (8, )], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # the tighter upper limit on 'b' excludes row 8
    dtype = [('a', int), ('b', int)]
    x = np.array([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6),
                  (7, 7), (8, 8), (9, 9)],
                 dtype=dtype)
    boxlim = np.array([(1, 0), (8, 7)], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # mixed dtypes: the object column holds a set of allowed categories
    dtype = [('a', float), ('b', int), ('c', object)]
    x = np.array([(0.1, 0, 'a'), (1.1, 1, 'a'), (2.1, 2, 'b'),
                  (3.1, 3, 'b'), (4.1, 4, 'c'), (5.1, 5, 'c'),
                  (6.1, 6, 'd'), (7.1, 7, 'd'), (8.1, 8, 'e'),
                  (9.1, 9, 'e')],
                 dtype=dtype)
    boxlim = np.array([(1.2, 0, {'a', 'b'}), (8.0, 7, {'a', 'b'})],
                      dtype=dtype)
    correct_result = np.array([2, 3], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # a box spanning the full data range must select every row
    boxlim = np.array([(0.1, 0, {'a', 'b', 'c', 'd', 'e'}),
                       (9.1, 9, {'a', 'b', 'c', 'd', 'e'})],
                      dtype=dtype)
    correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))
def _binary_stats(self, box, box_init):
    '''Compute coverage, density, restricted dimensions and mass of a box.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        keyed by 'coverage', 'density', 'res dim' and 'mass'

    '''
    y_selected = self.y[sdutil._in_box(self.x, box)]
    n_selected = y_selected.shape[0]

    # sum of the outcomes inside the box
    coi_selected = np.sum(y_selected)

    stats = {}
    stats['coverage'] = coi_selected / np.sum(self.y)
    stats['density'] = coi_selected / n_selected
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    stats['mass'] = n_selected / self.y.shape[0]
    return stats
def _calculate_quasi_p(self, i):
    '''helper function for calculating quasi-p values as discussed in
    Bryant and Lempert (2010). This is a one sided binomial test.

    Parameters
    ----------
    i : int
        the specific box in the peeling trajectory for which the quasi-p
        values are to be calculated.

    Returns
    -------
    dict
        maps each restricted dimension of the box to its quasi-p value

    '''
    box_lim = self.box_lims[i]
    restricted_dims = list(sdutil._determine_restricted_dims(
        box_lim, self.prim.box_init))

    # total nr. of cases in box. The product of a fraction and a count
    # is a float that can land just below the true integer (for example
    # 0.29 * 100 == 28.999999999999996), so round to the nearest integer
    # rather than truncate.
    Tbox = int(round(self.peeling_trajectory['mass'][i] * self.prim.n))

    # total nr. of cases of interest in box, rounded for the same reason
    Hbox = int(round(self.peeling_trajectory['coverage'][i] *
                     self.prim.t_coi))

    qp_values = {}
    for u in restricted_dims:
        # same box, but with the restriction on u replaced by the limits
        # of the first box in the trajectory
        temp_box = copy.deepcopy(box_lim)
        temp_box[u] = self.box_lims[0][u]

        indices = sdutil._in_box(self.prim.x[self.prim.yi_remaining],
                                 temp_box)
        indices = self.prim.yi_remaining[indices]

        # total nr. of cases in box with one restriction removed
        Tj = indices.shape[0]

        # total nr. of cases of interest in box with one restriction
        # removed
        Hj = np.sum(self.prim.y[indices])

        p = Hj/Tj

        # survival function at Hbox-1 gives P(X >= Hbox)
        qp = binom.sf(Hbox-1, Tbox, p)
        qp_values[u] = qp

    return qp_values
def drop_restriction(self, uncertainty):
    '''
    remove the restriction on a single dimension of the current box.
    The limits of the chosen box are copied, and for the specified
    uncertainty they are overwritten with the limits of the first box
    in ``box_lims``. The resulting box is handed to ``update``.

    Parameters
    ----------
    uncertainty : string

    '''
    relaxed_lim = copy.deepcopy(self.box_lim)
    first_limits = self.box_lims[0][uncertainty]
    relaxed_lim[uncertainty][:] = first_limits[:]

    remaining = self.prim.yi_remaining
    in_box = sdutil._in_box(self.prim.x[remaining], relaxed_lim)
    self.update(relaxed_lim, remaining[in_box])
def select(self, i):
    '''
    make the i-th box of the peeling and pasting trajectory the current
    box and recompute which of the remaining data points it contains.

    Parameters
    ----------
    i : int
        the index of the box to select.

    Raises
    ------
    PrimException
        if the box has been frozen

    '''
    if self._frozen:
        message = """box has been frozen because PRIM has found at least one more recent box"""
        raise PrimException(message)

    remaining = self.prim.yi_remaining
    in_box = sdutil._in_box(self.prim.x[remaining], self.box_lims[i])
    self.yi = remaining[in_box]
    self._cur_box = i
def _categorical_paste(self, box, u):
    '''
    Build one candidate paste for every category of ``u`` that is in
    the initial box but not in the most recent box limits.

    Parameters
    ----------
    box : a PrimBox instance
    u : string
        the uncertainty for which to paste

    Returns
    -------
    list
        (indices, box lim) tuples, one per category that can be added;
        empty if no category can be added

    '''
    box_lim = box.box_lims[-1]
    current = box_lim[u][0]
    available = self.box_init[u][0]

    if len(current) >= len(available):
        # no pastes possible, return empty list
        return []

    candidates = []
    for category in available - current:
        pasted_lim = np.copy(box_lim)

        # current categories plus the one being pasted back in
        enlarged = copy.deepcopy(current)
        enlarged.add(category)
        pasted_lim[u][:] = enlarged

        in_box = sdutil._in_box(self.x[self.yi_remaining], pasted_lim)
        candidates.append((self.yi_remaining[in_box], pasted_lim))
    return candidates
def _classification_stats(self, box, box_init):
    '''Summarize the class composition of the data points inside a box.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        'gini' (1 minus the sum of squared class fractions inside the
        box), 'mass' (fraction of all data points inside the box),
        'box_composition' (count per class, in sorted class order) and
        'res dim' (result of ``sdutil._determine_nr_restricted_dims``)

    '''
    in_box = sdutil._in_box(self.x, box)
    y_selected = self.y[in_box]

    labels = sorted(set(self.y))
    composition = [y_selected[y_selected==label].shape[0]
                   for label in labels]

    squared_fractions = sum((count/y_selected.shape[0])**2
                            for count in composition)
    gini = 1 - squared_fractions

    return {'gini': gini,
            'mass': y_selected.shape[0]/self.y.shape[0],
            'box_composition': composition,
            'res dim': sdutil._determine_nr_restricted_dims(box, box_init)}
def _classification_stats(self, box, box_init):
    '''Compute gini impurity, mass and class counts for a box.

    Parameters
    ----------
    box : box limits passed on to ``sdutil._in_box``
    box_init : initial box limits, used to count restricted dimensions

    Returns
    -------
    dict
        keyed by 'gini', 'mass', 'box_composition' and 'res dim'

    '''
    y_inside = self.y[sdutil._in_box(self.x, box)]

    # one count per class that occurs in y, in sorted class order
    per_class = []
    for label in sorted(set(self.y)):
        per_class.append(y_inside[y_inside == label].shape[0])

    # gini impurity: 1 minus the sum of squared class fractions
    gini = 1 - sum((n / y_inside.shape[0])**2 for n in per_class)

    stats = {}
    stats['gini'] = gini
    stats['mass'] = y_inside.shape[0] / self.y.shape[0]
    stats['box_composition'] = per_class
    stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
    return stats
def _real_paste(self, box, u):
    '''
    returns two candidate new boxes, pasted along upper and lower
    dimension

    Parameters
    ----------
    box : a PrimBox instance
    u : string
        the uncertainty for which to paste

    Returns
    -------
    list
        one (indices, box lim) tuple per direction, lower first and
        upper second

    '''
    pastes = []
    for i, direction in enumerate(['lower', 'upper']):
        box_paste = np.copy(box.box_lims[-1])

        # box containing data candidate for pasting
        paste_box = np.copy(box.box_lims[-1])

        if direction == 'upper':
            # region between the current upper limit and the initial one
            paste_box[u][0] = paste_box[u][1]
            paste_box[u][1] = self.box_init[u][1]

            indices = sdutil._in_box(self.x[self.yi_remaining], paste_box)
            data = self.x[self.yi_remaining][indices][u]

            # without candidate data the initial limit is used
            paste_value = self.box_init[u][i]
            if data.shape[0] > 0:
                paste_value = get_quantile(data, self.paste_alpha)

            assert paste_value >= box.box_lims[-1][u][i]

        elif direction == 'lower':
            # region between the initial lower limit and the current one
            paste_box[u][0] = self.box_init[u][0]
            paste_box[u][1] = box_paste[u][0]

            indices = sdutil._in_box(self.x[self.yi_remaining], paste_box)
            data = self.x[self.yi_remaining][indices][u]

            # without candidate data the initial limit is used
            paste_value = self.box_init[u][i]
            if data.shape[0] > 0:
                paste_value = get_quantile(data, 1-self.paste_alpha)

            if not paste_value <= box.box_lims[-1][u][i]:
                print("{}, {}".format(paste_value, box.box_lims[-1][u][i]))

        dtype = box_paste.dtype.fields[u][0]
        if dtype==np.int32:
            # builtin int: the np.int alias was removed in NumPy 1.24
            paste_value = int(paste_value)

        box_paste[u][i] = paste_value
        indices = sdutil._in_box(self.x[self.yi_remaining], box_paste)
        indices = self.yi_remaining[indices]

        pastes.append((indices, box_paste))

    return pastes
def test_in_box(self):
    '''Check the indices returned by ``sdutil._in_box``.

    Exercises a single integer column, two integer columns, and a mixed
    float / integer / categorical (object) record array.
    '''
    # builtin int / float / object are used as dtypes: the np.int,
    # np.float and np.object aliases were removed in NumPy 1.24
    dtype = [('a', int)]
    x = np.array([(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,),
                  (9,)], dtype=dtype)
    boxlim = np.array([(1,), (8,)], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # the tighter upper limit on 'b' excludes row 8
    dtype = [('a', int), ('b', int)]
    x = np.array([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6),
                  (7, 7), (8, 8), (9, 9)], dtype=dtype)
    boxlim = np.array([(1, 0), (8, 7)], dtype=dtype)
    correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # mixed dtypes: the object column holds a set of allowed categories
    dtype = [('a', float), ('b', int), ('c', object)]
    x = np.array([(0.1, 0, 'a'),
                  (1.1, 1, 'a'),
                  (2.1, 2, 'b'),
                  (3.1, 3, 'b'),
                  (4.1, 4, 'c'),
                  (5.1, 5, 'c'),
                  (6.1, 6, 'd'),
                  (7.1, 7, 'd'),
                  (8.1, 8, 'e'),
                  (9.1, 9, 'e')], dtype=dtype)
    boxlim = np.array([(1.2, 0, {'a', 'b'}),
                       (8.0, 7, {'a', 'b'})], dtype=dtype)
    correct_result = np.array([2, 3], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))

    # a box spanning the full data range must select every row
    boxlim = np.array([(0.1, 0, {'a', 'b', 'c', 'd', 'e'}),
                       (9.1, 9, {'a', 'b', 'c', 'd', 'e'})], dtype=dtype)
    correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
    result = sdutil._in_box(x, boxlim)
    self.assertTrue(np.all(correct_result == result))