def test_in_box(self):
        """Check ``sdutil._in_box`` against hand-computed expectations.

        Four cases are exercised: a single integer field, two integer
        fields, and a float/int/categorical mix with a narrow and with an
        all-encompassing set of limits. In every case the expected value
        lists the positions of the rows of ``x`` that satisfy the limits
        (the expectations treat both limits as inclusive).
        """
        # The builtin types replace the ``np.int`` / ``np.float`` /
        # ``np.object`` aliases, which were removed in NumPy 1.24 and raise
        # AttributeError there. The resulting dtypes are identical.
        dtype = [("a", int)]
        x = np.array([(0,), (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,)], dtype=dtype)
        boxlim = np.array([(1,), (8,)], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # two integer fields: the tighter upper limit on "b" excludes row 8
        dtype = [("a", int), ("b", int)]
        x = np.array([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)], dtype=dtype)
        boxlim = np.array([(1, 0), (8, 7)], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # mixed fields: the categorical limit is a set of allowed values
        dtype = [("a", float), ("b", int), ("c", object)]
        x = np.array(
            [
                (0.1, 0, "a"),
                (1.1, 1, "a"),
                (2.1, 2, "b"),
                (3.1, 3, "b"),
                (4.1, 4, "c"),
                (5.1, 5, "c"),
                (6.1, 6, "d"),
                (7.1, 7, "d"),
                (8.1, 8, "e"),
                (9.1, 9, "e"),
            ],
            dtype=dtype,
        )
        boxlim = np.array([(1.2, 0, {"a", "b"}), (8.0, 7, {"a", "b"})], dtype=dtype)
        correct_result = np.array([2, 3], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))

        # limits that span the full data set must select every row
        all_categories = {"a", "b", "c", "d", "e"}
        boxlim = np.array(
            [(0.1, 0, set(all_categories)), (9.1, 9, set(all_categories))], dtype=dtype
        )
        correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))
Ejemplo n.º 2
0
 def _regression_stats(self, box, box_init):
     '''Compute summary statistics for the outcomes inside ``box``.

     Parameters
     ----------
     box : the box limits handed to ``sdutil._in_box``
     box_init : the reference limits handed, together with ``box``, to
                ``sdutil._determine_nr_restricted_dims``

     Returns
     -------
     dict
         with the keys 'mean' (mean of the selected outcomes), 'mass'
         (selected outcomes divided by all outcomes) and 'res dim'

     '''
     selected = self.y[sdutil._in_box(self.x, box)]

     stats = {}
     stats['mean'] = np.mean(selected)
     stats['mass'] = selected.shape[0]/self.y.shape[0]
     stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
     return stats
Ejemplo n.º 3
0
    def _regression_stats(self, box, box_init):
        '''Compute summary statistics for the outcomes inside ``box``.

        Parameters
        ----------
        box : the box limits handed to ``sdutil._in_box``
        box_init : the reference limits handed, together with ``box``, to
                   ``sdutil._determine_nr_restricted_dims``

        Returns
        -------
        dict
            with the keys 'mean' (mean of the selected outcomes), 'mass'
            (selected outcomes divided by all outcomes) and 'res dim'

        '''
        in_box = sdutil._in_box(self.x, box)
        selected = self.y[in_box]

        mean = np.mean(selected)
        mass = selected.shape[0] / self.y.shape[0]
        res_dim = sdutil._determine_nr_restricted_dims(box, box_init)

        return {'mean': mean, 'mass': mass, 'res dim': res_dim}
Ejemplo n.º 4
0
 def _binary_stats(self, box, box_init):
     '''Compute coverage, density, restricted dimensions and mass of ``box``.

     Parameters
     ----------
     box : the box limits handed to ``sdutil._in_box``
     box_init : the reference limits handed, together with ``box``, to
                ``sdutil._determine_nr_restricted_dims``

     Returns
     -------
     dict
         with the keys 'coverage' (sum of the selected outcomes divided by
         the sum of all outcomes), 'density' (sum of the selected outcomes
         divided by their number), 'res dim' and 'mass' (number of
         selected outcomes divided by the number of all outcomes)

     '''
     selected = self.y[sdutil._in_box(self.x, box)]
     coi_in_box = np.sum(selected)

     stats = {}
     stats['coverage'] = coi_in_box/np.sum(self.y)
     stats['density'] = coi_in_box/selected.shape[0]
     stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
     stats['mass'] = selected.shape[0]/self.y.shape[0]
     return stats
Ejemplo n.º 5
0
    def test_in_box(self):
        '''Check ``sdutil._in_box`` against hand-computed expectations.

        Four cases are exercised: a single integer field, two integer
        fields, and a float/int/categorical mix with a narrow and with an
        all-encompassing set of limits. In every case the expected value
        lists the positions of the rows of ``x`` that satisfy the limits
        (the expectations treat both limits as inclusive).
        '''
        # The builtin types replace the ``np.int`` / ``np.float`` /
        # ``np.object`` aliases, which were removed in NumPy 1.24 and raise
        # AttributeError there. The resulting dtypes are identical.
        dtype = [('a', int)]
        x = np.array([(0, ), (1, ), (2, ), (3, ), (4, ), (5, ), (6, ), (7, ),
                      (8, ), (9, )],
                     dtype=dtype)
        boxlim = np.array([(1, ), (8, )], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # two integer fields: the tighter upper limit on 'b' excludes row 8
        dtype = [('a', int), ('b', int)]
        x = np.array([(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6),
                      (7, 7), (8, 8), (9, 9)],
                     dtype=dtype)
        boxlim = np.array([(1, 0), (8, 7)], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # mixed fields: the categorical limit is a set of allowed values
        dtype = [('a', float), ('b', int), ('c', object)]
        x = np.array([(0.1, 0, 'a'), (1.1, 1, 'a'), (2.1, 2, 'b'),
                      (3.1, 3, 'b'), (4.1, 4, 'c'), (5.1, 5, 'c'),
                      (6.1, 6, 'd'), (7.1, 7, 'd'), (8.1, 8, 'e'),
                      (9.1, 9, 'e')],
                     dtype=dtype)
        boxlim = np.array([(1.2, 0, {'a', 'b'}),
                           (8.0, 7, {'a', 'b'})],
                          dtype=dtype)
        correct_result = np.array([2, 3], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))

        # limits that span the full data set must select every row
        boxlim = np.array([(0.1, 0, {'a', 'b', 'c', 'd', 'e'}),
                           (9.1, 9, {'a', 'b', 'c', 'd', 'e'})],
                          dtype=dtype)
        correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))
Ejemplo n.º 6
0
    def _binary_stats(self, box, box_init):
        '''Compute coverage, density, restricted dimensions and mass of ``box``.

        Parameters
        ----------
        box : the box limits handed to ``sdutil._in_box``
        box_init : the reference limits handed, together with ``box``, to
                   ``sdutil._determine_nr_restricted_dims``

        Returns
        -------
        dict
            with the keys 'coverage' (sum of the selected outcomes divided
            by the sum of all outcomes), 'density' (sum of the selected
            outcomes divided by their number), 'res dim' and 'mass' (number
            of selected outcomes divided by the number of all outcomes)

        '''
        in_box = sdutil._in_box(self.x, box)
        selected = self.y[in_box]
        coi_in_box = np.sum(selected)

        coverage = coi_in_box / np.sum(self.y)
        density = coi_in_box / selected.shape[0]
        res_dim = sdutil._determine_nr_restricted_dims(box, box_init)
        mass = selected.shape[0] / self.y.shape[0]

        return {
            'coverage': coverage,
            'density': density,
            'res dim': res_dim,
            'mass': mass
        }
Ejemplo n.º 7
0
 def _calculate_quasi_p(self, i):
     '''helper function for calculating quasi-p values as discussed in
     Bryant and Lempert (2010). This is a one sided binomial test.

     For every restricted dimension of box ``i`` the restriction on that
     dimension is replaced by the limits of the first box in
     ``self.box_lims``; the share of cases of interest in that relaxed
     box serves as the success probability of the binomial test.

     Parameters
     ----------
     i: int
        the specific box in the peeling trajectory for which the quasi-p
        values are to be calculated.

     Returns
     -------
     dict
         maps each restricted dimension to its quasi-p value; empty when
         the box has no restricted dimensions

     '''

     box_lim = self.box_lims[i]
     restricted_dims = list(sdutil._determine_restricted_dims(box_lim,
                                                        self.prim.box_init))

     qp_values = {}
     if not restricted_dims:
         # nothing to test; skip the count conversion below entirely
         return qp_values

     # Total nr. of cases in box and total nr. of cases of interest in
     # box. Both are rebuilt from stored fractions, so the product can land
     # just below the true count (e.g. 28.999999999999996). Rounding to
     # the nearest integer avoids the off-by-one that plain int()
     # truncation would introduce.
     # NOTE(review): assumes 'mass' and 'coverage' are count ratios with
     # respect to prim.n and prim.t_coi -- confirm.
     Tbox = int(round(self.peeling_trajectory['mass'][i] * self.prim.n))
     Hbox = int(round(self.peeling_trajectory['coverage'][i] *
                      self.prim.t_coi))

     for u in restricted_dims:
         temp_box = copy.deepcopy(box_lim)
         temp_box[u] = self.box_lims[0][u]

         indices = sdutil._in_box(self.prim.x[self.prim.yi_remaining],
                                  temp_box)
         indices = self.prim.yi_remaining[indices]

         # total nr. of cases in box with one restriction removed
         Tj = indices.shape[0]

         # total nr. of cases of interest in box with one restriction
         # removed
         Hj = np.sum(self.prim.y[indices])

         p = Hj/Tj

         # survival function at Hbox-1 gives P(X >= Hbox)
         qp_values[u] = binom.sf(Hbox-1, Tbox, p)

     return qp_values
Ejemplo n.º 8
0
 def drop_restriction(self, uncertainty):
     '''
     Remove the restriction on a single dimension of the current box.

     The limits of ``self.box_lim`` are copied, the limits for the given
     uncertainty are overwritten with those of the first box in
     ``self.box_lims``, and the resulting box is passed to
     ``self.update`` together with the indices of the remaining cases
     that fall inside it.

     Parameters
     ----------
     uncertainty : string
                   the dimension whose restriction is dropped

     '''
     relaxed_lim = copy.deepcopy(self.box_lim)
     relaxed_lim[uncertainty][:] = self.box_lims[0][uncertainty][:]

     # positions within the remaining cases, mapped back to case indices
     in_box = sdutil._in_box(self.prim.x[self.prim.yi_remaining],
                             relaxed_lim)
     self.update(relaxed_lim, self.prim.yi_remaining[in_box])
Ejemplo n.º 9
0
 def select(self, i):
     '''
     Make box ``i`` of the peeling and pasting trajectory the current box.

     ``self.yi`` is set to the indices of the remaining cases that fall
     inside the selected box and ``self._cur_box`` is set to ``i``.

     Parameters
     ----------
     i : int
         the index of the box to select.

     Raises
     ------
     PrimException
         if the box has been frozen

     '''
     if self._frozen:
         raise PrimException("""box has been frozen because PRIM has found 
                             at least one more recent box""")

     chosen_lim = self.box_lims[i]
     in_box = sdutil._in_box(self.prim.x[self.prim.yi_remaining],
                             chosen_lim)

     # positions within the remaining cases, mapped back to case indices
     self.yi = self.prim.yi_remaining[in_box]
     self._cur_box = i
Ejemplo n.º 10
0
 def _categorical_paste(self, box, u):
     '''
     Build one candidate paste for every category of ``u`` that is
     currently not part of the box limits.

     Parameters
     ----------
     box : a PrimBox instance
     u : string
         the uncertainty for which to paste

     Returns
     -------
     list
         one ``(indices, box_lim)`` tuple per category that can be added;
         empty when the box already holds every category of the initial
         box

     '''
     current_lim = box.box_lims[-1]

     cats_in_box = current_lim[u][0]
     cats_total = self.box_init[u][0]

     # guard: no pastes possible, return empty list
     if not len(cats_in_box) < len(cats_total):
         return []

     candidates = []
     for extra in cats_total - cats_in_box:
         widened_lim = np.copy(current_lim)

         # extend a copy so the category set of the current box is untouched
         widened_cats = copy.deepcopy(cats_in_box)
         widened_cats.add(extra)
         widened_lim[u][:] = widened_cats

         in_box = sdutil._in_box(self.x[self.yi_remaining], widened_lim)
         candidates.append((self.yi_remaining[in_box], widened_lim))
     return candidates
Ejemplo n.º 11
0
    def _classification_stats(self, box, box_init):
        '''Compute gini impurity, mass and class composition of ``box``.

        Parameters
        ----------
        box : the box limits handed to ``sdutil._in_box``
        box_init : the reference limits handed, together with ``box``, to
                   ``sdutil._determine_nr_restricted_dims``

        Returns
        -------
        dict
            with the keys 'gini', 'mass', 'box_composition' (number of
            selected outcomes per class, classes in sorted order) and
            'res dim'

        '''
        y_in_box = self.y[sdutil._in_box(self.x, box)]

        # classes are taken from all outcomes, not only those in the box
        classes = sorted(set(self.y))
        counts = [y_in_box[y_in_box==label].shape[0] for label in classes]

        # gini impurity: one minus the sum of squared class shares
        gini = 1 - sum((count/y_in_box.shape[0])**2 for count in counts)

        boxstats = {}
        boxstats['gini'] = gini
        boxstats['mass'] = y_in_box.shape[0]/self.y.shape[0]
        boxstats['box_composition'] = counts
        boxstats['res dim'] = sdutil._determine_nr_restricted_dims(box,
                                                                   box_init)
        return boxstats
Ejemplo n.º 12
0
    def _classification_stats(self, box, box_init):
        '''Compute gini impurity, mass and class composition of ``box``.

        Parameters
        ----------
        box : the box limits handed to ``sdutil._in_box``
        box_init : the reference limits handed, together with ``box``, to
                   ``sdutil._determine_nr_restricted_dims``

        Returns
        -------
        dict
            with the keys 'gini', 'mass', 'box_composition' (number of
            selected outcomes per class, classes in sorted order) and
            'res dim'

        '''
        in_box = sdutil._in_box(self.x, box)
        selected = self.y[in_box]

        # classes are taken from all outcomes, not only those in the box
        labels = sorted(set(self.y))
        counts = [selected[selected == label].shape[0] for label in labels]

        # gini impurity: one minus the sum of squared class shares
        squared_shares = 0
        for count in counts:
            share = count / selected.shape[0]
            squared_shares += share**2
        gini = 1 - squared_shares

        mass = selected.shape[0] / self.y.shape[0]
        res_dim = sdutil._determine_nr_restricted_dims(box, box_init)

        return {
            'gini': gini,
            'mass': mass,
            'box_composition': counts,
            'res dim': res_dim
        }
Ejemplo n.º 13
0
    def _real_paste(self, box, u):
        '''
        Return two candidate new boxes, pasted along the lower and the
        upper limit of dimension ``u``.

        For each direction the region between the current limit and the
        limit of the initial box is inspected. If remaining data points
        fall in that region, the new limit is a quantile of their values
        for ``u`` (``paste_alpha`` for the upper limit, ``1 - paste_alpha``
        for the lower limit); otherwise the limit of the initial box is
        used.

        Parameters
        ----------
        box : a PrimBox instance
        u : string
            the uncertainty for which to paste

        Returns
        -------
        list
            two ``(indices, box_lim)`` tuples, first the lower and then
            the upper paste

        '''
        current_lim = box.box_lims[-1]

        # fancy indexing copies the data, so select the remaining rows once
        x_remaining = self.x[self.yi_remaining]

        pastes = []
        for i, direction in enumerate(['lower', 'upper']):
            box_paste = np.copy(current_lim)
            # box containing data candidate for pasting
            paste_box = np.copy(current_lim)

            if direction == 'upper':
                # candidate region: from current upper to initial upper limit
                paste_box[u][0] = paste_box[u][1]
                paste_box[u][1] = self.box_init[u][1]
                indices = sdutil._in_box(x_remaining, paste_box)
                data = x_remaining[indices][u]

                paste_value = self.box_init[u][i]
                if data.shape[0] > 0:
                    paste_value = get_quantile(data, self.paste_alpha)

                assert paste_value >= current_lim[u][i]

            elif direction == 'lower':
                # candidate region: from initial lower to current lower limit
                paste_box[u][0] = self.box_init[u][0]
                paste_box[u][1] = box_paste[u][0]

                indices = sdutil._in_box(x_remaining, paste_box)
                data = x_remaining[indices][u]

                paste_value = self.box_init[u][i]
                if data.shape[0] > 0:
                    paste_value = get_quantile(data, 1-self.paste_alpha)

                # NOTE(review): unlike the upper branch, a violated
                # invariant is only reported here, not asserted.
                if not paste_value <= current_lim[u][i]:
                    print("{}, {}".format(paste_value, current_lim[u][i]))

            dtype = box_paste.dtype.fields[u][0]
            if dtype==np.int32:
                # builtin int replaces the ``np.int`` alias, which was
                # removed in NumPy 1.24 and raises AttributeError there
                paste_value = int(paste_value)

            box_paste[u][i] = paste_value
            indices = sdutil._in_box(x_remaining, box_paste)
            indices = self.yi_remaining[indices]

            pastes.append((indices, box_paste))

        return pastes
    def test_in_box(self):
        '''Check ``sdutil._in_box`` against hand-computed expectations.

        Four cases are exercised: a single integer field, two integer
        fields, and a float/int/categorical mix with a narrow and with an
        all-encompassing set of limits. In every case the expected value
        lists the positions of the rows of ``x`` that satisfy the limits
        (the expectations treat both limits as inclusive).
        '''
        # The builtin types replace the ``np.int`` / ``np.float`` /
        # ``np.object`` aliases, which were removed in NumPy 1.24 and raise
        # AttributeError there. The resulting dtypes are identical.
        dtype = [('a', int)]
        x = np.array([(0,),
                      (1,),
                      (2,),
                      (3,),
                      (4,),
                      (5,),
                      (6,),
                      (7,),
                      (8,),
                      (9,)],
                     dtype=dtype)
        boxlim = np.array([(1,),
                           (8,)], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # two integer fields: the tighter upper limit on 'b' excludes row 8
        dtype = [('a', int),
                 ('b', int)]
        x = np.array([(0, 0),
                      (1, 1),
                      (2, 2),
                      (3, 3),
                      (4, 4),
                      (5, 5),
                      (6, 6),
                      (7, 7),
                      (8, 8),
                      (9, 9)],
                     dtype=dtype)
        boxlim = np.array([(1, 0),
                           (8, 7)], dtype=dtype)
        correct_result = np.array([1, 2, 3, 4, 5, 6, 7], dtype=int)
        result = sdutil._in_box(x, boxlim)

        self.assertTrue(np.all(correct_result == result))

        # mixed fields: the categorical limit is a set of allowed values
        dtype = [('a', float),
                 ('b', int),
                 ('c', object)]
        x = np.array([(0.1, 0, 'a'),
                      (1.1, 1, 'a'),
                      (2.1, 2, 'b'),
                      (3.1, 3, 'b'),
                      (4.1, 4, 'c'),
                      (5.1, 5, 'c'),
                      (6.1, 6, 'd'),
                      (7.1, 7, 'd'),
                      (8.1, 8, 'e'),
                      (9.1, 9, 'e')],
                     dtype=dtype)
        boxlim = np.array([(1.2, 0, {'a', 'b'}),
                           (8.0, 7, {'a', 'b'})], dtype=dtype)
        correct_result = np.array([2, 3], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))

        # limits that span the full data set must select every row
        boxlim = np.array([(0.1, 0, {'a', 'b', 'c', 'd', 'e'}),
                           (9.1, 9, {'a', 'b', 'c', 'd', 'e'})], dtype=dtype)
        correct_result = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int)
        result = sdutil._in_box(x, boxlim)
        self.assertTrue(np.all(correct_result == result))