def test_determine_nr_restricted_dims(self):
        """Check the restricted-dimension count for equal and differing boxes."""
        # The builtin ``float`` replaces the ``np.float`` alias, which was
        # deprecated in NumPy 1.20 and removed in NumPy 1.24.
        dtype = [("a", float), ("b", float)]

        x = np.random.rand(10)
        x = np.asarray(x, dtype=dtype)

        # all dimensions the same
        box_init = sdutil._make_box(x)
        n = sdutil._determine_nr_restricted_dims(box_init, box_init)

        self.assertEqual(n, 0)

        # a box with hand-written limits compared against the initial box
        # NOTE(review): the previous comment claimed only dimension 1 differs,
        # yet the assertion expects two restricted dimensions -- confirm intent
        b = np.array([(1, 1), (0, 1)], dtype=dtype)
        n = sdutil._determine_nr_restricted_dims(b, box_init)
        self.assertEqual(n, 2)
Ejemplo n.º 2
0
    def test_determine_nr_restricted_dims(self):
        '''Verify the number of restricted dimensions reported for two boxes.'''
        # ``np.float`` was only an alias for the builtin ``float``; it was
        # deprecated in NumPy 1.20 and removed in 1.24, so use ``float``.
        field_types = [('a', float), ('b', float)]

        x = np.random.rand(10, )
        x = np.asarray(x, dtype=field_types)

        # all dimensions the same
        box_init = sdutil._make_box(x)
        n = sdutil._determine_nr_restricted_dims(box_init, box_init)

        self.assertEqual(n, 0)

        # box with explicit limits, compared against the initial box
        # NOTE(review): the old comment said dimension 2 is unchanged, but
        # the expected count below is 2 -- confirm which one is intended
        b = np.array([(1, 1), (0, 1)], dtype=field_types)
        n = sdutil._determine_nr_restricted_dims(b, box_init)
        self.assertEqual(n, 2)
Ejemplo n.º 3
0
    def update(self, box_lims, indices):
        '''

        Update the box to the provided box limits.

        Stores the new limits and indices, computes the statistics of the
        resulting box and adds them as a new row to the peeling trajectory.

        Parameters
        ----------
        box_lims: structured numpy array
                  the new box_lims
        indices: numpy array
                 the indices of y that are inside the box

        '''
        self.yi = indices

        y = self.prim.y[self.yi]

        self.box_lims.append(box_lims)

        coi = self.prim.determine_coi(self.yi)

        data = {'coverage': coi/self.prim.t_coi,
                'density': coi/y.shape[0],
                'mean': np.mean(y),
                'res dim': sdutil._determine_nr_restricted_dims(
                    self.box_lims[-1], self.prim.box_init),
                'mass': y.shape[0]/self.prim.n}
        new_row = pd.DataFrame([data])

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add the new row.
        self.peeling_trajectory = pd.concat(
            [self.peeling_trajectory, new_row], ignore_index=True)

        # the row that was just added is now the current box
        self._cur_box = len(self.peeling_trajectory)-1
Ejemplo n.º 4
0
 def _regression_stats(self, box, box_init):
     '''Return the mean, mass and number of restricted dimensions of a box.

     The outcomes considered are the entries of ``self.y`` selected by
     ``sdutil._in_box(self.x, box)``; mass is their count relative to the
     length of ``self.y``.
     '''
     inside = self.y[sdutil._in_box(self.x, box)]
     n_inside = inside.shape[0]
     n_total = self.y.shape[0]

     return {'mean': np.mean(inside),
             'mass': n_inside/n_total,
             'res dim': sdutil._determine_nr_restricted_dims(box, box_init)}
Ejemplo n.º 5
0
    def _regression_stats(self, box, box_init):
        '''Compute summary statistics of ``box`` for a numeric outcome.

        Returns a dict with the mean of the outcomes selected by
        ``sdutil._in_box``, the share of all outcomes they represent
        ('mass') and the number of restricted dimensions relative to
        ``box_init`` ('res dim').
        '''
        selection = sdutil._in_box(self.x, box)
        y_subset = self.y[selection]

        stats = {}
        stats['mean'] = np.mean(y_subset)
        stats['mass'] = y_subset.shape[0] / self.y.shape[0]
        stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
        return stats
Ejemplo n.º 6
0
 def _binary_stats(self, box, box_init):
     '''Return coverage, density, restricted dimensions and mass of a box.

     Coverage is the sum of the outcomes in the box over the sum of all
     outcomes; density is that same sum over the number of outcomes in the
     box; mass is the number of outcomes in the box over the total number.
     '''
     inside = self.y[sdutil._in_box(self.x, box)]
     n_inside = inside.shape[0]
     coi_inside = np.sum(inside)
     coi_total = np.sum(self.y)

     return {'coverage': coi_inside/coi_total,
             'density': coi_inside/n_inside,
             'res dim': sdutil._determine_nr_restricted_dims(box, box_init),
             'mass': n_inside/self.y.shape[0]}
Ejemplo n.º 7
0
    def _peel(self, box):
        '''

        Executes the peeling phase of the PRIM algorithm. Delegates peeling
        to data type specific helper methods.

        Parameters
        ----------
        box : object
              the box to peel; must expose ``yi`` (the indices currently in
              the box) and ``update(box_lims, indices)``

        Returns
        -------
        the received box, updated in place for every accepted peel

        '''

        mass_old = box.yi.shape[0]/self.n

        x = self.x[box.yi]

        # identify all possible peels; the helper is looked up by the
        # name of the column's dtype
        possible_peels = []
        for entry in x.dtype.descr:
            u = entry[0]
            dtype = x.dtype.fields.get(u)[0].name
            possible_peels.extend(self._peels[dtype](self, box, u, x))
        if not possible_peels:
            # there is no peel identified, so return box
            return box

        # determine the scores for each peel in order
        # to identify the next candidate box
        nr_dims = len(x.dtype.descr)
        scores = []
        for i, box_lim in possible_peels:
            obj = self.obj_func(self, self.y[box.yi], self.y[i])
            non_res_dim = nr_dims - sdutil._determine_nr_restricted_dims(
                box_lim, self.box_init)
            scores.append((obj, non_res_dim, box_lim, i))

        # best objective first; ties are broken in favour of the candidate
        # that leaves more dimensions unrestricted
        scores.sort(key=itemgetter(0, 1), reverse=True)
        obj_score, _, box_new, indices = scores[0]

        mass_new = self.y[indices].shape[0]/self.n

        # accept the peel only if the box stays above the minimum mass,
        # actually shrinks, and the objective is positive
        if mass_new >= self.mass_min and mass_new < mass_old and obj_score > 0:
            box.update(box_new, indices)
            return self._peel(box)

        # else return received box
        return box
Ejemplo n.º 8
0
    def _binary_stats(self, box, box_init):
        '''Compute coverage, density, restricted dimensions and mass of ``box``.

        The outcomes in the box are the entries of ``self.y`` selected by
        ``sdutil._in_box(self.x, box)``. Their sum is divided by the sum of
        all outcomes for 'coverage' and by their own count for 'density'.
        '''
        selection = sdutil._in_box(self.x, box)
        y_subset = self.y[selection]
        subset_sum = np.sum(y_subset)

        stats = {}
        stats['coverage'] = subset_sum / np.sum(self.y)
        stats['density'] = subset_sum / y_subset.shape[0]
        stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)
        stats['mass'] = y_subset.shape[0] / self.y.shape[0]
        return stats
Ejemplo n.º 9
0
    def _paste(self, box):
        ''' Executes the pasting phase of the PRIM. Delegates pasting to data
        type specific helper methods.

        Parameters
        ----------
        box : object
              the box to paste; must expose ``yi``, ``box_lims`` and
              ``update(box_lims, indices)``

        Returns
        -------
        the received box, updated in place for every accepted paste

        '''

        mass_old = box.yi.shape[0]/self.n

        res_dim = sdutil._determine_restricted_dims(box.box_lims[-1],
                                                    self.box_init)

        # identify all possible pastes; the helper is looked up by the
        # name of the column's dtype
        possible_pastes = []
        for u in res_dim:
            debug("pasting "+u)
            dtype = self.x.dtype.fields.get(u)[0].name
            possible_pastes.extend(self._pastes[dtype](self, box, u))
        if not possible_pastes:
            # there is no paste identified, so return box
            return box

        # Only the number of columns is needed below, so read it from the
        # dtype of self.x instead of first indexing the remaining rows.
        nr_dims = len(self.x.dtype.descr)

        # determine the scores for each paste in order
        # to identify the next candidate box
        scores = []
        for i, box_lim in possible_pastes:
            obj = self.obj_func(self, self.y[box.yi], self.y[i])
            non_res_dim = nr_dims - sdutil._determine_nr_restricted_dims(
                box_lim, self.box_init)
            scores.append((obj, non_res_dim, box_lim, i))

        # best objective first; ties are broken in favour of the candidate
        # that leaves more dimensions unrestricted
        scores.sort(key=itemgetter(0, 1), reverse=True)
        box_new, indices = scores[0][2:]
        mass_new = self.y[indices].shape[0]/self.n

        mean_old = np.mean(self.y[box.yi])
        mean_new = np.mean(self.y[indices])

        # accept the paste only if the box stays above the minimum mass,
        # actually grows, and its mean does not decrease
        if (mass_new >= self.mass_min and mass_new > mass_old
                and mean_old <= mean_new):
            box.update(box_new, indices)
            return self._paste(box)

        # else return received box
        return box
Ejemplo n.º 10
0
    def _classification_stats(self, box, box_init):
        '''Return gini, mass, class counts and restricted dimensions of a box.

        Classes are the sorted distinct values of ``self.y``; the counts in
        'box_composition' follow that order and refer to the outcomes
        selected by ``sdutil._in_box(self.x, box)``.
        '''
        inside = self.y[sdutil._in_box(self.x, box)]
        n_inside = inside.shape[0]

        labels = sorted(set(self.y))
        composition = [inside[inside == label].shape[0] for label in labels]

        # one minus the sum of squared class shares within the box
        gini = 1 - sum((k/n_inside)**2 for k in composition)

        return {'gini': gini,
                'mass': n_inside/self.y.shape[0],
                'box_composition': composition,
                'res dim': sdutil._determine_nr_restricted_dims(box,
                                                                box_init)}
Ejemplo n.º 11
0
    def _classification_stats(self, box, box_init):
        '''Compute class-based statistics of ``box``.

        Returns a dict holding the gini value of the outcomes in the box,
        the share of all outcomes that fall in the box ('mass'), the count
        per class in sorted class order ('box_composition') and the number
        of restricted dimensions relative to ``box_init`` ('res dim').
        '''
        mask = sdutil._in_box(self.x, box)
        members = self.y[mask]
        size = members.shape[0]

        # tally each class and accumulate its squared share in one pass
        tallies = []
        squared_shares = 0
        for cls in sorted(set(self.y)):
            tally = members[members == cls].shape[0]
            tallies.append(tally)
            squared_shares += (tally / size)**2

        stats = {}
        stats['gini'] = 1 - squared_shares
        stats['mass'] = size / self.y.shape[0]
        stats['box_composition'] = tallies
        stats['res dim'] = sdutil._determine_nr_restricted_dims(box, box_init)

        return stats