コード例 #1
0
 def _get_model_features(self):
     """
     Refresh the feature buffer through the C library and return the features.

     ``self._features`` and the C model handle are handed to
     ``SubCMediansWrapper_c.get_features`` (presumably filling the list in
     place -- confirm against the wrapper). The value returned is a new list:
     the buffer content followed by ``self.generation``.
     """
     features_buffer = self._features
     SubCMediansWrapper_c.get_features(features_buffer, self._p_subcmedians_c)
     with_generation = features_buffer + [self.generation]
     return with_generation
コード例 #2
0
 def fit(self, X, y=None, verbose=1):
     """
     sklearn-like fit function, receives a dataset and builds the subspace
     clustering that models the data.

     A random sample of ``self.N`` objects is first loaded with
     ``_set_data_sample``. Then, for each of the ``self.NbIter`` iterations,
     the oldest index of the sample is swapped with a randomly drawn index
     that is not in the sample, the drawn object is sent to the C library
     and the model is trained with it.

     Parameters
     ----------
     X : dataset; validated with ``_check_X_matrix_validity`` and indexed as
         ``X_[i, :]`` afterwards. ``None`` makes the function return ``None``.
     y : optional class labels, indexed as ``y[i]``.
     verbose : when truthy, a progress counter is written to stdout.

     Raises
     ------
     RuntimeError
         If the dataset holds fewer objects than the sample size ``self.N``.
     """
     print("")
     if X is None:
         return None
     if X.size < self.N:
         raise RuntimeError(
             'The dataset provided is smaller than the sample size, use instead the fit_online function'
         )
     X_ = self._check_X_matrix_validity(X)
     if len(X_) < self.N:
         # X.size counts elements, not objects: a dataset with few rows and
         # many columns passes the check above, yet _set_data_sample needs
         # self.N distinct row indices and would fail while drawing them.
         raise RuntimeError(
             'The dataset provided is smaller than the sample size, use instead the fit_online function'
         )
     # Explicit None test: the truth value of a multi-element numpy array is
     # ambiguous and raises ValueError.
     has_labels = y is not None
     self._set_data_sample(X_, y)
     for iteration in xrange(self.NbIter):
         # Draw the position (in the "not in sample" list) of the newcomer.
         random_element = np.random.randint(
             0, len(self.data_objects_index_not_in_sample))
         random_index = self.data_objects_index_not_in_sample.pop(
             random_element)
         # The oldest index leaves the sample and becomes drawable again.
         data_object_index_removed_from_sample = self.data_objects_index_in_sample.pop(
             0)
         self.data_objects_index_in_sample.append(random_index)
         self.data_objects_index_not_in_sample.append(
             data_object_index_removed_from_sample)
         if has_labels:
             self._send_array(X_[random_index, :], y[random_index])
         else:
             self._send_array(X_[random_index, :])
         SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
             self._p_subcmedians_c, self._data_object)
         self.generation += 1
         if verbose:
             sys.stdout.write("\r" + str(iteration) + "/" +
                              str(self.NbIter))
             sys.stdout.flush()
     print("")
コード例 #3
0
 def __init__(self,
              SDmax=STD_SDmax,
              D=STD_D,
              N=STD_N,
              NbIter=STD_NbIter,
              threshold_cluster_validity=STD_THRESHOLD_CLUSTER_VALIDITY,
              seed=STD_SEED,
              option_deletion=STD_OPT_DEL,
              option_insertion=STD_OPT_INS,
              option_FIFO=STD_FIFO,
              option_train_with_latest=STD_TRAIN_WITH_LATEST,
              option_lazy_hill_climbing=STD_LAZY_HILL_CLIMBING,
              population_size=STD_LAMBDA,
              nb_generations_generation_update=STD_ETA):
     """
     Creates a customizable SubCMedians object. This version has more options
     than the one presented in the paper; using the SubCMedians object instead
     is suggested.

     Every constructor argument is stored as an attribute of the same name.
     All of them except ``NbIter`` are also forwarded to
     ``SubCMediansWrapper_c.generate_SubCMediansclust``. The constructor then
     creates the Python-side buffers handed to the C wrapper, a PRNG built
     from ``seed``, and three C point containers (sized with ``N`` and ``D``,
     ``D`` and ``SDmax`` respectively).
     """
     # Constructor arguments mirrored as public attributes.
     self.SDmax, self.D, self.N = SDmax, D, N
     self.NbIter = NbIter
     self.threshold_cluster_validity = threshold_cluster_validity
     self.seed = seed
     self.option_deletion = option_deletion
     self.option_insertion = option_insertion
     self.option_FIFO = option_FIFO
     self.option_train_with_latest = option_train_with_latest
     self.option_lazy_hill_climbing = option_lazy_hill_climbing
     self.population_size = population_size
     self.nb_generations_generation_update = nb_generations_generation_update

     # Handle on the C-side clustering object.
     c_arguments = (SDmax, D, N, threshold_cluster_validity, seed,
                    option_deletion, option_insertion, option_FIFO,
                    option_train_with_latest, option_lazy_hill_climbing,
                    population_size, nb_generations_generation_update)
     self._p_subcmedians_c = SubCMediansWrapper_c.generate_SubCMediansclust(
         *c_arguments)

     # Reusable Python containers passed to the C getters.
     self._model_getter = []
     self._distances_to_cluster_getter = []
     self._lengths = []
     self._features = []
     self._object_class_cluster = []
     self._cluster_getter = []
     self._aggregatedstats = {}

     # C-side random generator and point containers.
     self._prng = SubCMediansWrapper_c.generate_prng(self.seed)
     self._stream = SubCMediansWrapper_c.generate_array_SubCMedians_point(
         self._prng, N, D)
     self._data_object = SubCMediansWrapper_c.generate_SubCMedians_point(
         self._prng, D)
     self._cluster_object = SubCMediansWrapper_c.generate_SubCMedians_point(
         self._prng, SDmax)

     self.time_start = timer()
     # NOTE(review): "M" is listed below but no ``self.M`` is assigned in
     # this constructor -- confirm it is defined elsewhere on the class.
     self._parameters = [
         "SDmax", "D", "N", "M", "option_deletion", "option_insertion",
         "option_FIFO", "option_train_with_latest", "seed",
         "option_lazy_hill_climbing", "population_size",
         "nb_generations_generation_update"
     ]
     self.generation = 0
コード例 #4
0
 def _get_subcmedians_model(self):
     """
     Fetch the current SubCMedians model from the C library.

     After the C call, ``self._lengths[0]`` is read as the number of entries
     of the model and ``self._lengths[i + 1]`` as the number of leading items
     to keep from ``self._model_getter[i]``. A list of those slices is
     returned.
     """
     SubCMediansWrapper_c.get_SubCMediansclust_model(
         self._model_getter, self._lengths, self._p_subcmedians_c)
     model = []
     for entry_index in xrange(self._lengths[0]):
         used_length = self._lengths[entry_index + 1]
         model.append(self._model_getter[entry_index][:used_length])
     return model
コード例 #5
0
 def _transform_array(self, x):
     """
     Transform a single object x into its distances to the model.

     The object is sent to the C library and clustered with the current
     model; the resulting cluster is then passed to
     ``get_distances_to_core_point`` together with
     ``self._distances_to_cluster_getter``, which is returned converted with
     ``array``.
     """
     self._send_array(x)
     handle = self._p_subcmedians_c
     point = self._data_object
     # The distance returned by the clustering call is not used here.
     assigned_cluster, _ = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
         handle, point)
     SubCMediansWrapper_c.get_distances_to_core_point(
         assigned_cluster, handle, point, self._distances_to_cluster_getter)
     return array(self._distances_to_cluster_getter)
コード例 #6
0
 def _get_class_clusters_current_data_sample(self):
     """
     Get the class / cluster membership of the current data sample.

     Returns a DataFrame with columns "class" and "cluster" holding one row
     per position of the C-side data window; each row is the content of
     ``self._object_class_cluster`` after querying that position.
     """
     handle = self._p_subcmedians_c
     membership = DataFrame(columns=["class", "cluster"])
     window_size = SubCMediansWrapper_c.get_data_window_size(handle)
     for position in xrange(window_size):
         SubCMediansWrapper_c.get_D_point_class_cluster(
             position, handle, self._object_class_cluster)
         membership.loc[position] = self._object_class_cluster
     return membership
コード例 #7
0
 def _send_array(self, x, y=None):
     """
     Send a data object, given as a numpy array or a list, to the C library.

     The object is encoded as a list starting with ``POINTDESCRIPTORS``
     zero-initialised header slots followed by one ``[dimension, 1, value]``
     triple per non-NaN coordinate of ``x``. The header receives the class
     id (``int(y)``, only when ``y`` is given) and the number of triples, and
     the list is converted into ``self._data_object``.
     """
     encoded_point = [0] * POINTDESCRIPTORS
     encoded_point.extend(
         [dimension, 1, float(value)]
         for dimension, value in enumerate(x)
         if not isnan(value))
     if y is not None:
         encoded_point[POINTCLASSID] = int(y)
     # Everything past the header slots is a coordinate triple.
     encoded_point[POINTWEIGHT] = len(encoded_point) - POINTDESCRIPTORS
     SubCMediansWrapper_c.py2C_convert_SubCMedianspoint(
         encoded_point, self._data_object)
コード例 #8
0
 def _cluster_data_object(self, x, y=None):
     """
     Send a data object encoded as a numpy array or a list and cluster it.

     Parameters
     ----------
     x : coordinates of the object, forwarded to ``_send_array``.
     y : optional class label of the object.

     Returns
     -------
     ([class, cluster], distance) where ``class`` is ``int(y)``, or ``None``
     when no label was provided, and ``cluster`` / ``distance`` are the two
     values returned by the C clustering call.
     """
     self._send_array(x, y)
     cluster, distance = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
         self._p_subcmedians_c, self._data_object)
     # y defaults to None and int(None) raises TypeError, so the documented
     # default used to crash after the object had already been clustered.
     object_class = None if y is None else int(y)
     return [object_class, cluster], distance
コード例 #9
0
 def _set_data_sample(self, X, y=None):
     """
     Set the data sample by randomly drawing ``self.N`` objects from X.

     Resets ``self.data_objects_index_in_sample`` and
     ``self.data_objects_index_not_in_sample``, then moves ``self.N`` randomly
     drawn indices from the latter to the former; each drawn object is sent
     to the C library and inserted with ``insert_SubCMedians_point_in_D``.

     Parameters
     ----------
     X : dataset supporting ``len(X)`` and ``X[i, :]``.
     y : optional class labels, indexed as ``y[i]``.
     """
     self.data_objects_index_in_sample = []
     # Materialised as a list because indices are removed with .pop();
     # a lazy range object would not support it.
     self.data_objects_index_not_in_sample = list(range(len(X)))
     # Explicit None test: the truth value of a multi-element numpy array is
     # ambiguous and raises ValueError.
     has_labels = y is not None
     for _ in xrange(self.N):
         random_element = np.random.randint(
             0, len(self.data_objects_index_not_in_sample))
         random_index = self.data_objects_index_not_in_sample.pop(
             random_element)
         self.data_objects_index_in_sample.append(random_index)
         if has_labels:
             self._send_array(X[random_index, :], y[random_index])
         else:
             self._send_array(X[random_index, :])
         SubCMediansWrapper_c.insert_SubCMedians_point_in_D(
             self._p_subcmedians_c, self._data_object)
コード例 #10
0
 def _check_consistency_C_params_Py_params(self):
     """
     Verify that the C-side parameters match the Python attributes.

     The i-th name of ``self._parameters`` is compared with the i-th value
     returned by ``SubCMediansWrapper_c.get_parameters``.

     Raises RuntimeError on the first mismatch found.
     """
     c_values = SubCMediansWrapper_c.get_parameters(self._p_subcmedians_c)
     for position, name in enumerate(self._parameters):
         python_value = getattr(self, name)
         c_value = c_values[position]
         if python_value != c_value:
             details = (name, str(python_value), c_value)
             raise RuntimeError(
                 'C capsule parameters and Python parameters are different '
                 '%s %s != %s' % details)
コード例 #11
0
 def score(self, X):
     """
     Compute the mean of the distances returned when clustering each object
     of X with the current model (mean intra-cluster distance).
     """
     validated = self._check_X_matrix_validity(X)
     distances = []
     for data_object in validated:
         self._send_array(data_object)
         # Only the distance is needed; the cluster id is discarded.
         _, object_distance = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
             self._p_subcmedians_c, self._data_object)
         distances.append(object_distance)
     distance_array = np.asarray(distances)
     return distance_array.mean()
コード例 #12
0
 def predict(self, X):
     """
     sklearn-like predict function, receives a dataset and computes the
     cluster membership of its data objects.

     Parameters
     ----------
     X : dataset, validated with ``_check_X_matrix_validity`` and iterated
         object by object.

     Returns
     -------
     One-dimensional float array holding, for each object and in order, the
     cluster returned by the C clustering call.
     """
     X_ = self._check_X_matrix_validity(X)
     # Labels are gathered in a list and converted once: growing a numpy
     # array with append() copies the whole array at every iteration.
     labels = []
     for x in X_:
         self._send_array(x)
         cluster, _ = SubCMediansWrapper_c.clusterize_SubCMedianspoint_with_model(
             self._p_subcmedians_c, self._data_object)
         labels.append(cluster)
     # dtype=float keeps the dtype obtained previously by appending to the
     # empty (float) array, including for an empty dataset.
     return array(labels, dtype=float)
コード例 #13
0
 def set_subspace_model(self, model, base_weight=1):
     """
     Replace the current model by the centers provided in ``model``.

     Each center is encoded as a list starting with ``POINTDESCRIPTORS``
     header slots followed by one ``[dimension, base_weight, value]`` triple
     per non-NaN coordinate; the header stores the index of the center and
     its number of triples. The encoded centers are handed to
     ``SubCMediansWrapper_c.clone_SubCMedians_point_from_list``.

     Parameters
     ----------
     model : iterable of centers, each an iterable of coordinates where NaN
         marks a dimension the center does not use.
     base_weight : value stored in the second slot of every triple.

     Raises
     ------
     ValueError
         If the total number of non-NaN coordinates exceeds ``self.SDmax``;
         nothing is sent to the C library in that case.
     """
     model_translation = []
     total_size = 0
     for i, center in enumerate(model):
         scm_py_list = [0] * POINTDESCRIPTORS
         scm_py_list.extend(
             [dim, base_weight, float(dim_pos)]
             for dim, dim_pos in enumerate(center)
             if not isnan(dim_pos))
         # Everything past the header slots is a coordinate triple.
         w = len(scm_py_list) - POINTDESCRIPTORS
         total_size += w
         scm_py_list[POINTINDEX] = i
         scm_py_list[POINTWEIGHT] = w
         model_translation.append(scm_py_list)
     if total_size > self.SDmax:
         # The adjacent literals previously lacked separating spaces, which
         # produced "...model.Check" and "...size modelwith" in the message.
         raise ValueError(
             'Invalid new model size %s for estimator %s. '
             'Check the size of your model and provide a smaller or equal size model '
             'with `SubCMedians.SDmax`.' % (total_size, self))
     SubCMediansWrapper_c.clone_SubCMedians_point_from_list(
         model_translation, self._p_subcmedians_c)
0
 def fit_online_mode(self, X, y=None):
     """
     Sklearn-like fit function, receives a dataset and builds the subspace
     clustering that models the data.
     This function has been created to deal with streams of data: the dataset
     provided as an input will never appear again, so no record is kept of
     which objects are in the sample.

     Parameters
     ----------
     X : a single object (one-dimensional after validation) or a dataset
         iterated object by object. ``None`` makes the function return
         ``None``.
     y : optional label (single object) or labels indexed as ``y[i]``.

     Every object is sent to the C library, the model is trained with it and
     ``self.generation`` is incremented.
     """
     if X is None:
         return None
     X_ = self._check_X_matrix_validity(X)
     if len(X_.shape) == 1:
         # A single data object: y, if any, is its label.
         self._send_array(X_, y)
         SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
             self._p_subcmedians_c, self._data_object)
         self.generation += 1
     else:
         # Explicit None test: the truth value of a multi-element numpy
         # array is ambiguous and raises ValueError.
         has_labels = y is not None
         for i, x in enumerate(X_):
             if has_labels:
                 self._send_array(x, y[i])
             else:
                 self._send_array(x)
             SubCMediansWrapper_c.train_model_with_SubCMedianspoint(
                 self._p_subcmedians_c, self._data_object)
             self.generation += 1
コード例 #15
0
    def set_params(self, **params):
        """
        Set the parameters provided to the constructor.

        Without arguments, only ``_reallocate_memory`` is called. Otherwise
        every name is checked against the attributes of the object, the
        values are assigned, the whole parameter set is pushed to the C
        library and memory is reallocated.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a name is not an attribute of the object. No attribute is
            modified in that case.
        """
        if not params:
            self._reallocate_memory()
            return self
        # Validate every name before assigning anything, so that an invalid
        # name cannot leave the Python attributes partially updated and out
        # of sync with the C side.
        for name in params:
            if not hasattr(self, name):
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `SubCMedians.get_params().keys()`.' %
                                 (name, self))
        for name, value in params.items():
            setattr(self, name, value)

        SubCMediansWrapper_c.set_parameters(
            self._p_subcmedians_c, self.SDmax, self.D, self.N,
            self.threshold_cluster_validity, self.seed, self.option_deletion,
            self.option_insertion, self.option_FIFO,
            self.option_train_with_latest, self.option_lazy_hill_climbing,
            self.population_size, self.nb_generations_generation_update)
        self._reallocate_memory()
        return self
コード例 #16
0
 def _train_on_current_training_set(self, iterations):
     """
     Run ``iterations`` training steps on the data sample currently held by
     the C library, without updating that sample.
     """
     handle = self._p_subcmedians_c
     for _ in xrange(iterations):
         SubCMediansWrapper_c.train_on_current_D(handle)
コード例 #17
0
 def _print_me(self):
     """
     Ask the C library to print a description of the current SubCMedians
     model.
     """
     handle = self._p_subcmedians_c
     SubCMediansWrapper_c.print_SubCMediansClust(handle)