def labelInstances(self, select_ind, client: Client = None, verbose=False):
    """
    Ask the oracle for the labels of the selected instances and store them in the label array.

    Parameters
    ----------
    :param select_ind: iterable of int
        Positions of the instances to label; used to index ``self._X`` and ``self._Y``.
    :param client: distributed.Client, optional (default=None)
        When given, ``client.rebalance`` is called on the updated label array.
    :param verbose: bool, optional (default=False)
        Print the labels and the cost returned by the oracle.
    """
    # NOTE(review): statement nesting was reconstructed; this assumes the label array is
    # persisted (and rebalanced) once per label, since each iteration reads self._Y - confirm.
    labels, cost = self._oracle.query(instances=self._X[select_ind], indexes=select_ind)

    for position, label in zip(select_ind, labels):
        if isinstance(label, (list, np.ndarray)):
            label_shape = np.shape(label)
        else:
            label_shape = da.shape(label)

        # A label whose rank differs from the rank of the whole answer is wrapped in a list
        # so that it can be concatenated along axis 0.
        same_rank = len(label_shape) == len(da.shape(np.asarray(labels)))
        replacement = label if same_rank else [label]

        if position == 0:
            # replace the first element
            pieces = [replacement, self._Y[position + 1:]]
        elif position == len(self._Y) - 1:
            # replace the last element
            pieces = [self._Y[:position], replacement]
        else:
            # replace an element in the middle
            pieces = [self._Y[:position], replacement, self._Y[position + 1:]]

        self._Y = da.concatenate(pieces, axis=0).persist()
        if client is not None:
            client.rebalance(self._Y)

    if verbose:
        print("Label: %s, Cost: %s" % (labels, cost))
def _get_pred(self, unlabel_x, model, proba=True, **kwargs):
    """
    Get the prediction results of the unlabeled set.

    Parameters
    ----------
    :param unlabel_x: array
        The [n_samples, n_features] matrix of the unlabeled set.
    :param model: object
        Model object which has the prediction capabilities.
    :param proba: bool
        True to return class probabilities (``model.predict_proba``),
        False to return plain predictions (``model.predict``).
    :param kwargs: optional
        Extra keyword arguments forwarded to ``model.predict`` when ``proba`` is False.

    Returns
    -------
    pv: dask.array
        When ``proba`` is True: the persisted probability matrix with shape
        [n_samples, n_classes]. Otherwise whatever ``model.predict`` returns.

    Raises
    ------
    Exception
        If ``proba`` is True and the model has no ``predict_proba`` attribute, or the
        probabilities are not a 2d array with more than one column.
    """
    if not proba:
        # Forward the options as keyword arguments; passing the dict positionally
        # breaks any estimator whose predict() only accepts the data matrix.
        return model.predict(unlabel_x, **kwargs)

    if not hasattr(model, 'predict_proba'):
        raise Exception('model object must implement predict_proba methods in current algorithm.')

    pv = da.asarray(model.predict_proba(unlabel_x))
    spv = da.shape(pv)
    if len(spv) != 2 or spv[1] == 1:
        raise Exception('2d array with [n_samples, n_class] is expected, but received: \n%s' % str(pv))
    return pv.persist()
def _select_by_prediction(self, unlabel_index, predict, batch_size=1, **kwargs):
    """
    Run the common input checks of a prediction-based index selection.

    Parameters
    ----------
    :param unlabel_index: {list, np.ndarray, IndexCollection}
        The indexes of unlabeled samples. Should be one-to-one
        correspondence to the prediction matrix.
    :param predict: dask.array, [n_samples, n_classes]
        The prediction matrix for the unlabeled set.
    :param batch_size: int, optional (default=1)
        Number of indexes requested; must be positive.
    :param kwargs: optional
        Accepted but not used here.

    Returns
    -------
    The unlabeled indexes as np.ndarray when there are no more of them than
    ``batch_size``; otherwise None once the prediction shape has been validated.
    """
    if batch_size <= 0:
        raise Exception('batch_size param must be greater or equal than 1 ')
    assert isinstance(unlabel_index, collections.abc.Iterable)

    candidates = np.asarray(unlabel_index)
    if len(candidates) <= batch_size:
        # nothing to choose from: every candidate is selected
        return candidates

    shape = da.shape(predict)
    is_class_matrix = len(shape) == 2 and shape[1] != 1
    if not is_class_matrix:
        raise Exception('2d array with the shape [n_samples, n_classes]'
                        ' is expected, but received shape: \n%s' % str(shape))
    return None
def _select_by_prediction(self, unlabel_index, predict, batch_size=1):
    """
    Select indexes according to the absolute value of a one-column prediction.

    Parameters
    ----------
    :param unlabel_index: array-like
        The indexes of unlabeled samples, aligned with ``predict``.
    :param predict: dask.array
        1d array, or 2d array with exactly one column.
    :param batch_size: int, optional (default=1)
        Forwarded to ``nsmallestarg`` (presumably the number of entries to pick - confirm).

    Returns
    -------
    The computed entries of ``unlabel_index`` at the positions given by ``nsmallestarg``.
    """
    predict_shape = da.shape(predict)
    rank = len(predict_shape)
    assert rank in [1, 2]

    if rank == 2:
        if predict_shape[1] != 1:
            raise Exception(
                '1d or 2d with 1 column array is expected, but received: \n%s' % str(predict))
        # single column: collapse to one dimension before taking magnitudes
        magnitudes = da.absolute(predict.flatten())
    else:
        magnitudes = da.absolute(predict)

    candidates = da.from_array(unlabel_index)
    chosen = nsmallestarg(magnitudes, batch_size)
    return candidates[chosen].compute()
def __init__(self, client: Client, X, Y, ml_technique, scenario_type: AbstractScenario,
             performance_metrics: [], query_strategy: SingleLabelIndexQuery, oracle: Oracle,
             stopping_criteria: AbstractStopCriterion, self_partition: bool, kfolds: int = 1,
             batch_size=1, **kwargs):
    """
    Validate the experiment configuration and build the scenario for the first fold.

    Parameters
    ----------
    :param client: distributed.Client
    :param X: array-like
        Data matrix with [n_samples, n_features]
    :param Y: array-like, optional
        labels of given data [n_samples, n_labels] or [n_samples]
    :param ml_technique
    :param scenario_type: Sub-Type of AbstractScenario
        Type of Active Learning scenario to use
    :param performance_metrics: array-like of BaseMetrics elements
    :param query_strategy: SingleLabelIndexQuery
    :param oracle: Oracle
    :param stopping_criteria: AbstractStopCriterion
    :param self_partition: bool
    :param kfolds: int, optional (default=1)
    :param batch_size: int, optional (default=1)
        Forwarded to the scenario.

        If self_partition is True
            Random split data k sets according to the extra parameters
            -> test_ratio: float, optional (default=0.3)
                Ratio of test set
            -> initial_label_rate: float, optional (default=0.05)
                Ratio of initial label set
                e.g. Initial_labelset*(1-test_ratio)*n_samples
            -> all_class: bool, optional (default=True)
                Whether each split will contain at least one instance for each class.
                If False, a totally random split will be performed.

        If self_partition is False the following parameters must be specified
            (each one indexed as [fold, instance])
            -> train_idx:
            -> test_idx:
            -> label_idx:
            -> unlabel_idx:

    :param kwargs: optional
        Extra parameters. ``rebalance`` (default False) asks the client to rebalance
        the persisted X and Y.

    Raises
    ------
    ValueError
        If a required parameter is missing or the supplied partitions are inconsistent.
    """
    self._client = client

    # The chunk size is clamped to 1 so that inputs shorter than 50 rows
    # do not ask dask for zero-sized chunks.
    if isinstance(X, da.core.Array):
        self._X = X.persist()
    else:
        self._X = da.from_array(X, chunks=max(1, len(X) // 50)).persist()

    if isinstance(Y, da.core.Array):
        self._Y = Y.persist()
    else:
        self._Y = da.from_array(Y, chunks=max(1, len(Y) // 50)).persist()

    # Persists the Dask Storage Structures
    if client is not None and kwargs.pop("rebalance", False):
        client.rebalance(self._X)
        client.rebalance(self._Y)

    check_X_y(self._X, self._Y, accept_sparse='csc', multi_output=True, distributed=False)

    self._scenario_type = scenario_type
    if self._scenario_type is None:
        raise ValueError("required param 'scenario_type' can not be empty")
    if not issubclass(self._scenario_type, AbstractScenario):
        raise ValueError(
            "the 'scenario_type' must be a subclass of 'AbstractScenario'")

    if self_partition:
        self._kfolds = kfolds
        self._train_idx, self._test_idx, self._label_idx, self._unlabel_idx = split(
            X=self._X,
            y=self._Y,
            test_ratio=kwargs.pop("test_ratio", 0.3),
            initial_label_rate=kwargs.pop("initial_label_rate", 0.05),
            split_count=self._kfolds,
            all_class=kwargs.pop("all_class", True))
    else:
        train_idx = kwargs.pop("train_idx", None)
        test_idx = kwargs.pop("test_idx", None)
        label_idx = kwargs.pop("label_idx", None)
        unlabel_idx = kwargs.pop("unlabel_idx", None)

        if train_idx is None:
            raise ValueError(
                "required param 'train_idx' can not be empty ")
        if test_idx is None:
            raise ValueError("required param 'test_idx' can not be empty ")
        if label_idx is None:
            raise ValueError(
                "required param 'label_idx' can not be empty ")
        if unlabel_idx is None:
            raise ValueError(
                "required param 'unlabel_idx' can not be empty ")

        # The two-value unpacking below doubles as a check that X and every
        # index collection are two-dimensional.
        num_inst_x, _num_feat = da.shape(self._X)
        num_inst_y, _num_labels = da.shape(
            self._Y) if len(da.shape(self._Y)) > 1 else (da.shape(self._Y)[0], 1)
        folds_train, num_inst_train = np.shape(train_idx)
        folds_test, num_inst_test = np.shape(test_idx)
        folds_labeled, num_inst_labeled = np.shape(label_idx)
        folds_unlabeled, num_inst_unlabeled = np.shape(unlabel_idx)

        if num_inst_x != num_inst_y:
            raise ValueError(
                "Different numbers of instances for inputs (x:%s, y:%s)" %
                (num_inst_x, num_inst_y))

        if folds_train != folds_test or folds_test != folds_labeled or folds_labeled != folds_unlabeled:
            raise ValueError(
                "Different numbers of folds for inputs (train_idx:%s, test_idx:%s "
                "label_idx:%s, unlabel_idx:%s)" %
                (folds_train, folds_test, folds_labeled, folds_unlabeled))

        if kfolds != folds_test:
            raise ValueError(
                "Number of folds for inputs (train_idx:%s, test_idx:%s "
                "label_idx:%s, unlabel_idx:%s) must be equals to kfolds:%s param" %
                (folds_train, folds_test, folds_labeled, folds_unlabeled, kfolds))

        if num_inst_train + num_inst_test != num_inst_x:
            raise ValueError(
                "The sum of the number of instances for train_idx and test_idx must be equal to the "
                "number of instances for x"
                "(num_inst_x:%s, num_inst_train:%s num_inst_test:%s)" %
                (num_inst_x, num_inst_train, num_inst_test))

        if num_inst_labeled + num_inst_unlabeled != num_inst_train:
            # Report the train count as the third value (it used to repeat the unlabeled count).
            raise ValueError(
                "The sum of the number of instances for label_idx and unlabel_idx must be equal to the "
                "number of instances for train_idx"
                "(num_inst_labeled:%s, num_inst_unlabeled:%s num_inst_train:%s)" %
                (num_inst_labeled, num_inst_unlabeled, num_inst_train))

        self._kfolds = folds_train
        self._train_idx = train_idx
        self._test_idx = test_idx
        self._label_idx = label_idx
        self._unlabel_idx = unlabel_idx

    self._ml_technique = ml_technique
    if self._ml_technique is None:
        raise ValueError("required param 'ml_technique' can not be empty")

    self._performance_metrics = performance_metrics
    if self._performance_metrics is None or len(self._performance_metrics) == 0:
        raise ValueError(
            "required param 'performance_metrics' can not be empty")
    for metric in self._performance_metrics:
        if not isinstance(metric, BaseMetrics):
            raise ValueError(
                "the elements in 'performance_metrics' must be of type BaseMetrics")

    self._query_strategy = query_strategy
    if self._query_strategy is None:
        raise ValueError(
            "required param 'query_strategy' can not be empty")

    self._oracle = oracle
    if self._oracle is None:
        raise ValueError("required param 'oracle' can not be empty")

    self._stopping_criteria = stopping_criteria
    if self._stopping_criteria is None:
        raise ValueError(
            "required param 'stopping_criteria' can not be empty")

    # Instantiate the scenario straight from the class object. Resolving it through
    # eval() of its qualified name only worked when that name happened to be bound
    # in this module's globals.
    self._scenario = self._scenario_type(
        X=self._X,
        y=self._Y,
        train_idx=self._train_idx[0],
        test_idx=self._test_idx[0],
        label_idx=copy.deepcopy(IndexCollection(self._label_idx[0])),
        unlabel_idx=copy.deepcopy(IndexCollection(self._unlabel_idx[0])),
        ml_technique=self._ml_technique,
        performance_metrics=self._performance_metrics,
        query_strategy=self._query_strategy,
        oracle=self._oracle,
        batch_size=batch_size)
def select(self, X, y, label_index, unlabel_index, batch_size=1, model=None, client: Client = None):
    """
    Score every unlabeled instance for querying.

    Parameters
    ----------
    :param X: array
        The [n_samples, n_features] training samples with n-features per instance.
    :param y: array
        The [n_samples] label vector.
    :param label_index: {list, np.ndarray, IndexCollection}
        The indexes of labeled samples.
    :param unlabel_index: {list, np.ndarray, IndexCollection}
        The indexes of unlabeled samples.
    :param batch_size: int, optional (default=1)
        Number of indexes requested; must be positive.
    :param model: object
        Current classification model, should have the 'predict_proba' method for
        probabilistic output. Required: an Exception is raised when it is None.
    :param client: distributed.Client, optional (default=None)
        When given, the model calls are issued under the joblib "dask" backend.

    Returns
    -------
    unlabel_index
        Returned unchanged (and alone) when it holds no more than ``batch_size`` entries.
    (unlabel_index, scores): tuple
        Otherwise: the unlabeled indexes as np.ndarray and a dask array with one
        score per unlabeled index.

    Raises
    ------
    Exception
        If ``batch_size`` is not positive, or X, y or model is missing.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable

    if batch_size <= 0:
        raise Exception('batch_size param must be greater or equal than 1 ')
    assert isinstance(unlabel_index, Iterable)
    assert isinstance(label_index, Iterable)
    if len(unlabel_index) <= batch_size:
        return unlabel_index

    if X is None or y is None:
        raise Exception('Data matrix is not provided.')
    if model is None:
        raise Exception('Model is not provided.')

    label_index = np.asarray(label_index)
    unlabel_index = np.asarray(unlabel_index)

    scores = da.from_array([])
    classes = da.unique(y).compute()
    pv = self._get_pred(X[unlabel_index, :], proba=True, model=model)
    predict_shape = da.shape(pv)

    # one pass per unlabeled instance (one row of the probability matrix)
    for i in range(predict_shape[0]):
        # training set extended with the i-th unlabeled instance
        new_train_X = delayed(
            X[da.concatenate([da.from_array(label_index), da.from_array([unlabel_index[i]])], axis=0), :])
        # remaining unlabeled instances
        unlabel_ind = list(unlabel_index)
        unlabel_ind.pop(i)
        new_unlabel_X = delayed(X[unlabel_ind, :])

        score = da.from_array([])
        for yi in classes:
            new_model = delayed(copy.deepcopy(model))
            # NOTE(review): the delayed fit below is never passed to compute(), so it looks like
            # prob.compute() evaluates predict_proba on a copy that was not refitted - confirm.
            # NOTE(review): yi (a class value) is used as a position in y and as a column of pv;
            # this presumably assumes classes are 0..n_classes-1 - confirm.
            if client is not None:
                with joblib.parallel_backend("dask"):
                    delayed(new_model.fit(new_train_X, y[da.concatenate([da.from_array(label_index),
                                                                         da.from_array([yi])], axis=0)]))
                    prob = delayed(new_model.predict_proba(new_unlabel_X))
            else:
                delayed(new_model.fit(new_train_X, y[da.concatenate([da.from_array(label_index),
                                                                     da.from_array([yi])], axis=0)]))
                prob = delayed(new_model.predict_proba(new_unlabel_X))

            # loss of the hypothetical model weighted by the current probability of class yi
            score = da.concatenate([score, da.from_array([pv[i, yi] * self._loss(prob.compute())])], axis=0)

        scores = da.concatenate([scores, da.from_array([da.sum(score)])], axis=0)

    return unlabel_index, scores