Esempio n. 1
0
    def labelInstances(self, select_ind, client: Client = None, verbose=False):
        """
        Query the oracle for the selected instances and write the answers into ``self._Y``.

        Parameters
        ----------
        :param select_ind: sequence of int
            Row positions of the instances to label. They are used both to slice
            ``self._X`` for the oracle and as the positions overwritten in ``self._Y``.
        :param client: distributed.Client, optional (default=None)
            When given, ``client.rebalance`` is called on the updated label array.
        :param verbose: bool, optional (default=False)
            When True, print the labels and the cost returned by the oracle.
        """
        # For each selected instance retrieve from the simOracle the labeled instances
        labels, cost = self._oracle.query(instances=self._X[select_ind],
                                          indexes=select_ind)

        # Rank of the complete oracle answer; it does not change inside the loop.
        labels_ndim = len(da.shape(np.asarray(labels)))

        # Apply every replacement on top of the previous one. Rebuilding from the
        # untouched ``self._Y`` on each iteration would keep only the last label,
        # and an empty selection would leave ``result`` unbound.
        result = self._Y
        last_position = len(self._Y) - 1

        for position, label in zip(select_ind, labels):
            if isinstance(label, (list, np.ndarray)):
                label_shape = np.shape(label)
            else:
                label_shape = da.shape(label)

            # Wrap the label so it has the same rank as the array it is spliced into.
            new_item = label if len(label_shape) == labels_ndim else [label]

            if position == 0:  # choose the first item
                pieces = [new_item, result[position + 1:]]
            elif position == last_position:  # choose the last item
                pieces = [result[:position], new_item]
            else:  # any other item
                pieces = [result[:position], new_item, result[position + 1:]]

            result = da.concatenate(pieces, axis=0)

        self._Y = result.persist()

        if client is not None:
            client.rebalance(self._Y)

        if verbose:
            print("Label: %s, Cost: %s" % (labels, cost))
Esempio n. 2
0
    def _get_pred(self, unlabel_x, model, proba=True, **kwargs):
        """
        Get the prediction results of the unlabeled set.

        Parameters
        ----------
        :param unlabel_x: array
            The [n_samples, n_features]  matrix of the unlabeled set.
        :param model: object
            Model object which has the prediction capabilities.
        :param proba: bool
            Whether to get the prediction  for the unlabeled dataset or the prediction probabilities
        :param kwargs: optional

        Returns
        -------
        pv: dask.array
            Probability predictions matrix with shape [n_samples, n_classes].
        """

        if proba:
            if not hasattr(model, 'predict_proba'):
                raise Exception('model object must implement predict_proba methods in current algorithm.')
            proba = model.predict_proba(unlabel_x)
            pv = da.asarray(proba)
            spv = da.shape(pv)

            if len(spv) != 2 or spv[1] == 1:
                raise Exception('2d array with [n_samples, n_class] is expected, but received: \n%s' % str(pv))

            return pv.persist()
        else:
            pv = model.predict(unlabel_x, kwargs)
            return pv
Esempio n. 3
0
    def _select_by_prediction(self, unlabel_index, predict, batch_size=1, **kwargs):
        """
        Perform basic validation for indexes selection for querying

        Parameters
        ----------
        :param unlabel_index: {list, np.ndarray, IndexCollection}
            The indexes of unlabeled samples. Should be one-to-one
            correspondence to the prediction matrix.
        :param predict: dask.array, [n_samples, n_classes]
            The prediction matrix for the unlabeled set.
        :param kwargs: optional
        """

        if batch_size <= 0:
            raise Exception('batch_size param must be greater or equal than 1 ')

        assert (isinstance(unlabel_index, collections.abc.Iterable))
        unlabel_index = np.asarray(unlabel_index)

        if len(unlabel_index) <= batch_size:
            return unlabel_index

        predict_shape = da.shape(predict)

        if len(predict_shape) != 2 or predict_shape[1] == 1:
            raise Exception('2d array with the shape [n_samples, n_classes]'
                            ' is expected, but received shape: \n%s' % str(predict_shape))
Esempio n. 4
0
    def _select_by_prediction(self, unlabel_index, predict, batch_size=1):
        """
        Pick indexes from ``unlabel_index`` based on the absolute prediction values.

        Parameters
        ----------
        :param unlabel_index: array-like
            The indexes of unlabeled samples, wrapped into a dask array here.
        :param predict: dask.array
            Either a 1d array or a 2d array with exactly one column.
        :param batch_size: int, optional (default=1)
            Forwarded to ``nsmallestarg`` together with the absolute values.

        Returns
        -------
        The computed entries of ``unlabel_index`` at the positions returned by
        ``nsmallestarg`` (presumably the ``batch_size`` smallest absolute
        predictions -- confirm against the helper).
        """
        shape = da.shape(predict)
        rank = len(shape)

        assert (rank in [1, 2])
        if rank == 2 and shape[1] != 1:
            raise Exception(
                '1d or 2d with 1 column array is expected, but received: \n%s'
                % str(predict))

        # A single-column matrix is collapsed to 1d; a 1d input is used as is.
        values = predict.flatten() if rank == 2 else predict
        magnitudes = da.absolute(values)

        candidates = da.from_array(unlabel_index)
        chosen = nsmallestarg(magnitudes, batch_size)
        return candidates[chosen].compute()
Esempio n. 5
0
    def __init__(self,
                 client: Client,
                 X,
                 Y,
                 ml_technique,
                 scenario_type: AbstractScenario,
                 performance_metrics: list,
                 query_strategy: SingleLabelIndexQuery,
                 oracle: Oracle,
                 stopping_criteria: AbstractStopCriterion,
                 self_partition: bool,
                 kfolds: int = 1,
                 batch_size=1,
                 **kwargs):
        """
        Validate the experiment configuration, persist the data and build the scenario.

        Parameters
        ----------
        :param client: distributed.Client
            May be None; only used to rebalance the persisted arrays.
        :param X: array-like
            Data matrix with [n_samples, n_features]
        :param Y: array-like
            labels of given data [n_samples, n_labels] or [n_samples]
        :param ml_technique
        :param scenario_type: Sub-Type of AbstractScenario
            Type of Active Learning scenario to use
        :param performance_metrics: array-like of BaseMetrics elements
        :param query_strategy: SingleLabelIndexQuery
        :param oracle: Oracle
        :param stopping_criteria: AbstractStopCriterion
        :param self_partition: bool
        :param kfolds: int, optional (default=1)
             If self_partition is True Random split data k sets according to the extra parameters
                -> test_ratio: float, optional (default=0.3)
                    Ratio of test set
                -> initial_label_rate: float, optional (default=0.05)
                    Ratio of initial label set
                    e.g. Initial_labelset*(1-test_ratio)*n_samples
                -> all_class: bool, optional (default=True)
                    Whether each split will contain at least one instance for each class.
                    If False, a totally random split will be performed.

            If self_partition is False the following parameters must be specified,
            each shaped [n_folds, n_instances]
                -> train_idx:
                -> test_idx:
                -> label_idx:
                -> unlabel_idx:
        :param batch_size: int, optional (default=1)
            Forwarded unchanged to the scenario constructor.
        :param kwargs: optional
            Extra parameters. Besides the ones listed above, ``rebalance`` (default
            False) requests ``client.rebalance`` on the persisted X and Y.

        Raises
        ------
        ValueError
            If a required parameter is missing or the supplied indexes are inconsistent.
        """
        self._client = client

        # Aim for roughly 50 chunks, but never a chunk size of 0, which is what
        # the integer division yields for inputs with fewer than 50 rows.
        if isinstance(X, da.core.Array):
            self._X = X.persist()
        else:
            self._X = da.from_array(X, chunks=max(1, len(X) // 50)).persist()

        if isinstance(Y, da.core.Array):
            self._Y = Y.persist()
        else:
            self._Y = da.from_array(Y, chunks=max(1, len(Y) // 50)).persist()

        # Persists the Dask Storage Structures
        if client is not None and kwargs.pop("rebalance", False):
            client.rebalance(self._X)
            client.rebalance(self._Y)

        check_X_y(self._X,
                  self._Y,
                  accept_sparse='csc',
                  multi_output=True,
                  distributed=False)

        self._scenario_type = scenario_type
        if self._scenario_type is None:
            raise ValueError("required param 'scenario_type' can not be empty")
        if not issubclass(self._scenario_type, AbstractScenario):
            raise ValueError(
                "the 'scenario_type' must be a subclass of 'AbstractScenario'")

        if self_partition:
            self._kfolds = kfolds
            self._train_idx, self._test_idx, self._label_idx, self._unlabel_idx = split(
                X=self._X,
                y=self._Y,
                test_ratio=kwargs.pop("test_ratio", 0.3),
                initial_label_rate=kwargs.pop("initial_label_rate", 0.05),
                split_count=self._kfolds,
                all_class=kwargs.pop("all_class", True))
        else:
            train_idx = kwargs.pop("train_idx", None)
            test_idx = kwargs.pop("test_idx", None)
            label_idx = kwargs.pop("label_idx", None)
            unlabel_idx = kwargs.pop("unlabel_idx", None)

            if train_idx is None:
                raise ValueError(
                    "required param 'train_idx' can not be empty ")
            if test_idx is None:
                raise ValueError("required param 'test_idx' can not be empty ")
            if label_idx is None:
                raise ValueError(
                    "required param 'label_idx' can not be empty ")
            if unlabel_idx is None:
                raise ValueError(
                    "required param 'unlabel_idx' can not be empty ")

            num_inst_x, num_feat = da.shape(self._X)
            # A 1d label vector is treated as a single label column.
            y_shape = da.shape(self._Y)
            num_inst_y, num_labels = y_shape if len(y_shape) > 1 else (y_shape[0], 1)
            folds_train, num_inst_train = np.shape(train_idx)
            folds_test, num_inst_test = np.shape(test_idx)
            folds_labeled, num_inst_labeled = np.shape(label_idx)
            folds_unlabeled, num_inst_unlabeled = np.shape(unlabel_idx)

            if num_inst_x != num_inst_y:
                raise ValueError(
                    "Different numbers of instances for inputs (x:%s, y:%s)" %
                    (num_inst_x, num_inst_y))

            if folds_train != folds_test or folds_test != folds_labeled or folds_labeled != folds_unlabeled:
                raise ValueError(
                    "Different numbers of folds for inputs (train_idx:%s, test_idx:%s "
                    "label_idx:%s, unlabel_idx:%s)" %
                    (folds_train, folds_test, folds_labeled, folds_unlabeled))
            if kfolds != folds_test:
                raise ValueError(
                    "Number of folds for inputs (train_idx:%s, test_idx:%s "
                    "label_idx:%s, unlabel_idx:%s) must be equals to kfolds:%s param"
                    % (folds_train, folds_test, folds_labeled, folds_unlabeled,
                       kfolds))

            if num_inst_train + num_inst_test != num_inst_x:
                raise ValueError(
                    "The sum of the number of instances for train_idx and test_idx must be equal to the "
                    "number of instances for x"
                    "(num_inst_x:%s, num_inst_train:%s num_inst_test:%s)" %
                    (num_inst_x, num_inst_train, num_inst_test))

            if num_inst_labeled + num_inst_unlabeled != num_inst_train:
                # Report the train size as the third value so the message shows
                # both sides of the failed comparison.
                raise ValueError(
                    "The sum of the number of instances for label_idx and unlabel_idx must be equal to the "
                    "number of instances for train_idx"
                    "(num_inst_labeled:%s, num_inst_unlabeled:%s num_inst_train:%s)"
                    %
                    (num_inst_labeled, num_inst_unlabeled, num_inst_train))

            self._kfolds = folds_train
            self._train_idx = train_idx
            self._test_idx = test_idx
            self._label_idx = label_idx
            self._unlabel_idx = unlabel_idx

        self._ml_technique = ml_technique
        if self._ml_technique is None:
            raise ValueError("required param 'ml_technique' can not be empty")

        self._performance_metrics = performance_metrics
        if self._performance_metrics is None or len(
                self._performance_metrics) == 0:
            raise ValueError(
                "required param 'performance_metrics' can not be empty")
        for metric in self._performance_metrics:
            if not isinstance(metric, BaseMetrics):
                raise ValueError(
                    "the elements in 'performance_metrics' must be of type BaseMetrics"
                )

        self._query_strategy = query_strategy
        if self._query_strategy is None:
            raise ValueError(
                "required param 'query_strategy' can not be empty")

        self._oracle = oracle
        if self._oracle is None:
            raise ValueError("required param 'oracle' can not be empty")

        self._stopping_criteria = stopping_criteria
        if self._stopping_criteria is None:
            raise ValueError(
                "required param 'stopping_criteria' can not be empty")

        # Instantiate the scenario from the class object that was passed in.
        # Looking it up by name with eval() only works when the class happens to
        # be a global of this module, and evaluates a string for no benefit.
        # The scenario starts on the first fold; the label/unlabel collections
        # are deep-copied before being handed over.
        self._scenario = self._scenario_type(
            X=self._X,
            y=self._Y,
            train_idx=self._train_idx[0],
            test_idx=self._test_idx[0],
            label_idx=copy.deepcopy(IndexCollection(self._label_idx[0])),
            unlabel_idx=copy.deepcopy(IndexCollection(self._unlabel_idx[0])),
            ml_technique=self._ml_technique,
            performance_metrics=self._performance_metrics,
            query_strategy=self._query_strategy,
            oracle=self._oracle,
            batch_size=batch_size)
Esempio n. 6
0
    def select(self, X, y, label_index, unlabel_index, batch_size=1, model=None, client: Client = None):
        """
        Score every unlabeled instance by re-training the model with each possible label.

        For each unlabeled instance and each class found in ``y``, a copy of
        ``model`` is fitted on the labeled set plus that instance with the
        hypothesised class, and ``self._loss`` is evaluated on its probability
        predictions for the remaining unlabeled instances. The losses are weighted
        by the current model's probability for that class and summed per instance.

        Parameters
        ----------

        :param X: array
            The [n_samples, n_features] training samples with n-features per instance.
        :param y: array
            The [n_samples] label vector.
        :param label_index: {list, np.ndarray, IndexCollection}
            The indexes of labeled samples.
        :param unlabel_index: {list, np.ndarray, IndexCollection}
            The indexes of unlabeled samples.
        :param batch_size: int, optional (default=1)
            Must be positive. When there are no more unlabeled indexes than
            ``batch_size`` they are returned without scoring.
        :param model: object
            Current classification model, must have the 'predict_proba' method for
            probabilistic output. Required despite the None default.
        :param client: distributed.Client, optional (default=None)
            When given, the model is fitted inside the joblib "dask" backend.

        Returns
        -------
        unlabel_index
            The input unchanged, when ``len(unlabel_index) <= batch_size``.
        (unlabel_index, scores): tuple
            Otherwise the unlabeled indexes as a numpy array and a dask array
            with one score per unlabeled index.

        Raises
        ------
        Exception
            If ``batch_size`` is not positive, or X, y or model is missing.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
        # ``collections.abc``.
        from collections.abc import Iterable

        if batch_size <= 0:
            raise Exception('batch_size param must be greater or equal than 1 ')

        assert (isinstance(unlabel_index, Iterable))
        assert (isinstance(label_index, Iterable))

        if len(unlabel_index) <= batch_size:
            return unlabel_index

        if X is None or y is None:
            raise Exception('Data matrix is not provided.')

        if model is None:
            raise Exception('Model is not provided.')

        label_index = np.asarray(label_index)
        unlabel_index = np.asarray(unlabel_index)

        scores = da.from_array([])
        classes = da.unique(y).compute()
        pv = self._get_pred(X[unlabel_index, :], proba=True, model=model)
        predict_shape = da.shape(pv)

        # Labels of the already labeled instances; identical for every candidate.
        labeled_y = y[label_index]

        # for each unlabeled instance
        for i in range(predict_shape[0]):
            # Training set: the labeled instances plus the current candidate.
            new_train_X = X[np.append(label_index, unlabel_index[i]), :]
            # Evaluation set: every other unlabeled instance.
            new_unlabel_X = X[np.delete(unlabel_index, i), :]
            score = da.from_array([])

            # for each label the candidate could take
            for yi in classes:
                new_model = copy.deepcopy(model)
                # The candidate is given the hypothesised label ``yi``; the class
                # value is appended as a label and must not be used as a row index.
                new_train_y = da.concatenate(
                    [labeled_y, da.from_array(np.asarray([yi]))], axis=0)

                # Fit eagerly so the re-training really happens: a lazily built
                # fit whose result is discarded never runs before predicting.
                if client is not None:
                    with joblib.parallel_backend("dask"):
                        new_model.fit(new_train_X, new_train_y)
                        prob = new_model.predict_proba(new_unlabel_X)
                else:
                    new_model.fit(new_train_X, new_train_y)
                    prob = new_model.predict_proba(new_unlabel_X)

                # NOTE(review): ``pv[i, yi]`` uses the class value as a column
                # position, which assumes labels are 0..n_classes-1 -- confirm.
                score = da.concatenate([score, da.from_array([pv[i, yi] * self._loss(prob)])], axis=0)

            scores = da.concatenate([scores, da.from_array([da.sum(score)])], axis=0)

        return unlabel_index, scores