Example #1
0
    def labelInstances(self, select_ind, client: Client = None, verbose=False):
        """Query the oracle for the selected instances and store the labels in ``self._Y``.

        For every index in ``select_ind`` the corresponding row of ``self._Y``
        is replaced by the label returned by the oracle. The replacements are
        applied cumulatively, so all selected indexes are updated, and the
        resulting dask array is persisted.

        :param select_ind: iterable of int
            Positions (into ``self._X`` / ``self._Y``) of the instances to label.
            Indexes are presumably non-negative -- verify against callers.
        :param client: distributed.Client, optional (default=None)
            When given, ``client.rebalance`` is called on the updated labels.
        :param verbose: bool, optional (default=False)
            When True, print the labels and the cost returned by the oracle.
        """
        # For each selected instance retrieve from the simOracle the labeled instances
        labels, cost = self._oracle.query(instances=self._X[select_ind],
                                          indexes=select_ind)

        # Loop invariants: rank of the whole label batch and position of the
        # last row (each replacement swaps one row, so the length is stable).
        labels_ndim = len(da.shape(np.asarray(labels)))
        last_index = len(self._Y) - 1

        # Start from the current labels and apply every replacement on top of
        # the previous one; rebuilding from self._Y on each iteration would
        # keep only the last label. Starting from self._Y also keeps the
        # method well-defined when nothing was selected.
        updated = self._Y

        for index, label in zip(select_ind, labels):
            if isinstance(label, (list, np.ndarray)):
                label_shape = np.shape(label)
            else:
                label_shape = da.shape(label)

            # A single label has one dimension less than the batch; wrap it so
            # it can be concatenated along axis 0 as a one-row block.
            new_item = label if len(label_shape) == labels_ndim else [label]

            if index == 0:  # choose the first item
                pieces = [new_item, updated[index + 1:]]
            elif index == last_index:  # choose the last item
                pieces = [updated[:index], new_item]
            else:  # any other item
                pieces = [updated[:index], new_item, updated[index + 1:]]

            updated = da.concatenate(pieces, axis=0)

        self._Y = updated.persist()

        if client is not None:
            client.rebalance(self._Y)

        if verbose:
            print("Label: %s, Cost: %s" % (labels, cost))
Example #2
0
    def __init__(self,
                 client: Client,
                 X,
                 Y,
                 ml_technique,
                 scenario_type: AbstractScenario,
                 performance_metrics: list,
                 query_strategy: SingleLabelIndexQuery,
                 oracle: Oracle,
                 stopping_criteria: AbstractStopCriterion,
                 self_partition: bool,
                 kfolds: int = 1,
                 batch_size=1,
                 **kwargs):
        """
        Validate the experiment configuration, persist the data as dask
        arrays, compute or check the data partition and build the scenario
        for the first fold.

        Parameters
        ----------
        :param client: distributed.Client
            May be None; only used to rebalance the persisted arrays.
        :param X: array-like
            Data matrix with [n_samples, n_features]
        :param Y: array-like, optional
            labels of given data [n_samples, n_labels] or [n_samples]
        :param ml_technique
        :param scenario_type: Sub-Type of AbstractScenario
            Type of Active Learning scenario to use
        :param performance_metrics: array-like of BaseMetrics elements
        :param query_strategy: SingleLabelIndexQuery
        :param oracle: Oracle
        :param stopping_criteria: AbstractStopCriterion
        :param self_partition: bool
        :param kfolds: int, optional (default=1)
             If self_partition is True Random split data k sets according to the extra parameters
                -> test_ratio: float, optional (default=0.3)
                    Ratio of test set
                -> initial_label_rate: float, optional (default=0.05)
                    Ratio of initial label set
                    e.g. Initial_labelset*(1-test_ratio)*n_samples
                -> all_class: bool, optional (default=True)
                    Whether each split will contain at least one instance for each class.
                    If False, a totally random split will be performed.

            If self_partition is False the following parameters must be specified
                -> train_idx:
                -> test_idx:
                -> label_idx:
                -> unlabel_idx:
             Each of them is unpacked as a 2-D [n_folds, n_instances] structure.
        :param batch_size: optional (default=1)
            Forwarded unchanged to the scenario constructor.
        :param kwargs: optional
            Extra parameters
                -> rebalance: bool, optional (default=False)
                    If True and a client is given, rebalance X and Y on the cluster.

        :raises ValueError: if a required parameter is missing, if
            'scenario_type' is not a subclass of AbstractScenario, if an element
            of 'performance_metrics' is not a BaseMetrics, or if the supplied
            partition indexes are inconsistent with the data or with 'kfolds'.
        """
        self._client = client

        def _to_persisted(data):
            # Reuse existing dask arrays; otherwise split into roughly 50
            # chunks. The chunk size is clamped to 1 because inputs with
            # fewer than 50 rows would otherwise request a chunk size of 0.
            if isinstance(data, da.core.Array):
                return data.persist()
            return da.from_array(data,
                                 chunks=max(1, len(data) // 50)).persist()

        self._X = _to_persisted(X)
        self._Y = _to_persisted(Y)

        # Persists the Dask Storage Structures
        if client is not None and kwargs.pop("rebalance", False):
            client.rebalance(self._X)
            client.rebalance(self._Y)

        check_X_y(self._X,
                  self._Y,
                  accept_sparse='csc',
                  multi_output=True,
                  distributed=False)

        self._scenario_type = scenario_type
        if self._scenario_type is None:
            raise ValueError("required param 'scenario_type' can not be empty")
        if not issubclass(self._scenario_type, AbstractScenario):
            raise ValueError(
                "the 'scenario_type' must be a subclass of 'AbstractScenario'")

        if self_partition:
            self._kfolds = kfolds
            self._train_idx, self._test_idx, self._label_idx, self._unlabel_idx = split(
                X=self._X,
                y=self._Y,
                test_ratio=kwargs.pop("test_ratio", 0.3),
                initial_label_rate=kwargs.pop("initial_label_rate", 0.05),
                split_count=self._kfolds,
                all_class=kwargs.pop("all_class", True))
        else:
            train_idx = kwargs.pop("train_idx", None)
            test_idx = kwargs.pop("test_idx", None)
            label_idx = kwargs.pop("label_idx", None)
            unlabel_idx = kwargs.pop("unlabel_idx", None)

            if train_idx is None:
                raise ValueError(
                    "required param 'train_idx' can not be empty ")
            if test_idx is None:
                raise ValueError("required param 'test_idx' can not be empty ")
            if label_idx is None:
                raise ValueError(
                    "required param 'label_idx' can not be empty ")
            if unlabel_idx is None:
                raise ValueError(
                    "required param 'unlabel_idx' can not be empty ")

            num_inst_x, num_feat = da.shape(self._X)
            # A 1-D label vector has no second dimension; only the number of
            # instances is needed for the checks below.
            y_shape = da.shape(self._Y)
            num_inst_y = y_shape[0]
            folds_train, num_inst_train = np.shape(train_idx)
            folds_test, num_inst_test = np.shape(test_idx)
            folds_labeled, num_inst_labeled = np.shape(label_idx)
            folds_unlabeled, num_inst_unlabeled = np.shape(unlabel_idx)

            if num_inst_x != num_inst_y:
                raise ValueError(
                    "Different numbers of instances for inputs (x:%s, y:%s)" %
                    (num_inst_x, num_inst_y))

            if folds_train != folds_test or folds_test != folds_labeled or folds_labeled != folds_unlabeled:
                raise ValueError(
                    "Different numbers of folds for inputs (train_idx:%s, test_idx:%s "
                    "label_idx:%s, unlabel_idx:%s)" %
                    (folds_train, folds_test, folds_labeled, folds_unlabeled))
            if kfolds != folds_test:
                raise ValueError(
                    "Number of folds for inputs (train_idx:%s, test_idx:%s "
                    "label_idx:%s, unlabel_idx:%s) must be equals to kfolds:%s param"
                    % (folds_train, folds_test, folds_labeled, folds_unlabeled,
                       kfolds))

            if num_inst_train + num_inst_test != num_inst_x:
                raise ValueError(
                    "The sum of the number of instances for train_idx and test_idx must be equal to the "
                    "number of instances for x"
                    "(num_inst_x:%s, num_inst_train:%s num_inst_test:%s)" %
                    (num_inst_x, num_inst_train, num_inst_test))

            if num_inst_labeled + num_inst_unlabeled != num_inst_train:
                # Report all three quantities that take part in the check.
                raise ValueError(
                    "The sum of the number of instances for label_idx and unlabel_idx must be equal to the "
                    "number of instances for train_idx"
                    "(num_inst_train:%s, num_inst_labeled:%s num_inst_unlabeled:%s)"
                    %
                    (num_inst_train, num_inst_labeled, num_inst_unlabeled))

            self._kfolds = folds_train
            self._train_idx = train_idx
            self._test_idx = test_idx
            self._label_idx = label_idx
            self._unlabel_idx = unlabel_idx

        self._ml_technique = ml_technique
        if self._ml_technique is None:
            raise ValueError("required param 'ml_technique' can not be empty")

        self._performance_metrics = performance_metrics
        if self._performance_metrics is None or len(
                self._performance_metrics) == 0:
            raise ValueError(
                "required param 'performance_metrics' can not be empty")
        for metric in self._performance_metrics:
            if not isinstance(metric, BaseMetrics):
                raise ValueError(
                    "the elements in 'performance_metrics' must be of type BaseMetrics"
                )

        self._query_strategy = query_strategy
        if self._query_strategy is None:
            raise ValueError(
                "required param 'query_strategy' can not be empty")

        self._oracle = oracle
        if self._oracle is None:
            raise ValueError("required param 'oracle' can not be empty")

        self._stopping_criteria = stopping_criteria
        if self._stopping_criteria is None:
            raise ValueError(
                "required param 'stopping_criteria' can not be empty")

        # Instantiate the scenario for the first fold. The class object was
        # validated above, so it is called directly: resolving it by name
        # would require the class to be bound in this module's namespace.
        # The index collections are deep-copied so that the scenario does not
        # mutate the stored fold partition.
        self._scenario = self._scenario_type(
            X=self._X,
            y=self._Y,
            train_idx=self._train_idx[0],
            test_idx=self._test_idx[0],
            label_idx=copy.deepcopy(IndexCollection(self._label_idx[0])),
            unlabel_idx=copy.deepcopy(IndexCollection(self._unlabel_idx[0])),
            ml_technique=self._ml_technique,
            performance_metrics=self._performance_metrics,
            query_strategy=self._query_strategy,
            oracle=self._oracle,
            batch_size=batch_size)