Ejemplo n.º 1
0
    def main_loop(alibox, strategy, round):
        """Run one fold of the active-learning experiment with `strategy`.

        The fold's split is fetched from `alibox`, five randomly queried
        instances are added to the labeled set first, and then `strategy`
        is queried one instance at a time until the stopping criterion
        fires. The state saver of the fold is returned.

        Relies on `X`, `y`, `model`, `stopping_criterion` and `QueryRandom`
        from the enclosing scope.
        """
        # Data split and intermediate-results saver of this fold.
        _train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        state_io = alibox.get_stateio(round)

        def refit():
            # Train the shared model on everything labeled so far.
            labeled = label_ind.index
            model.fit(X=X[labeled, :], y=y[labeled])

        def absorb(chosen):
            # Move the queried indexes from the unlabeled to the labeled set.
            label_ind.update(chosen)
            unlab_ind.difference_update(chosen)

        # Five random warm-up queries, so that every strategy starts from the
        # same kind of initial state as QueryMeta (which needs the first five
        # rounds of selections).
        warmup_sampler = QueryRandom(X, y)
        refit()
        for _ in range(5):
            absorb(warmup_sampler.select(label_ind, unlab_ind))
            refit()

        # Accuracy after the warm-up is the initial point of the curve.
        warmup_pred = model.predict(X[test_idx, :])
        initial_accuracy = sum(warmup_pred == y[test_idx]) / len(test_idx)
        state_io.set_initial_point(initial_accuracy)

        while True:
            if stopping_criterion.is_stop():
                break

            # Ask the strategy for a single instance; the current model is
            # handed over so the strategy can use it for its evaluation.
            queried = strategy.select(label_ind, unlab_ind, model=model, batch_size=1)
            absorb(queried)

            # Retrain and measure the performance on the test split.
            refit()
            round_pred = model.predict(X[test_idx, :])
            round_accuracy = alibox.calc_performance_metric(
                y_true=y[test_idx],
                y_pred=round_pred,
                performance_metric='accuracy_score')

            # Record this query, then report progress to the stopping criterion.
            state_io.add_state(
                alibox.State(select_index=queried, performance=round_accuracy))
            stopping_criterion.update_information(state_io)

        # Reset the progress in the stopping criterion for the next run.
        stopping_criterion.reset()
        return state_io
Ejemplo n.º 2
0
                y_true=y[test_idx],
                y_pred=pred,
                performance_metric='accuracy_score')

            # Save intermediate results to file
            st = alibox.State(select_index=select_ind, performance=accuracy)
            saver.add_state(st)
            saver.save()

            # Passing the current progress to stopping criterion object
            stopping_criterion.update_information(saver)
        # Reset the progress in stopping criterion object
        stopping_criterion.reset()
        meta_result.append(copy.deepcopy(saver))

    random = QueryRandom(X, y)
    random_result = []

    for round in range(5):
        # Get the data split of one fold experiment
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        # Get intermediate results saver for one fold experiment
        saver = alibox.get_stateio(round)
        # calc the initial point
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = sum(pred == y[test_idx]) / len(test_idx)
        saver.set_initial_point(accuracy)

        while not stopping_criterion.is_stop():
            # Select a subset of Uind according to the query strategy
Ejemplo n.º 3
0
    # The cost budget is 50 times querying
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)

    # generate the first five rounds data(label_index unlabel_index model_output)
    label_index_round = []
    unlabel_index_round = []
    model_output_round = []

    for round in range(splitcount):
        label_inds_5 = []
        unlabel_inds_5 = []
        model_output_5 = []

        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        temp_rand = QueryRandom(X, y)
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        for i in range(5):
            rand_select_ind = temp_rand.select(label_ind, unlab_ind) 
            label_ind.update(rand_select_ind)
            unlab_ind.difference_update(rand_select_ind)
            label_inds_5.append(copy.deepcopy(label_ind))
            unlabel_inds_5.append(copy.deepcopy(unlab_ind))
            model.fit(X=X[label_ind.index, :], y=y[label_ind.index])  
            if hasattr(model, 'predict_proba'):
                output = (model.predict_proba(X)[:, 1] - 0.5) * 2
            else:
                output = model.predict(X)    
            model_output_5.append(output)
        
        label_index_round.append(label_inds_5)
            saver.add_state(st)
            # Passing the current progress to stopping criterion object
            stopping_criterion.update_information(saver)
        # Reset the progress in stopping criterion object
        stopping_criterion.reset()
        return saver

    random_result = []
    unc_result = []
    qbc_result = []

    for round in range(splitcount):
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

        # Use pre-defined strategy
        random = QueryRandom(X, y)
        unc = QueryInstanceUncertainty(X, y)
        qbc = QueryInstanceQBC(X, y)

        random_result.append(copy.deepcopy(main_loop(alibox, random, round)))
        unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
        qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))

    analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
    analyser.add_method(method_name='QBC', method_results=qbc_result)
    analyser.add_method(method_name='Unc', method_results=unc_result)
    analyser.add_method(method_name='random', method_results=random_result)

    plt = analyser.plot_learning_curves(title=testdataset,
                                        std_area=False,
                                        saving_path=savefloder_path +
Ejemplo n.º 5
0
    def select(self,
               label_index,
               unlabel_index,
               model=None,
               xb_way='uncertainty'):
        """Select an index from the unlabeled pool for querying.

        The selection works on the internal five-round history
        (`self.label_inds_5`, `self.unlabel_inds_5`, `self.modelOutput_5`),
        which is created through `get_5_rouds` on the first call. A reference
        instance is chosen with `xb_way`, its meta features are paired with
        the meta features of every unlabeled instance, and the instance with
        the highest positive-class probability from `self.cb_classifier` is
        selected. The history is then advanced by one round.

        Parameters
        ----------
        label_index: {list, np.ndarray, IndexCollection}
            The indexes of labeled samples.

        unlabel_index: {list, np.ndarray, IndexCollection}
            The indexes of unlabeled samples.

        model: object, optional (default=None)
            Current classification model, should have the 'predict_proba' method for probabilistic output.
            If not provided, LogisticRegression with default parameters implemented by sklearn will be used.
            Note that the model is refitted in place by this method.

        xb_way: str, optional (default='uncertainty')
            How the reference instance is chosen: 'uncertainty' uses
            QueryInstanceUncertainty, 'random' uses QueryRandom.

        Returns
        -------
        select_ind:
            The selected element of the internal unlabeled collection.

        label_ind:
            Deep copy of the newest labeled collection in the history.

        unlabel_ind:
            Deep copy of the newest unlabeled collection in the history.

        Raises
        ------
        Exception
            If `xb_way` is neither 'uncertainty' nor 'random'.
        """
        if model is None:
            model = LogisticRegression()
        if self.flag is False:
            self.get_5_rouds(label_index, unlabel_index, model)

        # Work on copies of the most recent round so that the stored history
        # is only changed through the explicit update at the end.
        label_ind = copy.deepcopy(self.label_inds_5[4])
        unlabel_ind = copy.deepcopy(self.unlabel_inds_5[4])

        # Choose the reference instance x^ that is combined with every
        # candidate x*. Strings are compared with `==`: an identity test
        # (`is`) only succeeds for interned strings and would reject an
        # equal string that was built at runtime.
        if xb_way == 'uncertainty':
            un = QueryInstanceUncertainty(self.X, self.y)
            selectedind = un.select(label_ind, unlabel_ind, model)
        elif xb_way == 'random':
            rand = QueryRandom(self.X, self.y)
            selectedind = rand.select(label_ind, unlabel_ind)[0]
        else:
            raise Exception(
                'calculating the xb at least one of [uncertrainty, random]')

        # Meta features, one row per unlabeled instance.
        metadata = self.cal_mate_data_Z(self.label_inds_5, self.unlabel_inds_5,
                                        self.modelOutput_5, model)

        # Repeat the reference row and append it column-wise to every row.
        metadata_unind = np.where(unlabel_ind == selectedind)[0][0]
        cd_second = metadata[metadata_unind]
        num_unlabeled = len(metadata)
        cd_second = np.tile(cd_second, [num_unlabeled, 1])
        combination_data = np.c_[metadata, cd_second]

        # The candidate with the highest score in column 1 is selected.
        predict_proba = self.cb_classifier.predict_proba(combination_data)
        select = np.argmax(predict_proba[:, 1])

        select_ind = unlabel_ind[select]
        label_ind.update(select_ind)
        unlabel_ind.difference_update(select_ind)
        # NOTE(review): the model is refitted on `label_index` (the caller's
        # labeled set), not on the just-updated `label_ind` - confirm whether
        # that is intended before changing it.
        model.fit(X=self.X[label_index.index, :], y=self.y[label_index.index])

        # Slide the five-round window: drop the oldest round, add the new one.
        del self.label_inds_5[0]
        del self.unlabel_inds_5[0]
        del self.modelOutput_5[0]

        self.label_inds_5.append(label_ind)
        self.unlabel_inds_5.append(unlabel_ind)
        if hasattr(model, 'predict_proba'):
            # Rescale the column-1 probability from [0, 1] to [-1, 1].
            output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
        else:
            output = model.predict(self.X)
        self.modelOutput_5.append(output)

        return select_ind, copy.deepcopy(self.label_inds_5[4]), copy.deepcopy(
            self.unlabel_inds_5[4])
Ejemplo n.º 6
0
    def get_5_rouds(self,
                    label_ind,
                    unlabel_ind,
                    Model,
                    querystategy='random'):
        """Build the five-round history the meta strategy starts from.

        Five query rounds are run on deep copies of the given index
        collections and of `Model`. After every round the labeled
        collection, the unlabeled collection and the model output on
        `self.X` are appended to `self.label_inds_5`,
        `self.unlabel_inds_5` and `self.modelOutput_5`. Finally
        `self.flag` is set to True.

        Parameters
        ----------
        label_ind: IndexCollection
            The indexes of labeled samples. Not modified.

        unlabel_ind: IndexCollection
            The indexes of unlabeled samples. Not modified.

        Model: object
            Classification model. A deep copy is fitted, the passed object
            is left untouched. If the model has a 'predict_proba' method its
            column-1 probability rescaled to [-1, 1] is recorded, otherwise
            the output of 'predict'.

        querystategy: {'random', 'uncertainty', None}, default='random'
            Query strategy used for the five rounds. With None, no queries
            are made and zero-filled placeholders are recorded instead.

        Raises
        ------
        ValueError
            If `querystategy` is not one of the supported values.
        """
        # Reject an unsupported strategy before anything is recorded.
        # Otherwise the history would stay empty while `self.flag` is set,
        # and the failure would only show up later as an IndexError.
        if querystategy not in ('uncertainty', 'random', None):
            raise ValueError(
                "querystategy must be 'uncertainty', 'random' or None, "
                "got {!r}".format(querystategy))

        assert (isinstance(label_ind, IndexCollection))
        assert (isinstance(unlabel_ind, IndexCollection))
        label_index = copy.deepcopy(label_ind)
        unlabel_index = copy.deepcopy(unlabel_ind)
        model = copy.deepcopy(Model)

        if querystategy is None:
            # Placeholder history: zero arrays of the matching lengths.
            num_label = len(label_index.index)
            num_unlabel = len(unlabel_index.index)
            n_samples = np.shape(self.X)[0]
            for _ in range(5):
                self.label_inds_5.append(np.zeros(num_label))
                self.unlabel_inds_5.append(np.zeros(num_unlabel))
                self.modelOutput_5.append(np.zeros(n_samples))
        else:
            use_uncertainty = querystategy == 'uncertainty'
            if use_uncertainty:
                sampler = QueryInstanceUncertainty(self.X, self.y)
            else:
                sampler = QueryRandom(self.X, self.y)

            for _ in range(5):
                if use_uncertainty:
                    select_ind = sampler.select(label_index, unlabel_index,
                                                model=model)
                else:
                    select_ind = sampler.select(label_index, unlabel_index)
                label_index.update(select_ind)
                unlabel_index.difference_update(select_ind)
                self.label_inds_5.append(copy.deepcopy(label_index))
                self.unlabel_inds_5.append(copy.deepcopy(unlabel_index))
                model.fit(X=self.X[label_index.index, :],
                          y=self.y[label_index.index])

                # Both strategies record the same kind of output, which is
                # also the form `select` appends for later rounds.
                if hasattr(model, 'predict_proba'):
                    output = (model.predict_proba(self.X)[:, 1] - 0.5) * 2
                else:
                    output = model.predict(self.X)
                self.modelOutput_5.append(output)

        self.flag = True
Ejemplo n.º 7
0
    def set_query_strategy(self,
                           strategy="QueryInstanceUncertainty",
                           **kwargs):
        """
            Set the query strategy of the experiment.

        Parameters
        ----------
        strategy: {str, callable}, optional (default='QueryInstanceUncertainty')
            The query strategy function.
            Giving str to use a pre-defined strategy.
            Giving callable to use a user-defined strategy.

        kwargs: dict, optional
            The args used in strategy.
            If kwargs is None,the pre-defined query strategy will init in default way.
            (See the default way of pre-defined query strategy in the alipy/query_strategy/'query_strategy' and 'sota_strategy').
            If strategy is a user-define strategy,the parameters accord with definition of function parameter.
            For a user-defined strategy the optional key 'strategyname' is
            removed from kwargs and used as the displayed strategy name.
            For 'QueryInstanceBMDR' and 'QueryInstanceSPAL' the optional key
            'qp_solver' (default 'ECOS') is stored in `self.qp_solver` and is
            not passed to the strategy constructor.

            Note that, each parameters should be static.
            The parameters will be fed to the callable object automatically.

        Raises
        ------
        Exception
            If `self._existed_query_strategy` is already set.
        NotImplementedError
            If `strategy` is a str that names no pre-defined strategy.
        ValueError
            If 'QueryInstanceGraphDensity' or 'QueryInstanceQUIRE' is
            requested while `self._train_idx` is None.
        """
        # check
        if self._existed_query_strategy:
            raise Exception(
                "You already has set the query strategy,don`t has to set it again."
            )
        # user-defined strategy
        if callable(strategy):
            self.__custom_strategy_flag = True
            strategyname = kwargs.pop('strategyname', None)
            if strategyname is not None:
                self._query_function_name = strategyname
            else:
                self._query_function_name = 'user-defined strategy'
            self.__custom_func_arg = kwargs
            self._query_function = strategy(self._X, self._y, **kwargs)
        else:
            # a pre-defined strategy in ALiPy
            if strategy not in ['QueryInstanceQBC', 'QueryInstanceUncertainty', 'QueryRandom', \
                                'QureyExpectedErrorReduction', 'QueryInstanceGraphDensity', 'QueryInstanceQUIRE', \
                                'QueryInstanceBMDR', 'QueryInstanceSPAL', 'QueryInstanceLAL']:
                raise NotImplementedError(
                    'Strategy {} is not implemented. Specify a valid '
                    'method name or privide a callable object.'.format(
                        str(strategy)))
            else:
                self._query_function_name = strategy
                if strategy == 'QueryInstanceQBC':
                    method = kwargs.pop('method', 'query_by_bagging')
                    disagreement = kwargs.pop('disagreement', 'vote_entropy')
                    self._query_function = QueryInstanceQBC(
                        self._X, self._y, method, disagreement)
                elif strategy == 'QueryInstanceUncertainty':
                    measure = kwargs.pop('measure', 'entropy')
                    self._query_function = QueryInstanceUncertainty(
                        self._X, self._y, measure)
                elif strategy == 'QueryRandom':
                    self._query_function = QueryRandom(self._X, self._y)
                elif strategy == 'QureyExpectedErrorReduction':
                    self._query_function = QureyExpectedErrorReduction(
                        self._X, self._y)
                elif strategy == 'QueryInstanceGraphDensity' or strategy == 'QueryInstanceQUIRE':
                    # No strategy object is built in this branch; only the
                    # settings are stored on the instance.
                    if self._train_idx is None:
                        raise ValueError(
                            'train_idx is None.Please split data firstly.You can call set_data_split or split_AL to split data.'
                        )
                    self._query_function_need_train_ind = True
                    self._query_function_metric = kwargs.pop(
                        'metric', 'manhattan')
                    self._query_function_kwargs = kwargs
                elif strategy == 'QueryInstanceBMDR':
                    beta = kwargs.pop('beta', 1000)
                    gamma = kwargs.pop('gamma', 0.1)
                    rho = kwargs.pop('rho', 1)
                    # The solver option must leave kwargs before they are
                    # forwarded, otherwise a caller-supplied value never
                    # reaches self.qp_solver. The misspelled key 'qp_sover'
                    # is still accepted for backward compatibility.
                    qp_solver = kwargs.pop('qp_solver',
                                           kwargs.pop('qp_sover', 'ECOS'))
                    self._query_function = QueryInstanceBMDR(
                        self._X, self._y, beta, gamma, rho, **kwargs)
                    self.qp_solver = qp_solver
                elif strategy == 'QueryInstanceSPAL':
                    mu = kwargs.pop('mu', 0.1)
                    gamma = kwargs.pop('gamma', 0.1)
                    rho = kwargs.pop('rho', 1)
                    lambda_init = kwargs.pop('lambda_init', 0.1)
                    lambda_pace = kwargs.pop('lambda_pace', 0.01)
                    # Same handling of the solver option as for BMDR.
                    qp_solver = kwargs.pop('qp_solver',
                                           kwargs.pop('qp_sover', 'ECOS'))
                    self._query_function = QueryInstanceSPAL(
                        self._X, self._y, mu, gamma, rho, lambda_init,
                        lambda_pace, **kwargs)
                    self.qp_solver = qp_solver
                elif strategy == 'QueryInstanceLAL':
                    mode = kwargs.pop('mode', 'LAL_iterative')
                    data_path = kwargs.pop('data_path', '.')
                    cls_est = kwargs.pop('cls_est', 50)
                    train_slt = kwargs.pop('train_slt', True)
                    self._query_function = QueryInstanceLAL(
                        self._X, self._y, mode, data_path, cls_est, train_slt,
                        **kwargs)