Beispiel #1
0
def main_loop(alibox, round, strategy):
    """Run one fold of the multi-label active learning experiment.

    The module-level ``model`` is first fitted on the initially labeled
    entries and then updated incrementally with every new query.  After
    each query the micro-averaged F1 score on the test split is stored,
    together with the query and its cost, in the fold's saver.

    Parameters
    ----------
    alibox:
        Toolbox object providing ``get_split``, ``get_stateio`` and
        ``State``.
    round:
        Fold identifier forwarded to ``get_split`` and ``get_stateio``.
    strategy:
        Query strategy exposing ``select(label_ind, unlab_ind)``.

    Returns
    -------
    A deep copy of the saver holding one state per performed query.
    """
    _, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Saver for the intermediate results of this fold.
    saver = alibox.get_stateio(round)

    # Initial fit on everything that is labeled before querying starts.
    init_X, init_y, _ = get_Xy_in_multilabel(
        label_ind, X=X, y=mult_y, unknown_element=0)
    model.fit(X=init_X, y=init_y)

    # AUDI accepts an already trained label ranking model, which saves it
    # from re-training one internally; other strategies get no extra args.
    if isinstance(strategy, QueryMultiLabelAUDI):
        select_kwargs = {'model': model}
    else:
        select_kwargs = {}

    # Query budget: keep going until more than 120 entries were added on
    # top of the initial labeled set.
    size_limit = len(label_ind) + 120
    while len(label_ind) <= size_limit:
        queried = strategy.select(label_ind, unlab_ind, **select_kwargs)

        # Cost records the amount of queried instance-label pairs: when the
        # first selected index has a single element the cost is the number
        # of columns of mult_y, otherwise the number of selected entries.
        cost = mult_y.shape[1] if len(queried[0]) == 1 else len(queried)

        label_ind.update(queried)
        unlab_ind.difference_update(queried)

        # Incremental update using only the newly queried entries.
        new_X, new_y, _ = get_Xy_in_multilabel(
            queried, X=X, y=mult_y, unknown_element=0)
        model.fit(X=new_X, y=new_y, is_incremental=True)

        _, pred = model.predict(X[test_idx])
        # Map -1 predictions to 0 before computing micro-F1 with sklearn.
        pred[pred == -1] = 0
        micro_f1 = f1_score(
            y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')

        # Record this query and persist the saver.
        state = alibox.State(
            select_index=queried, performance=micro_f1, cost=cost)
        saver.add_state(state)
        saver.save()

    return copy.deepcopy(saver)
Beispiel #2
0
def main_loop(alibox, round, strategy):
    """Run one fold of the multi-label active learning experiment.

    A fresh ``LabelRankingModel`` is re-fitted on all labeled entries after
    every query, and the hamming loss on the test split is stored, together
    with the query and its cost, in the fold's saver.

    Parameters
    ----------
    alibox:
        Toolbox object providing ``get_split``, ``get_stateio``,
        ``calc_performance_metric`` and ``State``.
    round:
        Fold identifier forwarded to ``get_split`` and ``get_stateio``.
    strategy:
        Query strategy exposing ``select(label_ind, unlab_ind)``.

    Returns
    -------
    A deep copy of the saver holding one state per performed query.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # base model (a new one for every fold)
    model = LabelRankingModel()

    # NOTE(review): the budget is checked against the absolute size of
    # label_ind, not against the number of entries queried so far. If the
    # initial labeled set already holds more than 120 entries the loop body
    # never runs -- confirm this is intended.
    while len(label_ind) <= 120:
        # query and update
        select_labs = strategy.select(label_ind, unlab_ind)
        # Cost records the amount of queried instance-label pairs: when the
        # first selected index has a single element the cost is the number
        # of columns of mult_y, otherwise the number of selected entries.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # train on everything labeled so far, then evaluate on the test split
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
        model.fit(X=X_tr, y=y_tr)
        _, pred = model.predict(X[test_idx])
        perf = alibox.calc_performance_metric(
            y_true=mult_y[test_idx],
            y_pred=pred,
            performance_metric='hamming_loss')

        # save
        st = alibox.State(select_index=select_labs,
                          performance=perf,
                          cost=cost)
        saver.add_state(st)

    return copy.deepcopy(saver)
Beispiel #3
0
def main_loop(alibox, strategy, round):
    """Run one fold of the cost-aware multi-label experiment.

    Queries are issued until the module-level ``stopping_criterion``
    reports that it should stop.  After every query the module-level
    ``model`` is re-fitted on all labeled entries and its hamming loss on
    the test split is stored in the fold's saver.

    Parameters
    ----------
    alibox:
        Toolbox object providing ``get_split``, ``get_stateio``,
        ``calc_performance_metric`` and ``State``.
    strategy:
        Query strategy whose ``select`` accepts ``cost`` and ``budget``
        keyword arguments.
    round:
        Fold identifier forwarded to ``get_split`` and ``get_stateio``.

    Returns
    -------
    The saver of this fold (not copied).
    """
    # Data split and intermediate results saver of this fold.
    _, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    saver = alibox.get_stateio(round)

    while True:
        if stopping_criterion.is_stop():
            break

        # Let the strategy pick from the unlabeled pool, then pass the pick
        # through hierarchical_multilabel_mark with the label tree.
        picked = strategy.select(
            label_ind, unlab_ind, cost=cost, budget=budget)
        picked = hierarchical_multilabel_mark(
            picked, label_ind, label_tree, y)

        label_ind.update(picked)
        unlab_ind.difference_update(picked)

        # Re-fit on everything labeled so far and predict the test split.
        train_X, train_y, _ = get_Xy_in_multilabel(label_ind, X=X, y=y)
        model.fit(train_X, train_y)
        test_pred = model.predict(X[test_idx, :])
        # Zero predictions are overwritten with 1 before scoring.
        test_pred[test_pred == 0] = 1

        hamming = alibox.calc_performance_metric(
            y_true=y[test_idx],
            y_pred=test_pred,
            performance_metric='hamming_loss')

        # Store the intermediate result and report the progress to the
        # stopping criterion.
        state = alibox.State(
            select_index=picked.index, performance=hamming, cost=budget)
        saver.add_state(state)
        stopping_criterion.update_information(saver)

    # Reset the progress kept inside the stopping criterion object.
    stopping_criterion.reset()
    return saver
Beispiel #4
0
            query_y[select_ins, select_y2] = -1
        elif y1 >= y2:
            query_y[select_ins, select_y1] = 1
            query_y[select_ins, select_y2] = 0.5
        else:
            query_y[select_ins, select_y1] = 0.5
            query_y[select_ins, select_y2] = 1

        # record results
        label_ind.update([(select_ins, select_y1), (select_ins, select_y2)])
        unlab_ind.difference_update([(select_ins, select_y1),
                                     (select_ins, select_y2)])

        if iter % 5 == 0:
            # train/test
            X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=query_y)
            model.fit(X=X_tr, y=y_tr)
            pres, pred = model.predict(X[test_idx])

            perf = alibox.calc_performance_metric(
                y_true=mult_y[test_idx],
                y_pred=pred,
                performance_metric='hamming_loss')

            # save
            st = alibox.State(select_index=[(select_ins, select_y1),
                                            (select_ins, select_y2)],
                              performance=perf)
            saver.add_state(st)

    AURO_results.append(copy.copy(saver))
Beispiel #5
0
    def select(self, label_index, unlabel_index, epsilon=0.5, **kwargs):
        """Select a subset from the unlabeled set, return the selected instance and label.

        The instance is chosen first (by LCI, using the predictions of the
        internal label ranking model), then the label of that instance whose
        predicted value is closest to the one of the dummy label.

        Parameters
        ----------
        label_index: {list, np.ndarray, MultiLabelIndexCollection}
            The indexes of labeled samples. It should be a 1d array of indexes (column major, start from 0) or
            MultiLabelIndexCollection or a list of tuples with 2 elements, in which,
            the 1st element is the index of instance and the 2nd element is the index of labels.

        unlabel_index: {list, np.ndarray, MultiLabelIndexCollection}
            The indexes of unlabeled samples. It should be a 1d array of indexes (column major, start from 0) or
            MultiLabelIndexCollection or a list of tuples with 2 elements, in which,
            the 1st element is the index of instance and the 2nd element is the index of labels.

        epsilon: float, optional (default=0.5)
            The threshold to avoid zero-division.

        Returns
        -------
        selected_ins_lab_pair: list
            A list of tuples that contains the indexes of selected instance-label pairs.
            When `unlabel_index` holds at most one element it is returned
            unchanged, without any validation or conversion.
        """
        if len(unlabel_index) <= 1:
            return unlabel_index
        unlabel_index = self._check_multi_label_ind(unlabel_index)
        label_index = self._check_multi_label_ind(label_index)

        # select instance by LCI
        # W is the mask built from the unlabeled index (entries filled with 1).
        W = unlabel_index.get_matrix_mask(mat_shape=self.y.shape,
                                          fill_value=1,
                                          sparse=False)
        unlab_data, _, data_ind = get_Xy_in_multilabel(index=unlabel_index,
                                                       X=self.X,
                                                       y=self.y)
        lab_data, lab_lab, _ = get_Xy_in_multilabel(index=label_index,
                                                    X=self.X,
                                                    y=self.y)
        self._lr_model.fit(lab_data, lab_lab)
        pres, labels = self._lr_model.predict(unlab_data)
        # Average number of entries equal to 1 per fully labeled instance.
        avgP = np.mean(
            np.sum(self.y[label_index.get_unbroken_instances(), :] == 1,
                   axis=1))
        # Negated absolute gap between the predicted count of 1s and avgP,
        # scaled by the number of unlabeled entries of the instance
        # (floored at epsilon to avoid dividing by zero).
        insvals = -np.abs(
            (np.sum(labels == 1, axis=1) - avgP) /
            np.fmax(np.sum(W[data_ind, :] == 1, axis=1), epsilon))
        selected_ins = np.argmin(insvals)

        # last line in pres is the predict value of dummy label
        # select label by calculating the distance between each label with dummy label

        # Set the known entries to -inf so that their distance to the dummy
        # label becomes +inf and argmin below does not pick them.
        # (-np.inf instead of np.NINF: the alias was removed in NumPy 2.0.)
        pres_mask = np.asarray(1 - W[data_ind], dtype=bool)
        pres_tmp = pres[:, 0:-1]
        pres_tmp[pres_mask] = -np.inf
        pres[:, 0:-1] = pres_tmp

        dis = np.abs(pres[selected_ins, 0:-1] - pres[selected_ins, -1])
        # Map the row position back to the instance index in the data set.
        selected_ins = data_ind[selected_ins]
        selected_lab = np.argmin(dis)

        return [(selected_ins, selected_lab)]
Beispiel #6
0
# Experiment setup: build the toolbox from X and mult_y with the
# 'PartLabels' query type.
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
# Generate the data splits (test_ratio=0.2, initial_label_rate=0.05,
# all_class=False); the per-fold splits are fetched later via get_split().
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
model = LabelRankingModel()  # base model

# query type strategy
# Result container for the AURO runs -- presumably one saver per fold is
# appended by the loop below; verify against the rest of the script.
AURO_results = []

for round in range(5):

    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    query_y = mult_y.copy()  # for labeling `less relevant`
    AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
    # init model
    X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
    model.fit(X=X_tr, y=y_tr)

    for iter in range(100):

        select_ins, select_y1, select_y2 = AURO_strategy.select(label_ind,
                                                                unlab_ind,
                                                                model=model,
                                                                y_mat=query_y)

        # relevance
        y1 = mult_y[select_ins, select_y1]
        y2 = mult_y[select_ins, select_y2]
        if y1 < 0 and y2 < 0:
            query_y[select_ins, select_y1] = -1
            query_y[select_ins, select_y2] = -1