# Passing model=None to use the default model for evaluating the committees' disagreement
        select_ind = QBCStrategy.select(label_ind,
                                        unlab_ind,
                                        model=None,
                                        batch_size=1)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Update model and calc performance according to the model you are using
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        pred = model.predict(X[test_idx, :])
        accuracy = alibox.calc_performance_metric(
            y_true=y[test_idx],
            y_pred=pred,
            performance_metric='accuracy_score')

        # Save intermediate results to file
        st = alibox.State(select_index=select_ind, performance=accuracy)
        saver.add_state(st)
        saver.save()

        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object
    stopping_criterion.reset()
    QBC_result.append(copy.deepcopy(saver))

# Collect the per-fold results gathered above (QBC_result) in an experiment
# analyser whose x axis is the number of queries.
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
# Register the results under the display name 'QBC'.
analyser.add_method(method_name='QBC', method_results=QBC_result)
# Print the analyser's textual summary.
print(analyser)
# Draw the learning curves; std_area=True presumably shades the standard
# deviation across folds -- confirm against the alipy documentation.
analyser.plot_learning_curves(title='Example of AL', std_area=True)
예제 #2
0
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one active-learning experiment and return its recorded states.

    The train/test/labeled/unlabeled split is not computed here: it is read
    from the pickle file ``dataset_al`` in the current working directory so
    that every strategy is evaluated on the same global split.

    Args:
        strategy_name: Name of the alipy query strategy, forwarded to
            ``ToolBox.get_query_strategy``.
        data: Feature table; only ``data.values`` is used.
        labels: Target values; converted with ``np.asarray``.
        queries: Value of the ``'num_of_queries'`` stopping criterion, i.e.
            how many query rounds are performed.

    Returns:
        A one-element list holding a deep copy of the ``StateIO`` saver that
        recorded the initial accuracy and one state per query.
    """
    # Keep only the raw values; rows are addressed through the global split.
    X = data.values
    y = np.asarray(labels)
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

    # Default model supplied by the toolbox.
    model = toolbox.get_default_model()
    query_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)

    # Stop after `queries` query rounds.
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries',
                                                        queries)

    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file; "dataset_al" must come from a trusted source.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    # Saver that records the intermediate results of this run.
    saver = StateIO(round=0, train_idx=train_idx,
                    test_idx=test_idx, init_L=labeled_idx,
                    init_U=unlabeled_idx, saving_path='.')

    # Initial performance point: train on the initially labeled examples.
    model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
    y_pred = model.predict(X[test_idx, :])
    accuracy = toolbox.calc_performance_metric(
        y_true=y[test_idx], y_pred=y_pred,
        performance_metric='accuracy_score')
    saver.set_initial_point(accuracy)

    while not stopping_criterion.is_stop():
        # Query one example from the unlabeled pool and move it to the
        # labeled set.
        example = query_strategy.select(labeled_idx, unlabeled_idx,
                                        model=model, batch_size=1)
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        # Retrain on the enlarged labeled set and score on the test split.
        model.fit(X=X[labeled_idx.index, :], y=y[labeled_idx.index])
        y_pred = model.predict(X[test_idx, :])
        accuracy = toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

        # Record this query and persist the progress.
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()

        # Let the stopping criterion see the updated progress.
        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    examples = [copy.deepcopy(saver)]

    # Final labeled dataset, kept so the optional dump below can be enabled.
    # Rows are taken through `.index`, like every other lookup in this
    # function, and the target is flattened without assuming `y` is 2-D
    # (the previous `y[labeled_idx, :]` fails for 1-D labels).
    X_train = X[labeled_idx.index, :]
    y_train = np.asarray(y[labeled_idx.index]).reshape(-1)

    # Uncomment to save the new active-learning dataset:
    # with open('qbc_dataset','wb') as f:
    #     pickle.dump((X_train, y_train), f)

    return examples
예제 #3
0
            query_y[select_ins, select_y1] = 0.5
            query_y[select_ins, select_y2] = 1

        # record results
        label_ind.update([(select_ins, select_y1), (select_ins, select_y2)])
        unlab_ind.difference_update([(select_ins, select_y1),
                                     (select_ins, select_y2)])

        if iter % 5 == 0:
            # train/test
            X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=query_y)
            model.fit(X=X_tr, y=y_tr)
            pres, pred = model.predict(X[test_idx])

            perf = alibox.calc_performance_metric(
                y_true=mult_y[test_idx],
                y_pred=pred,
                performance_metric='hamming_loss')

            # save
            st = alibox.State(select_index=[(select_ins, select_y1),
                                            (select_ins, select_y2)],
                              performance=perf)
            saver.add_state(st)

    AURO_results.append(copy.copy(saver))

# Build an experiment analyser with the library's default settings.
analyser = alibox.get_experiment_analyser()
# Register the saved per-fold results under the display name 'AURO'.
analyser.add_method(method_name='AURO', method_results=AURO_results)
# Plot the learning curves of the registered method.
analyser.plot_learning_curves()
예제 #4
0
from sklearn.datasets import load_iris
from alipy import ToolBox

# Quick tour of the objects a ToolBox can hand out, using the iris data.
X, y = load_iris(return_X_y=True)
# The ToolBox ties the data to a query type and a directory for saved files.
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# --- Tools obtained from the ToolBox ---
# Create the data split with default settings; four index containers come
# back (train, test, labeled, unlabeled).
tr, te, lab, unlab = alibox.split_AL()
# Fetch the split of a single fold (round 0). Must follow split_AL().
tr0, te0, lab0, unlab0 = alibox.get_split(round=0)
# Oracle used to answer label queries
# (presumably backed by the true labels y -- confirm in the alipy docs).
oracle = alibox.get_clean_oracle()
# Saver for the intermediate results of round 0.
saver = alibox.get_stateio(round=0)
# Repository object for round 0.
repo = alibox.get_repository(round=0)
# Query strategy looked up by its name.
rand_strategy = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
# Performance metric computed by name on a toy prediction.
perf = alibox.calc_performance_metric(y_true=[1], y_pred=[1], performance_metric='accuracy_score')
# Default model provided by the ToolBox.
model = alibox.get_default_model()
# Stopping criterion: 'num_of_queries' with value 50.
sc = alibox.get_stopping_criterion(stopping_criteria='num_of_queries', value=50)
# Analyser for experiment results, with the number of queries as x axis.
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
# Threading helper object (see the alipy documentation for its use).
acethread = alibox.get_ace_threading()

# --- Data structures defined in alipy ---
# Collection of instance indexes.
ind = alibox.IndexCollection([1, 2, 3])
# Collection of multi-label indexes given as tuples
# (presumably (instance, label) pairs, a 1-tuple naming an instance only --
# confirm in the alipy docs).
m_ind = alibox.MultiLabelIndexCollection([(1, 0), (2, )])
# State object recording one query: the selected index and the performance.
st = alibox.State(select_index=[1], performance=perf)

# --- I/O ---
# Persist the ToolBox settings under the configured saving path.
alibox.save()
# al_settings.pkl is the default file name. To use another name, pass a specific file name
# to the 'saving_path' parameter when initializing the ToolBox object (e.g., saving_path='./my_file.pkl').
alibox = ToolBox.load(path='./al_settings.pkl')
예제 #5
0
class TorchFold:
    """Single-fold active-learning experiment for a torch classifier.

    The pool (``dataset``/``labels``) is split by an alipy ``ToolBox`` into
    an initially labeled part (``initial_label_rate=0.05``) and an unlabeled
    part. ``train`` then repeatedly queries new samples, retrains the network
    and scores it on the separately supplied ``testset``/``testlab``. Each
    metric is accumulated in a list attribute (``acc``, ``auc``, ``f1``,
    ``gmeans``, ...) with one entry for the initial model and one per query
    iteration.

    NOTE(review): the metric code reads a 2x2 confusion matrix and counts
    labels equal to 0 and 1, so labels are presumably binary {0, 1} --
    confirm against the callers.
    """

    # Values of ``phase`` understood by ``train``.
    _PHASES = ('active', 'passive')

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping):
        """Store the data, build the alipy tooling and empty metric lists.

        Args:
            dataset: Pool features; indexed with lists of row indexes.
            labels: Pool labels, aligned with ``dataset``.
            testset: Features of the held-out test set.
            testlab: Labels of the held-out test set.
            model: Torch model handed to ``NN.NeuralNetwork`` and saved with
                ``torch.save`` after every iteration.
            phase: ``'active'`` (uncertainty queries mixed with random ones)
                or ``'passive'`` (random queries only).
            path: Directory name, used as ``'./<path>'`` for all outputs.
            stopping: Pair ``(criterion_name, value)`` forwarded to
                ``ToolBox.get_stopping_criterion``.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        self.alibox = ToolBox(X=dataset,
                              y=labels,
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        # test_ratio=0: the test set is supplied separately by the caller.
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.query_strategy = QueryInstanceUncertainty(
            X=dataset, y=labels, measure='least_confident')
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.acc = []
        self.gmeans = []
        self.recall = []
        self.precision = []
        self.specificity = []
        self.auc = []
        self.f1 = []
        self.pos = []
        self.neg = []
        self.ratio = []
        self.loss = []
        self.mcc = []
        self.path = path

    def _record_metrics(self, pred, label_ind):
        """Score ``pred`` on the test set and append every tracked metric.

        Args:
            pred: Predictions for ``self.testset``.
            label_ind: Index collection of the currently labeled samples,
                used for the class-balance statistics.

        Returns:
            Tuple ``(gmeans, conf_mat)``: the geometric mean of recall and
            specificity, and the confusion matrix it was derived from.
        """
        conf_mat = confusion_matrix(y_true=self.testlab, y_pred=pred)
        # sklearn layout: rows are true classes, columns predicted classes.
        tn, fp = conf_mat[0, 0], conf_mat[0, 1]
        fn, tp = conf_mat[1, 0], conf_mat[1, 1]

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        gmeans = sqrt(recall * specificity)
        f1 = metrics.f1_score(y_true=self.testlab, y_pred=pred)
        auc = metrics.roc_auc_score(y_true=self.testlab, y_score=pred)
        accuracy = self.alibox.calc_performance_metric(
            y_true=self.testlab,
            y_pred=pred.reshape(list(self.testlab.shape)),
            performance_metric='accuracy_score')
        mcc = ((tn * tp) - (fn * fp)) / sqrt(
            (tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))

        self.auc.append(auc)
        self.acc.append(accuracy)
        self.f1.append(f1)
        self.gmeans.append(gmeans)
        self.recall.append(recall)
        self.precision.append(precision)
        self.specificity.append(specificity)
        self.mcc.append(mcc)

        # Class balance of the labeled set.
        lab = list(self.labels[label_ind.index])
        positives, negatives = lab.count(1), lab.count(0)
        self.pos.append(positives)
        self.neg.append(negatives)
        # Without any positive sample the ratio is undefined; record inf
        # instead of aborting the run with ZeroDivisionError.
        self.ratio.append(
            negatives / positives if positives else float('inf'))

        return gmeans, conf_mat

    def _select_batch(self, net, label_ind, unlab_ind, total, lab_init,
                      iteration):
        """Choose the next samples to label for the current iteration.

        While fewer than 30 % of the pool is labeled the batch holds 40 % of
        the initial labeled-set size; afterwards 40 % of the current
        labeled-set size. In the ``'active'`` phase the uncertainty strategy
        is used, except that during the first stage every 10th iteration
        queries at random. In the ``'passive'`` phase all queries are random.
        """
        early_stage = len(label_ind) < total * 0.3
        if early_stage:
            batch_size = int(lab_init * 0.4)
        else:
            batch_size = int(len(label_ind) * 0.4)

        if self.phase == 'passive':
            return self.random.select(label_ind,
                                      unlab_ind,
                                      batch_size=batch_size)

        # Active phase: predict on the unlabeled pool first.
        net.predict(self.dataset[unlab_ind.index])
        prob_pred = net.probablistic_matrix()
        if early_stage and iteration % 10 == 0:
            return self.random.select(label_ind,
                                      unlab_ind,
                                      batch_size=batch_size)
        return self.query_strategy.select_by_prediction_mat(
            unlabel_index=unlab_ind,
            predict=prob_pred,
            batch_size=batch_size)

    def train(self):
        """Run the query/retrain loop and persist metrics and models.

        Outputs are written below ``./<path>/<fold>/``: one torch model per
        iteration (``model<iteration>``) and one joblib dump per metric list.
        The result analyser is stored in ``self.analyser`` and printed.

        Raises:
            ValueError: If ``self.phase`` is neither ``'active'`` nor
                ``'passive'``. Previously this surfaced as a NameError on
                the first query, because no batch was ever selected.
        """
        if self.phase not in self._PHASES:
            raise ValueError('phase must be one of %r, got %r' %
                             (self._PHASES, self.phase))

        for fold in range(1):
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-results saver of this fold.
            train_idx, test_idx, label_ind, unlab_ind = self.alibox.get_split(
                fold)
            saver = self.alibox.get_stateio(fold)

            # Train on the initially labeled samples.
            net = NN.NeuralNetwork(model=self.model,
                                   num_classes=2,
                                   batch_size=500,
                                   device_ids=[0],
                                   epochs=50)
            net.lr_fc = 0.0001
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            # Initial performance point.
            net.predict(self.testset)
            gmeans, _ = self._record_metrics(net.preds, label_ind)
            total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(gmeans)

            iteration = 0
            while not self.stopping_criterion.is_stop():
                iteration += 1

                # Query new samples and move them to the labeled set.
                select_ind = self._select_batch(net, label_ind, unlab_ind,
                                                total, lab_init, iteration)
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain on the enlarged labeled set and re-evaluate.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                gmeans, conf_mat = self._record_metrics(net.preds, label_ind)
                self.loss.append(loss)

                # Save the results of this iteration.
                st = self.alibox.State(select_ind, gmeans)
                saver.add_state(st)
                saver.save()

                self.stopping_criterion.update_information(saver)
                lab = list(self.labels[label_ind.index])
                print('\n class \n0 and 1\n', lab.count(0), lab.count(1))
                print('\n', conf_mat)
                torch.save(self.model,
                           './%s/%d/model%d' % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))

            # Dump each metric history to a file named after the metric.
            for name in ('auc', 'acc', 'f1', 'gmeans', 'recall', 'precision',
                         'specificity', 'pos', 'neg', 'ratio', 'mcc'):
                joblib.dump(getattr(self, name),
                            './%s/%d/%s' % (self.path, fold, name))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(method_name='QueryInstanceUncertaity-lc',
                                 method_results=self.unc_result)
        print(self.analyser)
예제 #6
0
class TorchRegressionFold:
    def __init__(self,
                 dataset,
                 labels,
                 testset,
                 testlab,
                 model,
                 phase,
                 path,
                 stopping,
                 measure='nearest_neighbor',
                 distance='linear'):
        """Store the data, build the alipy tooling and empty metric lists.

        Args:
            dataset: Pool features; indexed with lists of row indexes.
            labels: Pool regression targets, aligned with ``dataset``.
            testset: Features of the held-out test set.
            testlab: Targets of the held-out test set.
            model: Torch model later handed to the network wrapper.
            phase: Query mode read by ``train`` (``'active'`` or
                ``'passive'``).
            path: Directory name, used as ``'./<path>'`` for all outputs.
            stopping: Pair ``(criterion_name, value)`` forwarded to
                ``ToolBox.get_stopping_criterion``.
            measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
                any other value is passed to ``QueryInstanceDistribution``.
            distance: Distance option, used only by the residue strategy.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        # The ToolBox receives all-zero integer placeholder labels of the
        # same length; the real targets stay in self.labels. The builtin
        # `int` replaces the `np.int` alias, which NumPy removed in 1.24.
        self.alibox = ToolBox(X=dataset,
                              y=np.zeros(len(labels), dtype=int),
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        # test_ratio=0: the test set is supplied separately by the caller.
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.measure = measure
        if measure == 'residue':
            self.query_strategy = QueryInstanceResidueRegressor(
                X=self.dataset, y=self.labels, distance=distance)
        else:
            self.query_strategy = QueryInstanceDistribution(measure=measure)
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.loss = []
        self.path = path
        # Rolling history of the last six pool predictions, oldest (`one`)
        # to newest (`six`); maintained by _update_previous_prediction.
        self.one = self.two = self.three = self.four = self.five = self.six = None
        self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
        self.sample = []

    def train(self):
        """Run the query/retrain loop for the regression experiment.

        For each fold the network is trained on the initially labeled
        samples, then new samples are queried, the network is retrained and
        test metrics (MSE, MAE, max error, explained variance, R2) are
        appended to the matching list attributes. Models, metric histories,
        the test targets and the last test prediction are written below
        ``./<path>/<fold>/``. The analyser is stored in ``self.analyser``
        and printed.

        Raises:
            ValueError: If ``self.phase`` is neither ``'active'`` nor
                ``'passive'``. Previously this surfaced as a NameError on
                the first query, because no batch was ever selected.
        """
        # Imported under their own names so the builtin `max` stays usable.
        from sklearn.metrics import (explained_variance_score, max_error,
                                     mean_absolute_error, mean_squared_error,
                                     r2_score)

        if self.phase not in ('active', 'passive'):
            raise ValueError("phase must be 'active' or 'passive', got %r" %
                             (self.phase, ))

        def record_metrics(pred, n_labeled):
            """Append all test metrics for ``pred``; return the MSE."""
            mse_value = mean_squared_error(self.testlab, pred)
            self.mse.append(mse_value)
            self.mae.append(mean_absolute_error(self.testlab, pred))
            self.max.append(max_error(self.testlab, pred))
            self.evs.append(explained_variance_score(self.testlab, pred))
            self.r2.append(r2_score(self.testlab, pred))
            self.sample.append(n_labeled)
            return mse_value

        for fold in range(1):
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-results saver of this fold.
            train_idx, test_idx, label_ind, unlab_ind = self.alibox.get_split(
                fold)
            saver = self.alibox.get_stateio(fold)

            # Train on the initially labeled samples.
            net = NN.NeuralNetworkRegressor(model=self.model,
                                            batch_size=1,
                                            device_ids=[0],
                                            epochs=50)
            net.lr_fc = 0.01
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            # Initial performance point.
            net.predict(self.testset)
            pred = net.preds
            total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(record_metrics(pred, len(label_ind)))

            # Start every run with an empty prediction history so that a
            # repeated call does not mix predictions of different pools.
            self.one = self.two = self.three = None
            self.four = self.five = self.six = None
            select_ind = unlab_ind_save = None
            iteration = 0

            while not self.stopping_criterion.is_stop():
                iteration += 1

                # Shrink the learning rate as the labeled share grows.
                lr_fc = net.lr_fc * (1 - len(label_ind) / (total * 1.001))
                for group in net.optimizer.param_groups:
                    group['lr'] = lr_fc
                print('learning rate is',
                      net.optimizer.state_dict()['param_groups'][0]['lr'])

                # Batch size: the initial labeled-set size until 60 % of the
                # pool is labeled, 30 % of the labeled set afterwards.
                early_stage = len(label_ind) < total * 0.6
                if early_stage:
                    batch_size = lab_init
                else:
                    batch_size = int(len(label_ind) * 0.3)

                if self.phase == 'active':
                    # The residue measure is fed predictions on the labeled
                    # samples, every other measure on the unlabeled pool.
                    # Rows are taken through `.index`, as everywhere else.
                    if self.measure != 'residue':
                        net.predict(self.dataset[unlab_ind.index])
                    else:
                        net.predict(self.dataset[label_ind.index])
                    pool_pred = net.preds

                    if self.measure == 'distance':
                        if iteration == 1:
                            self._update_previous_prediction(pool_pred)
                        else:
                            # Drop the samples labeled last iteration from
                            # the stored predictions before adding new ones.
                            self._update_previous_prediction(
                                pool_pred, select_ind, unlab_ind_save)
                        previous = self._get_previous_prediction()
                    else:
                        previous = None

                    if early_stage and iteration % 10 == 0:
                        # Every 10th early iteration queries at random.
                        select_ind = self.random.select(label_ind,
                                                        unlab_ind,
                                                        batch_size=batch_size)
                    else:
                        select_ind = self.query_strategy.select_by_prediction(
                            unlabel_index=unlab_ind,
                            predict=pool_pred,
                            labels=self.labels[label_ind.index],
                            batch_size=batch_size,
                            X_lab=self.dataset[label_ind.index],
                            X_unlab=self.dataset[unlab_ind.index],
                            previous=previous)
                else:
                    # Passive phase: random queries only.
                    select_ind = self.random.select(label_ind,
                                                    unlab_ind,
                                                    batch_size=batch_size)

                # Keep the pool order from before the update; it is needed
                # to align the stored predictions in the next iteration.
                unlab_ind_save = unlab_ind.index
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain on the enlarged labeled set and re-evaluate.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                pred = net.preds
                mse_value = record_metrics(pred, len(label_ind))
                self.loss.append(loss)

                # Save the results of this iteration.
                st = self.alibox.State(select_ind, mse_value)
                saver.add_state(st)
                saver.save()

                self.stopping_criterion.update_information(saver)
                torch.save(self.model,
                           './%s/%d/model%d' % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))

            # Dump the metric histories, the test targets and the last test
            # prediction, each to a file named after its content.
            for name in ('mse', 'mae', 'max', 'evs', 'r2', 'sample', 'loss',
                         'testlab'):
                joblib.dump(getattr(self, name),
                            './%s/%d/%s' % (self.path, fold, name))
            joblib.dump(pred, './%s/%d/pred' % (self.path, fold))

        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(
            method_name='QueryInstanceDistribution-distance',
            method_results=self.unc_result)
        print(self.analyser)

    def _update_previous_prediction(self, new, selected=None, unlab=None):
        if self.six is not None: del_ind = [unlab.index(i) for i in selected]
        if self.two is not None: self.one = np.delete(self.two, del_ind)
        if self.three is not None: self.two = np.delete(self.three, del_ind)
        if self.four is not None: self.three = np.delete(self.four, del_ind)
        if self.five is not None: self.four = np.delete(self.five, del_ind)
        if self.six is not None: self.five = np.delete(self.six, del_ind)
        self.six = new

    def _get_previous_prediction(self):
        if self.one is not None:
            return np.vstack((self.one, self.two, self.three, self.four,
                              self.five, self.six))
        elif self.two is not None:
            return np.vstack(
                (self.two, self.three, self.four, self.five, self.six))
        elif self.three is not None:
            return np.vstack((self.three, self.four, self.five, self.six))