Exemple #1
0
def create_dataset_splits(data, labels, test_ratio=0.15, initial_label_rate=0.1,
                          dataset_path='dataset', index_path='dataset_al'):
    """Split the data once and pickle both the arrays and the split indexes.

    Args:
        data: Object exposing the feature matrix as ``data.values``
            (e.g. a pandas DataFrame).
        labels: Array-like of targets, converted with ``np.asarray``.
        test_ratio: Fraction of the samples reserved for the test set.
        initial_label_rate: Fraction of the training samples that start out
            labeled for active learning.
        dataset_path: File receiving the pickled
            ``(X_train, X_test, y_train, y_test)`` tuple.
        index_path: File receiving the pickled
            ``(train_idx, test_idx, labeled_idx, unlabeled_idx)`` tuple.

    Returns:
        None. The results are written to ``dataset_path`` and ``index_path``.
    """
    # Define X and y target
    X = data.values
    y = np.asarray(labels)

    # Create the alipy tool box
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

    # Split into train/test and mark `initial_label_rate` of the training
    # part as initially labeled; only the first split (index 0) is used.
    toolbox.split_AL(test_ratio=test_ratio,
                     initial_label_rate=initial_label_rate)
    train_idx, test_idx, labeled_idx, unlabeled_idx = toolbox.get_split(0)

    X_train, X_test = X[train_idx], X[test_idx]
    # Targets are flattened to 1-D regardless of how `labels` was shaped.
    y_train = np.array(y[train_idx]).reshape(-1)
    y_test = np.array(y[test_idx]).reshape(-1)

    # Save dataset splits
    with open(dataset_path, 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

    # Save dataset split indexes for active learning
    with open(index_path, 'wb') as f:
        pickle.dump((train_idx, test_idx, labeled_idx, unlabeled_idx), f)
Exemple #2
0
 def __init__(self,
              dataset,
              labels,
              testset,
              testlab,
              model,
              phase,
              path,
              stopping,
              measure='nearest_neighbor',
              distance='linear'):
     """Store the data and build the ToolBox, stopping rule and query strategy.

     Args:
         dataset: Pool features handed to the ToolBox as ``X``.
         labels: Targets of ``dataset``; only their length is given to the
             ToolBox (see the placeholder note below).
         testset: Held-out features, stored as-is.
         testlab: Held-out targets, stored as-is.
         model: Model object, stored as-is.
         phase: Stored as-is; not interpreted here.
         path: Sub-directory name; the ToolBox saves under ``'./<path>'``.
         stopping: Indexable pair ``(criterion_name, value)`` forwarded to
             ``get_stopping_criterion``.
         measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
             any other value is passed to ``QueryInstanceDistribution``.
         distance: Only used by the ``'residue'`` strategy.
     """
     self.dataset = dataset
     self.labels = labels
     self.testset = testset
     self.testlab = testlab
     self.model = model
     self.phase = phase
     self.classes = int(max(labels))
     # The ToolBox receives all-zero placeholder labels of the right length.
     # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
     self.alibox = ToolBox(X=dataset,
                           y=np.asarray([0] * len(labels), dtype=int),
                           query_type='AllLabels',
                           saving_path='./%s' % path)
     # A single split with no test part: 5% of the pool starts out labeled.
     self.alibox.split_AL(test_ratio=0,
                          initial_label_rate=0.05,
                          split_count=1)
     self.stopping_criterion = self.alibox.get_stopping_criterion(
         stopping[0], value=stopping[1])
     self.measure = measure
     if measure == 'residue':
         self.query_strategy = QueryInstanceResidueRegressor(
             X=self.dataset, y=self.labels, distance=distance)
     else:
         self.query_strategy = QueryInstanceDistribution(measure=measure)
     self.random = QueryRandom()
     self.unc_result = []
     self.title = ''
     self.loss = []
     self.path = path
     # Slots for up to six past prediction vectors, all empty at start.
     self.one = self.two = self.three = self.four = self.five = self.six = None
     # Per-evaluation metric histories.
     self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
     self.sample = []
Exemple #3
0
 def __init__(self, dataset, labels, testset, testlab, model, phase, path,
              stopping):
     """Keep the experiment inputs and prepare the alipy helpers.

     ``stopping`` is an indexable pair: element 0 names the stopping
     criterion and element 1 is its value. ``path`` names the sub-directory
     (relative to the working directory) that the ToolBox saves into.
     """
     # Experiment inputs, stored unchanged.
     self.dataset, self.labels = dataset, labels
     self.testset, self.testlab = testset, testlab
     self.model, self.phase = model, phase
     self.classes = int(max(labels))

     # One split, no test part, 5% of the pool labeled initially.
     toolbox = ToolBox(X=dataset,
                       y=labels,
                       query_type='AllLabels',
                       saving_path='./%s' % path)
     toolbox.split_AL(test_ratio=0, initial_label_rate=0.05, split_count=1)
     self.alibox = toolbox

     criterion_name, criterion_value = stopping[0], stopping[1]
     self.stopping_criterion = toolbox.get_stopping_criterion(
         criterion_name, value=criterion_value)

     # Least-confident uncertainty sampling, plus a random sampler.
     self.query_strategy = QueryInstanceUncertainty(
         X=dataset, y=labels, measure='least_confident')
     self.random = QueryRandom()

     self.unc_result = []
     self.title = ''
     # One empty history list per tracked quantity.
     for history_name in ('acc', 'gmeans', 'recall', 'precision',
                          'specificity', 'auc', 'f1', 'pos', 'neg', 'ratio',
                          'loss', 'mcc'):
         setattr(self, history_name, [])
     self.path = path
Exemple #4
0
    def AC_(self, X, y):
        """Fit the ToolBox default model on (X, y) and return exp() of its weights.

        The model's ``class_weight`` attribute is read after fitting, and one
        ``math.exp`` value per leading-axis entry is returned as a list of
        "trust values".

        NOTE(review): on scikit-learn estimators ``class_weight`` is the
        constructor argument (``None`` unless set), not the fitted
        coefficient matrix. The original comment speaks of matrix
        coefficients, so ``coef_`` may have been intended -- confirm.
        """
        toolbox = ToolBox(X=X, y=y, query_type='AllLabels')
        toolbox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

        clf = toolbox.get_default_model()
        clf.fit(X, y)
        # The prediction is not used below; the call is kept as in the original.
        clf.predict(X)

        # Turn the weight entries into trust values via exp().
        weights = clf.class_weight
        n_entries = weights.shape[0]
        return [math.exp(weights[row]) for row in range(n_entries)]
Exemple #5
0
                                      return_indicator='dense',
                                      return_distributions=False,
                                      random_state=None)
# Recode every 0 entry of the label matrix as -1.
y[y == 0] = -1

# the cost of each class
cost = [1, 3, 3, 7, 10]

# label_tree[i, j] == 1 when node i is the parent of node j, otherwise 0.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
label_tree = np.zeros((5, 5), dtype=int)
label_tree[0, 1] = 1
label_tree[0, 2] = 1
label_tree[1, 3] = 1
label_tree[2, 4] = 1

alibox = ToolBox(X=X, y=y, query_type='PartLabels')

# Split data: 30% test, 10% of the training part labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Base classifier: random forest
model = RandomForestClassifier()

# The budget of query
budget = 40

# Stop once the accumulated cost reaches 500
stopping_criterion = alibox.get_stopping_criterion('cost_limit', 500)

performance_result = []
halc_result = []
# model = RandomForestClassifier()
# model = SVC(gamma='auto')

# For every test dataset: load it, rebuild a ToolBox from a previously saved
# split and create the stopping criterion for the experiment.
for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)

    lcdata_uncertainty_select_list = []
    lcdata_random_select_list = []

    # Load the dataset; labels are flattened and cast to int.
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)
    # NOTE(review): the split is always loaded from the 'australian' result
    # directory, whatever `testdataset` is -- confirm this is intended.
    train_indexs, test_indexs, label_indexs, unlabel_indexs = split_load('./experiment_result/combination_classify/australian_lrmetadata_0.005/australian/')
    # Reuse the loaded indexes instead of creating a new split.
    alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path=savefloder_path + testdataset +'/', train_idx=train_indexs, test_idx=test_indexs, label_idx=label_indexs, unlabel_idx=unlabel_indexs)

    # Split data (disabled: the split comes from split_load above)
    # alibox.split_AL(test_ratio=test_ratio, initial_label_rate=initial_label_ratio, split_count=splitcount)
    # alibox.



    # Stop after `query_num` queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', query_num)

    # experiment
    # meta_regressor = joblib.load('meta_lr.joblib')
    # meta_regressor = sgdr
    # meta_result = []
    
Exemple #7
0
                           n_features=10,
                           n_informative=5,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=2,
                           weights=None,
                           flip_y=0.01,
                           class_sep=1.0,
                           hypercube=True,
                           shift=0.0,
                           scale=1.0,
                           shuffle=True,
                           random_state=None)

# Wrap the data in an alipy ToolBox; saving_path is the current directory.
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the training part labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop each fold after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
Exemple #8
0
# Print the list

# NOTE: `all` shadows the builtin of the same name for the rest of the script.
all = np.asarray(X, dtype=float)
# Only the first 1020 feature rows / CSV rows are used.
X = all[0:1020]
df = pd.read_csv("sentences_data.csv")[0:1020]
# Drop the samples without an oracle label from both X and df so that the
# features and labels stay aligned row by row.
NANindex = df['Oracle label'].index[df['Oracle label'].apply(np.isnan)]
X = np.delete(X, NANindex, 0)
df = df.dropna(subset=['Oracle label'])
print(df)
print(len(X))
y = df[['Oracle label']].to_numpy().ravel().astype(int)
print(y)
print("siz", y.size)
# y = np.asarray([1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 20% test, 10% of the training part labeled, 3 splits
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.1, split_count=3)

# Alternative models that were tried:
# model = sklearn.svm.SVC(kernel='sigmoid', probability=True)
# model=MLPClassifier(hidden_layer_sizes=(80,80),activation='logistic',solver='adam',max_iter=3000, alpha=0.01)
# model = RandomForestClassifier()
# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop each fold after 100 queries (passed as a number, not the string '100')
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 100)


def main_loop(alibox, strategy, round):
Exemple #9
0
class TorchRegressionFold:
    """One-fold active-learning experiment for a torch regression model.

    The pool is split by an alipy ``ToolBox`` (no test part, 5% labeled
    initially). ``train`` repeatedly selects samples from the unlabeled pool,
    retrains the network and records regression metrics on the external
    ``testset``/``testlab``.

    Metric histories (``mse``, ``mae``, ``max``, ``evs``, ``r2``, ``sample``,
    ``loss``) are kept as lists on the instance and dumped with joblib under
    ``./<path>/<fold>/``.
    """

    def __init__(self,
                 dataset,
                 labels,
                 testset,
                 testlab,
                 model,
                 phase,
                 path,
                 stopping,
                 measure='nearest_neighbor',
                 distance='linear'):
        """Store the data and build the ToolBox, stopping rule and strategy.

        Args:
            dataset: Pool features (indexable by a list of row indexes).
            labels: Targets of ``dataset``.
            testset: Held-out features used for every evaluation.
            testlab: Held-out targets used for every evaluation.
            model: Torch model wrapped by ``NN.NeuralNetworkRegressor``.
            phase: ``'active'`` (query strategy) or ``'passive'`` (random).
            path: Output sub-directory, relative to the working directory.
            stopping: Pair ``(criterion_name, value)`` for the ToolBox.
            measure: ``'residue'`` selects ``QueryInstanceResidueRegressor``;
                any other value is passed to ``QueryInstanceDistribution``.
                ``'distance'`` additionally enables the prediction history.
            distance: Only used by the ``'residue'`` strategy.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        # The ToolBox receives all-zero placeholder labels of the right
        # length. `np.int` was removed in NumPy 1.24; `int` is equivalent.
        self.alibox = ToolBox(X=dataset,
                              y=np.asarray([0] * len(labels), dtype=int),
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.measure = measure
        if measure == 'residue':
            self.query_strategy = QueryInstanceResidueRegressor(
                X=self.dataset, y=self.labels, distance=distance)
        else:
            self.query_strategy = QueryInstanceDistribution(measure=measure)
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.loss = []
        self.path = path
        # Up to six past pool predictions; `six` is the most recent one.
        self.one = self.two = self.three = self.four = self.five = self.six = None
        self.max, self.mae, self.mse, self.evs, self.r2 = [], [], [], [], []
        self.sample = []

    def _record_metrics(self, pred, n_labeled):
        """Append the test-set metrics of ``pred`` and return its MSE."""
        # Imported under their own names so that the builtins `max`/`all`
        # are not shadowed.
        from sklearn.metrics import (explained_variance_score, max_error,
                                     mean_absolute_error, mean_squared_error,
                                     r2_score)
        mse_value = mean_squared_error(self.testlab, pred)
        self.mse.append(mse_value)
        self.mae.append(mean_absolute_error(self.testlab, pred))
        self.max.append(max_error(self.testlab, pred))
        self.evs.append(explained_variance_score(self.testlab, pred))
        self.r2.append(r2_score(self.testlab, pred))
        self.sample.append(n_labeled)
        return mse_value

    def train(self):
        """Run the query/retrain/evaluate loop and dump the results.

        Raises:
            ValueError: If ``self.phase`` is neither ``'active'`` nor
                ``'passive'`` (previously this surfaced as a NameError in
                the middle of the first iteration).
        """
        if self.phase not in ('active', 'passive'):
            raise ValueError(
                "phase must be 'active' or 'passive', got %r" % (self.phase,))

        for fold in range(1):
            # Also creates a missing parent directory; an existing one is fine.
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-result saver of this fold.
            _, _, label_ind, unlab_ind = self.alibox.get_split(fold)
            saver = self.alibox.get_stateio(fold)

            # Initial model, trained on the initially labeled samples.
            net = NN.NeuralNetworkRegressor(model=self.model,
                                            batch_size=1,
                                            device_ids=[0],
                                            epochs=50)
            net.lr_fc = 0.01
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            net.predict(self.testset)
            pred = net.preds

            n_total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)
            saver.set_initial_point(
                self._record_metrics(pred, len(label_ind.index)))

            iteration = 0
            select_ind = unlab_ind_save = None

            while not self.stopping_criterion.is_stop():
                iteration += 1

                # The learning rate shrinks as the labeled share grows; the
                # 1.001 factor keeps it strictly positive.
                lr_fc = net.lr_fc * (1 - len(label_ind.index) /
                                     (n_total * 1.001))
                for group in net.optimizer.param_groups:
                    group['lr'] = lr_fc
                print('learning rate is',
                      net.optimizer.state_dict()['param_groups'][0]['lr'])

                # Below 60% labeled: fixed batches of the initial labeled
                # size. Afterwards: 30% of the current labeled size.
                early_stage = len(label_ind) < n_total * 0.6
                if early_stage:
                    batch_size = int(lab_init * 1)
                else:
                    batch_size = int(len(label_ind) * 0.3)

                if self.phase == 'active':
                    # 'residue' scores the labeled samples, every other
                    # measure scores the unlabeled pool.
                    if self.measure != 'residue':
                        net.predict(self.dataset[unlab_ind.index])
                    else:
                        net.predict(self.dataset[label_ind.index])
                    pred = net.preds

                    if self.measure == 'distance':
                        if iteration == 1:
                            self._update_previous_prediction(pred)
                        else:
                            self._update_previous_prediction(
                                pred, select_ind, unlab_ind_save)
                        previous = self._get_previous_prediction()
                    else:
                        previous = None

                    # In the early stage every 10th iteration is random.
                    if early_stage and iteration % 10 == 0:
                        select_ind = self.random.select(
                            label_ind, unlab_ind, batch_size=batch_size)
                    else:
                        select_ind = self.query_strategy.select_by_prediction(
                            unlabel_index=unlab_ind,
                            predict=pred,
                            labels=self.labels[label_ind.index],
                            batch_size=batch_size,
                            X_lab=self.dataset[label_ind.index],
                            X_unlab=self.dataset[unlab_ind.index],
                            previous=previous)
                else:  # passive
                    select_ind = self.random.select(label_ind,
                                                    unlab_ind,
                                                    batch_size=batch_size)

                # Remember the pool order the predictions were made for,
                # then move the selected samples to the labeled set.
                unlab_ind_save = unlab_ind.index
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain and evaluate on the test set.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                pred = net.preds
                mse_value = self._record_metrics(pred, len(label_ind.index))
                self.loss.append(loss)

                # Save the results.
                st = self.alibox.State(select_ind, mse_value)
                saver.add_state(st)
                saver.save()

                self.stopping_criterion.update_information(saver)
                torch.save(self.model,
                           './%s/%d/model%d' % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))
            for name, values in (('mse', self.mse), ('mae', self.mae),
                                 ('max', self.max), ('evs', self.evs),
                                 ('r2', self.r2), ('sample', self.sample),
                                 ('loss', self.loss),
                                 ('testlab', self.testlab), ('pred', pred)):
                joblib.dump(values, './%s/%d/%s' % (self.path, fold, name))
        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(
            method_name='QueryInstanceDistribution-distance',
            method_results=self.unc_result)
        print(self.analyser)

    def _update_previous_prediction(self, new, selected=None, unlab=None):
        """Shift the prediction history by one slot and store ``new``.

        Args:
            new: Latest pool prediction; becomes ``self.six``.
            selected: Indexes removed from the pool since the previous call.
                ``None`` starts a fresh history (older slots are cleared).
            unlab: Pool indexes in the order the previous predictions were
                made; used to find the positions of ``selected``.
        """
        if selected is None:
            # First call of a run: nothing older can be aligned with `new`.
            self.one = self.two = self.three = self.four = self.five = None
        elif self.six is not None:
            # Drop the entries of the samples that left the pool so that all
            # stored vectors stay aligned with the current pool.
            del_ind = [unlab.index(i) for i in selected]
            if self.two is not None:
                self.one = np.delete(self.two, del_ind)
            if self.three is not None:
                self.two = np.delete(self.three, del_ind)
            if self.four is not None:
                self.three = np.delete(self.four, del_ind)
            if self.five is not None:
                self.four = np.delete(self.five, del_ind)
            self.five = np.delete(self.six, del_ind)
        self.six = new

    def _get_previous_prediction(self):
        """Return the stored predictions stacked row-wise, oldest first.

        Returns ``None`` until at least four predictions are stored.
        """
        if self.one is not None:
            return np.vstack((self.one, self.two, self.three, self.four,
                              self.five, self.six))
        elif self.two is not None:
            return np.vstack(
                (self.two, self.three, self.four, self.five, self.six))
        elif self.three is not None:
            return np.vstack((self.three, self.four, self.five, self.six))
        return None
import copy
from sklearn.datasets import load_iris
from alipy import ToolBox

# Load the iris data and wrap it in an alipy ToolBox (saving to the cwd).
X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the training part labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop each fold after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

# Use pre-defined strategy (query-by-committee)
QBCStrategy = alibox.get_query_strategy(strategy_name='QueryInstanceQBC')
# Collects one result saver per fold
QBC_result = []

for round in range(10):
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)

    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        # Passing model=None to use the default model for evaluating the committees' disagreement
        select_ind = QBCStrategy.select(label_ind,
                                        unlab_ind,
Exemple #11
0
    # gbr_r2 = r2_score(testmetadata[:, 396], gbr_pred)
    # print('In the ' + testdataset + 'GradientBoostingRegressor r2_score is : ', gbr_r2)
    # if gbr_performance is None:
    #     gbr_performance = np.array([testdataset, gbr_mse, gbr_mae, gbr_r2])
    # else:
    #     gbr_performance = np.vstack((gbr_performance, [testdataset, gbr_mse, gbr_mae, gbr_r2]))
    # joblib.dump(gbr, testdataset + "meta_gbr.joblib")

    # Active learning set-up for the current `testdataset`:
    # load the data, flatten the labels and cast them to int.
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    # Results are saved in a per-dataset directory.
    alibox = ToolBox(X=X,
                     y=y,
                     query_type='AllLabels',
                     saving_path='./experiment_result/' + testdataset + '/')

    # Split data: 30% test, 5% of the training part labeled, 5 splits
    alibox.split_AL(test_ratio=0.3, initial_label_rate=0.05, split_count=5)

    # Use the default Logistic Regression classifier
    model = alibox.get_default_model()

    # Stop each fold after 30 queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 30)

    # experiment
    # meta_regressor = joblib.load('meta_lr.joblib')
    # meta_regressor = sgdr
    # meta_result = []
Exemple #12
0
                           n_features=20,
                           n_informative=2,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=2,
                           weights=None,
                           flip_y=0.01,
                           class_sep=1.0,
                           hypercube=True,
                           shift=0.0,
                           scale=1.0,
                           shuffle=True,
                           random_state=None)

# Wrap the data in an alipy ToolBox; saving_path is the current directory.
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the training part labeled, 10 splits
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop each fold after 50 queries
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)

# Collects one result saver per fold
QBC_result = []


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
Exemple #13
0
from alipy.experiment import StoppingCriteria
from alipy import ToolBox
import numpy as np

# Toy data: 30 random samples with 5 features and binary labels.
X = np.random.rand(30, 5)
y = np.random.randint(2, size=30)
alibox = ToolBox(X=X, y=y)

# ---------------Initialize----------------
# Build the criterion directly ...
stopping_criterion = StoppingCriteria(stopping_criteria='num_of_queries',
                                      value=50)
# ... or through the toolbox (this rebinds the name created above)
stopping_criterion = alibox.get_stopping_criterion(
    stopping_criteria='num_of_queries', value=50)

# ---------------Usage----------------
# NOTE(review): schematic usage only -- no split is created and no query is
# recorded here, and a saver is fetched anew on every pass. Confirm how the
# loop is meant to terminate before running it as-is.
while not stopping_criterion.is_stop():
    #... Query some examples and update the StateIO object
    # Use the StateIO object to update stopping_criterion object
    saver = alibox.get_stateio(round=0)
    stopping_criterion.update_information(saver)
# Once the condition is met the loop ends.
# Reset the object for another fold.
stopping_criterion.reset()
Exemple #14
0
class TorchFold:
    """One-fold active-learning experiment for a binary torch classifier.

    The pool is split by an alipy ``ToolBox`` (no test part, 5% labeled
    initially). ``train`` repeatedly selects samples from the unlabeled pool,
    retrains the network and records classification metrics on the external
    ``testset``/``testlab``. Labels are expected to be 0 (negative) and 1
    (positive), as the class counts and confusion-matrix indexing assume.

    Metric histories are kept as lists on the instance and dumped with joblib
    under ``./<path>/<fold>/``.
    """

    def __init__(self, dataset, labels, testset, testlab, model, phase, path,
                 stopping):
        """Store the data and build the ToolBox, stopping rule and strategy.

        Args:
            dataset: Pool features (indexable by a list of row indexes).
            labels: 0/1 labels of ``dataset``.
            testset: Held-out features used for every evaluation.
            testlab: Held-out labels used for every evaluation.
            model: Torch model wrapped by ``NN.NeuralNetwork``.
            phase: ``'active'`` (uncertainty sampling) or ``'passive'``
                (random sampling).
            path: Output sub-directory, relative to the working directory.
            stopping: Pair ``(criterion_name, value)`` for the ToolBox.
        """
        self.dataset = dataset
        self.labels = labels
        self.testset = testset
        self.testlab = testlab
        self.model = model
        self.phase = phase
        self.classes = int(max(labels))
        self.alibox = ToolBox(X=dataset,
                              y=labels,
                              query_type='AllLabels',
                              saving_path='./%s' % path)
        self.alibox.split_AL(test_ratio=0,
                             initial_label_rate=0.05,
                             split_count=1)
        self.stopping_criterion = self.alibox.get_stopping_criterion(
            stopping[0], value=stopping[1])
        self.query_strategy = QueryInstanceUncertainty(
            X=dataset, y=labels, measure='least_confident')
        # self.query_strategy = QueryInstanceQBC(disagreement='KL_divergence')
        self.random = QueryRandom()
        self.unc_result = []
        self.title = ''
        self.acc = []
        self.gmeans = []
        self.recall = []
        self.precision = []
        self.specificity = []
        self.auc = []
        self.f1 = []
        self.pos = []
        self.neg = []
        self.ratio = []
        self.loss = []
        self.mcc = []
        self.path = path

    @staticmethod
    def _binary_scores(conf_mat):
        """Derive the scores that only need a 2x2 confusion matrix.

        Args:
            conf_mat: 2x2 matrix indexed ``[true, predicted]`` with class 0
                first, i.e. ``[[tn, fp], [fn, tp]]``.

        Returns:
            Tuple ``(precision, recall, specificity, gmeans, mcc)``.
        """
        tn, fp = conf_mat[0, 0], conf_mat[0, 1]
        fn, tp = conf_mat[1, 0], conf_mat[1, 1]
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        specificity = tn / (tn + fp)
        gmeans = sqrt(recall * specificity)
        mcc = ((tn * tp) - (fn * fp)) / sqrt(
            (tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))
        return precision, recall, specificity, gmeans, mcc

    def _record_metrics(self, pred, label_ind):
        """Append all test-set metrics for ``pred`` plus the class counts.

        Args:
            pred: Test-set predictions of the current model.
            label_ind: Current labeled index collection.

        Returns:
            Tuple ``(gmeans, conf_mat)`` for the state saver and logging.
        """
        # `labels=[0, 1]` keeps the matrix 2x2 even when only one class
        # occurs, which would otherwise break the [1, 1] indexing.
        conf_mat = confusion_matrix(y_true=self.testlab,
                                    y_pred=pred,
                                    labels=[0, 1])
        precision, recall, specificity, gmeans, mcc = self._binary_scores(
            conf_mat)
        f1 = metrics.f1_score(y_true=self.testlab, y_pred=pred)
        auc = metrics.roc_auc_score(y_true=self.testlab, y_score=pred)
        accuracy = self.alibox.calc_performance_metric(
            y_true=self.testlab,
            y_pred=pred.reshape(list(self.testlab.shape)),
            performance_metric='accuracy_score')
        self.auc.append(auc)
        self.acc.append(accuracy)
        self.f1.append(f1)
        self.gmeans.append(gmeans)
        self.recall.append(recall)
        self.precision.append(precision)
        self.specificity.append(specificity)
        self.mcc.append(mcc)

        # Class balance of the labeled set.
        lab = list(self.labels[label_ind.index])
        n_pos, n_neg = lab.count(1), lab.count(0)
        self.pos.append(n_pos)
        self.neg.append(n_neg)
        # Without positives the ratio is reported as infinity instead of
        # aborting the run with a ZeroDivisionError.
        self.ratio.append(n_neg / n_pos if n_pos else float('inf'))
        return gmeans, conf_mat

    def train(self):
        """Run the query/retrain/evaluate loop and dump the results.

        Raises:
            ValueError: If ``self.phase`` is neither ``'active'`` nor
                ``'passive'`` (previously this surfaced as a NameError in
                the middle of the first iteration).
        """
        if self.phase not in ('active', 'passive'):
            raise ValueError(
                "phase must be 'active' or 'passive', got %r" % (self.phase,))

        for fold in range(1):
            # Also creates a missing parent directory; an existing one is fine.
            os.makedirs('%s/%d' % (self.path, fold), exist_ok=True)

            # Data split and intermediate-result saver of this fold.
            _, _, label_ind, unlab_ind = self.alibox.get_split(fold)
            saver = self.alibox.get_stateio(fold)

            # Initial model, trained on the initially labeled samples.
            net = NN.NeuralNetwork(model=self.model,
                                   num_classes=2,
                                   batch_size=500,
                                   device_ids=[0],
                                   epochs=50)
            net.lr_fc = 0.0001
            net.initiate(self.dataset[label_ind.index],
                         self.labels[label_ind.index])

            net.predict(self.testset)
            gmeans, conf_mat = self._record_metrics(net.preds, label_ind)

            n_total = len(label_ind) + len(unlab_ind)
            lab_init = len(label_ind)

            saver.set_initial_point(gmeans)
            iteration = 0

            while not self.stopping_criterion.is_stop():
                iteration += 1

                # Below 30% labeled: batches of 40% of the initial labeled
                # size. Afterwards: 40% of the current labeled size.
                early_stage = len(label_ind) < n_total * 0.3
                if early_stage:
                    batch_size = int(lab_init * 0.4)
                else:
                    batch_size = int(len(label_ind) * 0.4)

                if self.phase == 'active':
                    net.predict(self.dataset[unlab_ind.index])
                    prob_pred = net.probablistic_matrix()

                    # In the early stage every 10th iteration is random.
                    if early_stage and iteration % 10 == 0:
                        select_ind = self.random.select(
                            label_ind, unlab_ind, batch_size=batch_size)
                    else:
                        select_ind = self.query_strategy.select_by_prediction_mat(
                            unlabel_index=unlab_ind,
                            predict=prob_pred,
                            batch_size=batch_size)
                else:  # passive
                    select_ind = self.random.select(label_ind,
                                                    unlab_ind,
                                                    batch_size=batch_size)

                # Move the selected samples to the labeled set.
                label_ind.update(select_ind)
                unlab_ind.difference_update(select_ind)

                # Retrain and evaluate on the test set.
                loss = net.train(self.dataset[label_ind.index],
                                 self.labels[label_ind.index])
                net.predict(self.testset)
                gmeans, conf_mat = self._record_metrics(net.preds, label_ind)
                self.loss.append(loss)

                # Save the results.
                st = self.alibox.State(select_ind, gmeans)
                saver.add_state(st)
                saver.save()

                self.stopping_criterion.update_information(saver)
                # The counts were just recorded by _record_metrics.
                print('\n class \n0 and 1\n', self.neg[-1], self.pos[-1])
                print('\n', conf_mat)
                torch.save(self.model,
                           './%s/%d/model%d' % (self.path, fold, iteration))

            self.stopping_criterion.reset()
            self.unc_result.append(copy.deepcopy(saver))
            for name, values in (('auc', self.auc), ('acc', self.acc),
                                 ('f1', self.f1), ('gmeans', self.gmeans),
                                 ('recall', self.recall),
                                 ('precision', self.precision),
                                 ('specificity', self.specificity),
                                 ('pos', self.pos), ('neg', self.neg),
                                 ('ratio', self.ratio), ('mcc', self.mcc)):
                joblib.dump(values, './%s/%d/%s' % (self.path, fold, name))
        self.analyser = self.alibox.get_experiment_analyser(
            x_axis='num_of_queries')
        self.analyser.add_method(method_name='QueryInstanceUncertaity-lc',
                                 method_results=self.unc_result)
        print(self.analyser)
Exemple #15
0
import copy
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder
from alipy.query_strategy.query_type import QueryTypeAURO
from alipy.query_strategy.multi_label import LabelRankingModel
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy import ToolBox

# Multi-label active-learning setup on the iris data using the AURO
# query-type strategy with a label-ranking base model.
X, y = load_iris(return_X_y=True)
# One-hot encode the single class column into one indicator column per class.
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
# fit_transform's result is densified here so it can be edited in place below.
mult_y = np.asarray(mult_y.todense())
# Recode the 0 entries as -1.
# NOTE(review): presumably because the label-ranking model reads 0 as
# "unknown" and -1 as "irrelevant" - confirm against the alipy docs.
mult_y[mult_y == 0] = -1

# 'PartLabels' query type: the toolbox is configured for partially labeled
# (instance, label) queries rather than whole-instance labels.
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)

# query type strategy
AURO_results = []

# NOTE(review): `round` shadows the builtin of the same name. The loop reads
# splits 0..4, so it assumes split_AL (called without split_count) produced at
# least five splits - TODO confirm the default.
for round in range(5):

    # Index collections of this fold: train / test / labeled / unlabeled.
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # Independent copy of the label matrix, so later edits to query_y do not
    # touch mult_y.
    query_y = mult_y.copy()
    # A fresh strategy instance is built for every fold.
    AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
    # base model
    model = LabelRankingModel()
Exemple #16
0
def create_and_implement_strategy(strategy_name, data, labels, queries):
    """Run one pool-based active-learning experiment and return its results.

    The train/test/labeled/unlabeled index split is read from the
    ``dataset_al`` pickle in the current working directory. Starting from the
    initially labeled rows, the chosen query strategy selects one unlabeled
    example per iteration, the model is refitted on the enlarged labeled set,
    and test accuracy is recorded until the query budget is exhausted.

    Args:
        strategy_name: Name of the query strategy, forwarded to
            ``ToolBox.get_query_strategy``.
        data: Feature table exposing ``.values`` (e.g. a pandas DataFrame).
        labels: Targets; converted with ``np.asarray``.
        queries: Budget handed to the ``'num_of_queries'`` stopping criterion.

    Returns:
        A one-element list holding a deep copy of the ``StateIO`` saver that
        recorded the initial accuracy and the state of every query.
    """
    # Only the raw arrays are kept; rows are addressed through the saved split.
    X = data.values
    y = np.asarray(labels)
    toolbox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

    # Toolbox default model (logistic regression, per the original author's note).
    model = toolbox.get_default_model()
    uncertainty_strategy = toolbox.get_query_strategy(strategy_name=strategy_name)

    # Stop once `queries` queries have been issued.
    stopping_criterion = toolbox.get_stopping_criterion('num_of_queries', queries)

    # NOTE: pickle.load can execute arbitrary code; 'dataset_al' must be a
    # trusted file produced by this project.
    with open("dataset_al", "rb") as f:
        train_idx, test_idx, labeled_idx, unlabeled_idx = pickle.load(f)

    # Saver that records the initial point and every query state.
    saver = StateIO(round=0, train_idx=train_idx,
                    test_idx=test_idx, init_L=labeled_idx,
                    init_U=unlabeled_idx, saving_path='.')

    def _fit_and_score():
        """Refit on the currently labeled rows and return test accuracy."""
        rows = labeled_idx.index
        model.fit(X=X[rows, :], y=y[rows])
        y_pred = model.predict(X[test_idx, :])
        return toolbox.calc_performance_metric(
            y_true=y[test_idx], y_pred=y_pred,
            performance_metric='accuracy_score')

    # Baseline accuracy with only the initially labeled examples.
    saver.set_initial_point(_fit_and_score())

    while not stopping_criterion.is_stop():
        # Pick one example from the unlabeled pool.
        example = uncertainty_strategy.select(labeled_idx, unlabeled_idx,
                                              model=model, batch_size=1)
        # Move it from the unlabeled pool to the labeled set.
        labeled_idx.update(example)
        unlabeled_idx.difference_update(example)

        # Retrain with the new example and record the resulting accuracy.
        accuracy = _fit_and_score()
        state = toolbox.State(select_index=example, performance=accuracy)
        saver.add_state(state)
        saver.save()

        # Let the stopping criterion see the updated query count.
        stopping_criterion.update_information(saver)

    stopping_criterion.reset()

    # To export the final labeled set, index with the collection's `.index`
    # list (X[labeled_idx.index, :] and y[labeled_idx.index]), exactly as the
    # training calls above do.
    return [copy.deepcopy(saver)]
# Experiment settings visible in this excerpt. `testdatasetnames`,
# `dataset_path`, `test_ratio`, `splitcount` and `query_num` are not defined
# here; they are presumably set earlier in the original script.
# Initial label rate passed to split_AL below.
initial_label_ratio = 0.005

# Root folder for results; each dataset gets its own sub-folder below.
savefloder_path = './experiment_result/classical_active_learning/'

model = LogisticRegression(solver='lbfgs')

# One ToolBox, one set of splits and one stopping criterion per dataset.
for testdataset in testdatasetnames:
    print('***********currently dataset is : ', testdataset)
    # prepare dataset
    dt = DataSet(testdataset, dataset_path)
    X = dt.X
    # Flatten the labels to 1-D and force an integer dtype.
    y = dt.y.ravel()
    y = np.asarray(y, dtype=int)

    alibox = ToolBox(X=X,
                     y=y,
                     query_type='AllLabels',
                     saving_path=savefloder_path + testdataset + '/')
    # Split data
    alibox.split_AL(test_ratio=test_ratio,
                    initial_label_rate=initial_label_ratio,
                    split_count=splitcount)

    # The query budget is `query_num` queries
    stopping_criterion = alibox.get_stopping_criterion('num_of_queries',
                                                       query_num)

    # NOTE(review): this function is re-defined on every dataset iteration,
    # and its `round` parameter shadows the builtin of the same name.
    def main_loop(alibox, strategy, round):
        # Get the data split of one fold experiment
        train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
        # Get intermediate results saver for one fold experiment
        saver = alibox.get_stateio(round)
Exemple #18
0
from sklearn.datasets import load_iris
from alipy import ToolBox

# Tour of the ToolBox facade: one object initialised with the data hands out
# the splits and helper objects used in an active-learning experiment.
X, y = load_iris(return_X_y=True)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# get tools
# split_AL is called with its default arguments; the result is unpacked into
# four values (train, test, labeled, unlabeled - inferred from the names).
tr, te, lab, unlab = alibox.split_AL()
# The same four parts for a single fold (fold 0).
tr0, te0, lab0, unlab0 = alibox.get_split(round=0)
oracle = alibox.get_clean_oracle()
# Per-fold helpers, both requested for fold 0.
saver = alibox.get_stateio(round=0)
repo = alibox.get_repository(round=0)
# Query strategies are looked up by name.
rand_strategy = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
# Metric helper, shown here on a trivial one-element prediction.
perf = alibox.calc_performance_metric(y_true=[1], y_pred=[1], performance_metric='accuracy_score')
model = alibox.get_default_model()
# Stopping criterion configured with value=50 for 'num_of_queries'.
sc = alibox.get_stopping_criterion(stopping_criteria='num_of_queries', value=50)
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
acethread = alibox.get_ace_threading()

# data struct defined in alipy
ind = alibox.IndexCollection([1, 2, 3])
# Multi-label entries are tuples; the second entry (2, ) has one element only.
m_ind = alibox.MultiLabelIndexCollection([(1, 0), (2, )])
# A State pairs the selected index with the performance computed above.
st = alibox.State(select_index=[1], performance=perf)

# io
alibox.save()
# al_settings.pkl is the default name. To use another name, please pass a specific file name
# to 'saving_path' parameter when initializing the ToolBox object. (e.g., saving_path='./my_file.pkl')
# Rebinds `alibox` to the instance restored from disk.
alibox = ToolBox.load(path='./al_settings.pkl')
Exemple #19
0
# Remove the "FaultCause" column from the test table before pooling.
test_data = test_data.drop(["FaultCause"], axis=1)
# Pool the train and test rows into one data set; the ToolBox creates its own
# split further down.
all_data = pd.concat([train_data, test_data], axis=0)
all_label = pd.concat([train_label, test_label], axis=0)
all_data = all_data.values
all_label = all_label.values
# Oversample with SMOTE (default settings) on the pooled data.
all_resampled_data, all_resampled_label = SMOTE().fit_resample(
    all_data, all_label)

all_data = all_resampled_data
all_label = all_resampled_label

# Shift every label down by one with a single in-place array operation
# instead of a Python-level loop (presumably 1-based -> 0-based class ids).
# In-place on purpose: `all_resampled_label` refers to the same array and is
# updated too, exactly as with element-wise assignment.
all_label -= 1

alibox = ToolBox(X=all_data,
                 y=all_label,
                 query_type='AllLabels',
                 saving_path='.')
# One split: 70% of the rows held out for testing, 0.1% initially labeled.
alibox.split_AL(test_ratio=0.7, initial_label_rate=0.001, split_count=1)
model = alibox.get_default_model()
# Alternative models tried by the original author:
# model = AdaBoostClassifier(n_estimators=10)
# model = XGBClassifier(objective="reg:logistic")
# model = LogisticRegression()

# rft = SVC(kernel='linear')
# knn = KNeighborsClassifier(n_neighbors=7)

# Query budget: 500 queries.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 500)


def main_loop(alibox, strategy, round):
    # Get the data split of one fold experiment
Exemple #20
0
from alipy import ToolBox
from alipy.query_strategy.multi_label import *

# Build a multi-label style target from the iris data by one-hot encoding the
# single class column, then configure the toolbox for partial-label queries.
X, y = load_iris(return_X_y=True)
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
# fit_transform's result is densified here so it can be edited in place below.
mult_y = np.asarray(mult_y.todense())

# Or generate a dataset with any sizes
# X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)

# Since we are using the label ranking model, the label 0 means unknown. We need to
# set the 0 entries to -1, which means irrelevant.
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
# 20% of the rows are held out for testing; 5% start out labeled.
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)


def main_loop(alibox, round, strategy):
    """Run the query loop of one fold with the given strategy.

    Args:
        alibox: ToolBox that supplies the split and the state saver.
        round: Fold index passed to ``get_split`` and ``get_stateio``
            (NOTE(review): shadows the builtin ``round``).
        strategy: Query strategy; its ``select(label_ind, unlab_ind)`` is
            called once per iteration.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # base model
    model = LabelRankingModel()

    # A simple stopping criterion to specify the query budget.
    # The loop continues while label_ind holds at most 120 entries; nothing
    # visible in this body grows label_ind yet.
    while len(label_ind) <= 120:
        # query and update
        select_labs = strategy.select(label_ind, unlab_ind)
        # use cost to record the amount of queried instance-label pairs
Exemple #21
0
# Load the combined findings table and encode the categorical columns with
# the project's encode_and_bind helper, one column at a time.
df = pd.read_excel('Juliet_Test_Suite/combined_data_table.xlsx')
for column in ('Clang Rule', 'CodeSonar Rule', 'Severity', 'CWE'):
    df = encode_and_bind(df, column)
# Rows without a ground-truth label cannot be used.
df.dropna(subset=['True Positive'], inplace=True)
# NOTE(review): reindex() without arguments keeps the existing (now gapped)
# index; if a 0..n-1 index was intended this should be
# reset_index(drop=True). Left unchanged because the code below addresses
# rows positionally through `.values`.
df = df.reindex()
X = df.drop('True Positive', axis=1)
y = df.loc[:, 'True Positive']

# Change these parameters to alter the experiment; they are the single source
# of truth for the calls below.
init_labels = 0.005  # initial label rate
trn_tst_split = 0.2  # train test split portion
stop = 300  # number of queries to execute

alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
alibox.split_AL(test_ratio=trn_tst_split,
                initial_label_rate=init_labels,
                split_count=3)
# model=LogisticRegression(penalty='l1',solver='liblinear')
model = RandomForestClassifier(n_estimators=100)
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', stop)
uncertainStrategy = alibox.get_query_strategy(
    strategy_name='QueryInstanceUncertainty')
unc_result = []
# Only the first of the three splits is used here.
train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(0)
saver = alibox.get_stateio(0)
# print(y.loc[label_ind.index])
# Initial fit on the initially labeled rows, addressed by position.
model.fit(X=X.values[label_ind.index, :], y=y.values[label_ind.index])
while not stopping_criterion.is_stop():
    select_ind = uncertainStrategy.select(label_ind,
                                          unlab_ind,
                                          model=model,