Exemple #1
0
    def learn(self):
        """Train a committee on seed data, then teach it randomly drawn pool samples.

        The first row of each distinct ``grades_round`` value in
        ``self.short_df`` seeds the training set; all other rows of
        ``self.X`` / ``self.Y`` form the pool.  One ``ActiveLearner`` is built
        per estimator in ``self.learners`` and the learners are wrapped in a
        ``Committee``.  Every round teaches the committee one uniformly random
        pool sample (no query strategy is involved), removes that sample from
        the pool and records the committee's accuracy on what is left.

        The number of rounds is derived from ``self.percent`` and
        ``len(self.X)`` and is capped so that at least one pool sample is left
        to score against.

        Returns:
            tuple: ``(accuracy_list, model_f1)`` where ``accuracy_list`` holds
            the pool accuracy after every round and ``model_f1`` is the
            weighted F1 score on the final pool.
        """
        # seeding: index of the first row of every class
        grades = self.short_df['grades_round']
        seed_index = [grades[grades == label].index[0]
                      for label in grades.unique()]

        accuracy_list = []

        # initialising
        # NOTE(review): the seeds are index labels of short_df but are used as
        # row positions in self.X / self.Y; this presumes a default 0..n-1
        # index -- confirm.
        train_idx = seed_index
        X_train = self.X[train_idx]
        y_train = self.Y[train_idx]

        # generating the pool
        X_pool = np.delete(self.X, train_idx, axis=0)
        y_pool = np.delete(self.Y, train_idx)

        # Committee creation: one learner per configured estimator
        committee = Committee(
            learner_list=[
                ActiveLearner(estimator=model,
                              X_training=X_train,
                              y_training=y_train)
                for model in self.learners
            ],
        )

        # NOTE(review): the learners were already given the seed data above, so
        # this passes the same samples a second time. Kept to preserve the
        # existing training behaviour.
        committee.teach(X_train, y_train)

        # pool-based sampling
        # Was len(X): every other access in this method goes through self.X.
        n_queries = int(len(self.X) / (100 / self.percent))
        # Never drain the pool completely: the scores below need >= 1 sample.
        n_queries = min(n_queries, max(len(X_pool) - 1, 0))
        for _ in range(n_queries):
            query_idx = np.random.choice(len(X_pool))
            committee.teach(
                X=X_pool[query_idx].reshape(1, -1),
                y=y_pool[query_idx].reshape(1, )
            )

            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            accuracy_list.append(
                accuracy_score(y_pool, committee.predict(X_pool)))

        # One prediction pass serves both the report and the F1 score.
        model_pred = committee.predict(X_pool)
        print("By just labelling ", round(n_queries*100.0/len(self.X), 2), "% of total data accuracy of ", round(accuracy_score(y_pool, model_pred), 3), " % is achieved on the unseen data")
        model_f1 = f1_score(y_pool, model_pred, average='weighted')
        return accuracy_list, model_f1
Exemple #2
0
def run(X, y, n_samples_for_intial, n_queries, n_comittee_members, estimator):
    """Run query-by-committee until the micro F1 on ``(X, y)`` reaches 0.65.

    An initial training set and a pool are obtained from
    ``create_random_pool_and_initial_sets``.  A committee of
    ``n_comittee_members`` learners is built, each wrapping its own copy of
    ``estimator``, with ``max_disagreement_sampling`` as the query strategy.
    Each round queries one pool sample, teaches it to the committee, removes
    it from the pool and scores the committee on the full ``(X, y)``.

    The loop ends when the F1 score reaches 0.65 or when the pool has been
    used up, whichever comes first.

    Args:
        X: feature matrix used for scoring and as the source of the pool.
        y: labels matching ``X``.
        n_samples_for_intial: forwarded to
            ``create_random_pool_and_initial_sets``.
        n_queries: accepted for interface compatibility; not used.
        n_comittee_members: number of learners in the committee.
        estimator: estimator each learner is built from.

    Returns:
        int: the number of samples queried.
    """
    from copy import deepcopy

    # start timer
    start_time = time.time()

    X_train, y_train, X_pool, y_pool = create_random_pool_and_initial_sets(X, y, n_samples_for_intial)

    # Each member gets its own estimator copy. Sharing one estimator object
    # means every member is the same model, so the committee cannot disagree.
    learners = [
        ActiveLearner(estimator=deepcopy(estimator), X_training=X_train, y_training=y_train)
        for _ in range(n_comittee_members)
    ]

    # init committee
    committee = Committee(learner_list=learners, query_strategy=max_disagreement_sampling)

    unqueried_score = committee.score(X, y)
    print('Score over unqueried samples {:0.4f}'.format(unqueried_score))

    performance_history = []

    f1 = 0
    index = 0
    # Stop on an empty pool too: querying it would raise, and without this
    # guard a committee that never reaches the target never terminates cleanly.
    while f1 < 0.65 and X_pool.shape[0] > 0:
        index += 1

        # get sample from pool
        query_idx, query_instance = committee.query(X_pool)

        # retrain comittee with new sample
        committee.teach(
            X=X_pool[query_idx].reshape(1, -1),
            y=y_pool[query_idx].reshape(1, )
        )

        # remove queried instance from pool
        X_pool = delete_rows_csr(X_pool, query_idx)
        y_pool = np.delete(y_pool, query_idx)

        y_pred = committee.predict(X)
        f1 = metrics.f1_score(y, y_pred, average='micro')

        if index % 100 == 0:
            print('F1 score after {n} training samples: {f1:0.4f}'.format(n=index, f1=f1))

        # save F1 score of this round
        performance_history.append(f1)
    print("--- %s seconds ---" % (time.time() - start_time))

    print(performance_history)
    return index
def cmte_loop(estimator1,estimator2,X_0,Y_0,X_train,Y_train,X_test,Y_test,indexs,n=5):
    """Query-by-committee loop that runs until the pool is exhausted.

    The pool is ``X_train`` / ``Y_train`` with the rows at ``indexs`` removed.
    One ``ActiveLearner`` per entry of ``estimator1`` is trained on
    ``(X_0, Y_0)`` and combined into a ``Committee``.  Each round teaches the
    queried sample(s), grows ``X_0`` / ``Y_0`` locally, shrinks the pool and
    stores the result of ``evaluate``.  ``estimator2`` and ``n`` are not used.

    Returns:
        tuple: ``(committee, accuracies)`` with one ``evaluate`` result per round.
    """
    # Unlabelled pool: the training set minus the already-labelled rows.
    pool_X = deepcopy(np.delete(X_train, indexs, axis=0))
    pool_y = deepcopy(np.delete(Y_train, indexs))

    members = []
    for est in estimator1:
        members.append(ActiveLearner(est, X_training=X_0, y_training=Y_0))
    committee = Committee(members)

    accuracies = []
    while len(pool_X) != 0:
        picked, _ = committee.query(pool_X)
        chosen_X = pool_X[picked]
        chosen_y = pool_y[picked]
        committee.teach(chosen_X, chosen_y)

        # Grow the labelled set; only the first queried label is appended.
        X_0 = np.append(X_0, chosen_X, axis=0)
        Y_0 = np.append(Y_0, chosen_y[0])

        # Drop what was just taught from the pool.
        pool_X = np.delete(pool_X, picked, axis=0)
        pool_y = np.delete(pool_y, picked, axis=0)

        accuracies.append(evaluate(committee, X_0, Y_0, X_test, Y_test))
    return committee, accuracies
def modAL_QBC(X, y, n_queries):
    """Run ``n_queries`` query/teach rounds of a two-member modAL committee.

    Both members are liblinear one-vs-rest ``LogisticRegression`` learners
    seeded with rows 0, 50 and 100 of ``X`` / ``y``.  Every round queries the
    full ``X`` (nothing is removed from it) and teaches the returned sample.
    Nothing is returned.
    """
    seed_rows = [0, 50, 100]

    members = []
    for _ in range(2):
        classifier = LogisticRegression(solver='liblinear',
                                        n_jobs=1,
                                        multi_class='ovr')
        members.append(
            ActiveLearner(classifier,
                          X_training=X[seed_rows],
                          y_training=y[seed_rows]))

    modAL_learner = Committee(members)

    for _round in range(n_queries):
        picked, _instance = modAL_learner.query(X)
        modAL_learner.teach(X[picked], y[picked])
Exemple #5
0
def cmte_loop(estimator,
              X_0,
              Y_0,
              X_train,
              Y_train,
              X_test,
              Y_test,
              indexs,
              n=5):
    """Query-by-committee loop that labels one pool sample per round.

    Prints ``len(indexs)``, then builds the pool from ``X_train`` /
    ``Y_train`` without the rows at ``indexs``.  One ``ActiveLearner`` per
    entry of ``estimator`` is trained on ``(X_0, Y_0)`` and the learners form
    a ``Committee``.  Until the pool is empty, each round teaches the queried
    sample, grows ``X_0`` / ``Y_0`` locally, shrinks the pool and stores the
    result of ``evaluate``.  ``n`` is not used.

    Returns:
        tuple: ``(committee, accuracies)`` with one ``evaluate`` result per round.

    Raises:
        Exception: if the committee returns more than one index for a query.
    """
    print(len(indexs))

    # Unlabelled pool: the training set minus the already-labelled rows.
    pool_X = deepcopy(np.delete(X_train, indexs, axis=0))
    pool_y = deepcopy(np.delete(Y_train, indexs))

    committee = Committee(learner_list=[
        ActiveLearner(estimator=est, X_training=X_0, y_training=Y_0)
        for est in estimator
    ])

    accuracies = []
    while len(pool_X) != 0:
        picked, _ = committee.query(pool_X)
        # The bookkeeping below assumes exactly one sample per query.
        if len(picked) > 1:
            raise Exception("NOOOOOOOOOOOOOOO")

        chosen_X = pool_X[picked]
        chosen_y = pool_y[picked]
        committee.teach(X=chosen_X, y=chosen_y)

        X_0 = np.append(X_0, chosen_X, axis=0)
        Y_0 = np.append(Y_0, chosen_y[0])

        pool_X = np.delete(pool_X, picked, axis=0)
        pool_y = np.delete(pool_y, picked, axis=0)

        accuracies.append(evaluate(committee, X_0, Y_0, X_test, Y_test))

    return (committee, accuracies)
class Committee2Level:
    """Two-stage committee: label 2 versus the rest, then the rest among themselves.

    ``osCommittee`` is trained on every sample, with all labels other than 2
    collapsed to 1.  ``pnCommittee`` is trained only on the samples whose
    label is not 2, using their original labels.  ``choice_indx`` counts
    ``query`` calls and decides which committee answers each one.
    """

    @staticmethod
    def _non_two_indices(Y):
        """Return the positions in ``Y`` whose label is not 2."""
        return [i for i, label in enumerate(Y) if label != 2]

    @staticmethod
    def _collapse_labels(Y):
        """Return ``Y`` as an array with every label other than 2 replaced by 1."""
        return np.array([label if label == 2 else 1 for label in Y])

    def __init__(self, estimators_1, estimators_2, X, Y):
        """Build both committees from the initial labelled data.

        Args:
            estimators_1: estimators for the first-stage (2 vs. rest) committee.
            estimators_2: estimators for the second-stage committee.
            X: initial training samples (indexable with a list of positions).
            Y: initial labels, same length as ``X``.
        """
        indexs2 = self._non_two_indices(Y)
        Y1 = self._collapse_labels(Y)
        X2 = X[indexs2]
        Y2 = Y[indexs2]
        learners1 = [ActiveLearner(e, X_training=X, y_training=Y1) for e in estimators_1]
        learners2 = [ActiveLearner(e, X_training=X2, y_training=Y2) for e in estimators_2]
        self.osCommittee = Committee(learners1)
        self.pnCommittee = Committee(learners2)
        self.choice_indx = 0

    def teach(self, X, Y):
        """Teach new samples to both stages.

        The first stage sees all samples with collapsed labels; the second
        stage is only taught when at least one label differs from 2.
        """
        indexs2 = self._non_two_indices(Y)
        Y2 = Y[indexs2]
        self.osCommittee.teach(X, self._collapse_labels(Y))
        if len(Y2) > 0:
            self.pnCommittee.teach(X[indexs2], Y2)

    def predict(self, X):
        """Return a list of labels: 2 where the first stage says 2, else the second stage's label."""
        predicts1 = self.osCommittee.predict(X)
        predicts2 = self.pnCommittee.predict(X)
        return [p1 if p1 == 2 else p2 for p1, p2 in zip(predicts1, predicts2)]

    def query(self, X, Y):
        """Pick the next sample, alternating between the two committees.

        Even-numbered calls use the first-stage committee on all of ``X``;
        odd-numbered calls use the second-stage committee on the samples whose
        label in ``Y`` is not 2.  If the chosen committee has no candidate,
        the other committee's answer is returned instead.

        Returns:
            tuple: ``(indices, instances)`` with ``indices`` relative to ``X``.
        """
        indexs2 = self._non_two_indices(Y)
        X2 = X[indexs2]
        q2 = ([], 2)  # placeholder used when there is no non-2 sample to query
        if len(X2) > 0:
            sub_idx, sub_inst = self.pnCommittee.query(X2)
            # The second stage only saw X2, so its indices are positions in
            # X2. Map them back to positions in X, which is what callers index.
            q2 = (np.asarray(indexs2)[sub_idx], sub_inst)
        alternatives = [self.osCommittee.query(X), q2]
        r = alternatives[self.choice_indx % 2]
        self.choice_indx += 1
        if len(r[0]) == 0:
            # choice_indx was just advanced, so this selects the other committee.
            return alternatives[self.choice_indx % 2]
        return r
    # NOTE(review): this indented run is a loop body whose `for` header is not
    # present here; `X_pool`, `y_pool` and `learner_list` come from code that
    # is not visible.
    # initial training data: n_initial distinct pool rows drawn at random
    n_initial = 5
    train_idx = np.random.choice(range(X_pool.shape[0]),
                                 size=n_initial,
                                 replace=False)
    X_train = X_pool[train_idx]
    y_train = y_pool[train_idx]

    # creating a reduced copy of the data with the known instances removed
    X_pool = np.delete(X_pool, train_idx, axis=0)
    y_pool = np.delete(y_pool, train_idx)

    # initializing learner on the rows just drawn and registering it
    learner = ActiveLearner(estimator=RandomForestClassifier(n_estimators=10),
                            X_training=X_train,
                            y_training=y_train)
    learner_list.append(learner)

# assembling the committee from the learners collected in learner_list
committee = Committee(learner_list=learner_list)

# query by committee: each round labels one pool sample chosen by the
# committee's query strategy and teaches it to the committee
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = committee.query(X_pool)
    # reshape to a single-row batch / single-element label vector for teach()
    committee.teach(X=X_pool[query_idx].reshape(1, -1),
                    y=y_pool[query_idx].reshape(1, ))
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
def active_learn(df1, first_item_index_of_each_category):
    """Run query-by-committee on ``df1`` and report the accuracy trend.

    The rows at ``first_item_index_of_each_category`` seed a committee of two
    ``RandomForestClassifier`` learners; all other rows form the pool.  The
    committee's accuracy on the full data set is printed before the first
    query and after every query.  The score of every 100th round (rounds 0,
    100, 200, ...) is collected and handed to ``percentage_increase``.

    Args:
        df1: data frame with a ``label`` column.  Features are taken from
            every column of ``df1.values`` except the first.
        first_item_index_of_each_category: row positions used as the initial
            training set.

    Returns:
        None.
    """
    train_idx = first_item_index_of_each_category

    # Features start at the second column; the original author notes that the
    # first column is the label.
    data = df1.values[:, 1:]
    target = df1['label'].values

    # initial training data
    X_train = data[train_idx]
    y_train = target[train_idx]

    # Remove the seed rows from the pool exactly once. Doing it inside the
    # member loop (once per member) re-applied the same positions to the
    # already reduced pool and dropped unrelated rows.
    X_pool = np.delete(data, train_idx, axis=0)
    y_pool = np.delete(target, train_idx)

    # initializing Committee members, all seeded with the same training data
    n_members = 2
    learner_list = [
        ActiveLearner(estimator=RandomForestClassifier(),
                      X_training=X_train,
                      y_training=y_train)
        for _ in range(n_members)
    ]
    # assembling the committee
    committee = Committee(learner_list=learner_list)

    print('%1.3f' % committee.score(data, target))

    performance_array = []
    # Up to 505 queries, but never more than the pool can supply.
    n_queries = min(505, len(X_pool))
    for idx in range(n_queries):
        query_idx, query_instance = committee.query(X_pool)
        committee.teach(X=X_pool[query_idx].reshape(1, -1),
                        y=y_pool[query_idx].reshape(1, ))
        # remove queried instance from pool
        X_pool = np.delete(X_pool, query_idx, axis=0)
        y_pool = np.delete(y_pool, query_idx)
        learner_score = committee.score(data, target)
        print('%1.3f' % (learner_score))
        if idx % 100 == 0:
            performance_array.append(learner_score)
    percentage_increase(performance_array)
Exemple #9
0
    def learn(self):
        """Train a committee on seed data, then teach it randomly drawn pool samples.

        The first row of each distinct ``grades_round`` value in
        ``self.short_df`` seeds the training set; all other rows of
        ``self.X`` / ``self.Y`` form the pool.  One ``ActiveLearner`` is built
        per estimator in ``self.learners`` and the learners are wrapped in a
        ``Committee``.  Every round teaches the committee one uniformly random
        pool sample (no query strategy is involved), removes that sample from
        the pool and scores the committee on what is left.

        The number of rounds is derived from ``self.percent`` and
        ``len(self.X)`` and is capped so that at least one pool sample is left
        to score against.

        Returns:
            tuple: ``(accuracy_list, f1_total_list, kappa_total_list)`` with
            one accuracy, weighted F1 (restricted to the predicted labels) and
            Cohen's kappa value per round, all measured on the remaining pool.
        """
        # seeding: index of the first row of every class
        grades = self.short_df['grades_round']
        seed_index = [grades[grades == label].index[0]
                      for label in grades.unique()]

        accuracy_list = []
        f1_total_list = []
        kappa_total_list = []

        # initialising
        # NOTE(review): the seeds are index labels of short_df but are used as
        # row positions in self.X / self.Y; this presumes a default 0..n-1
        # index -- confirm.
        train_idx = seed_index
        X_train = self.X[train_idx]
        y_train = self.Y[train_idx]

        # generating the pool
        X_pool = np.delete(self.X, train_idx, axis=0)
        y_pool = np.delete(self.Y, train_idx)

        # Committee creation: one learner per configured estimator
        committee = Committee(
            learner_list=[
                ActiveLearner(estimator=model,
                              X_training=X_train,
                              y_training=y_train)
                for model in self.learners
            ],
        )

        # NOTE(review): the learners were already given the seed data above, so
        # this passes the same samples a second time. Kept to preserve the
        # existing training behaviour.
        committee.teach(X_train, y_train)

        # pool-based sampling
        # Was len(X): every other access in this method goes through self.X.
        n_queries = int(len(self.X) / (100 / self.percent))
        # Never drain the pool completely: the scores below need >= 1 sample.
        n_queries = min(n_queries, max(len(X_pool) - 1, 0))
        for _ in range(n_queries):
            query_idx = np.random.choice(len(X_pool))
            committee.teach(X=X_pool[query_idx].reshape(1, -1),
                            y=y_pool[query_idx].reshape(1, ))

            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            # One prediction pass feeds all three metrics of this round.
            model_pred = committee.predict(X_pool)
            accuracy_list.append(accuracy_score(y_pool, model_pred))
            f1_total_list.append(
                f1_score(y_pool,
                         model_pred,
                         average="weighted",
                         labels=np.unique(model_pred)))
            kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))
        return accuracy_list, f1_total_list, kappa_total_list
    plt.show()

# visualizing the Committee's combined predictions before any query
# NOTE(review): matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*'; confirm 'seaborn-white' resolves on the installed version.
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = committee.predict(iris['data'])
    # points are placed by the two columns of `pca` and coloured by predicted class
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Committee initial predictions')
    plt.show()

# query by committee: each round labels one pool sample chosen by the
# committee's query strategy and teaches it to the committee
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = committee.query(X_pool)
    # reshape to a single-row batch / single-element label vector for teach()
    committee.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

# visualizing the final predictions per learner, one subplot per committee member
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members*7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=50)
        plt.title('Learner no. %d predictions after %d queries' % (learner_idx + 1, n_queries))
    plt.show()

# visualizing the Committee's predictions
    # NOTE(review): this indented run is the body of an outer loop whose
    # header is not present here; `i`, `iter`, `X_unlab`, `Y_unlab`,
    # `committee` and `accuracy_list` are defined elsewhere.
    # Walk the unlabelled set cyclically (at most 135 steps) and stop at the
    # first sample whose committee uncertainty is at least 0.8.
    # NOTE(review): `iter` shadows the builtin of the same name.
    for x in range(135):
        if (classifier_uncertainty(committee, X_unlab[iter].reshape(1, -1)) >=
                0.8):
            break
        iter = (iter + 1) % (X_unlab.shape[0])

    # NOTE(review): this selects the row *before* the one the scan stopped on
    # (and the last row when iter is 0) -- confirm that this is intended.
    q_id = iter - 1

    # single-row batch and single-element label vector for teach()
    X_new = X_unlab[q_id].reshape(1, -1)
    Y_new = Y_unlab[q_id].reshape(1, )

    # drop the selected sample from the unlabelled set
    X_unlab, Y_unlab = np.asarray(np.delete(X_unlab, q_id,
                                            axis=0)), np.delete(Y_unlab,
                                                                q_id,
                                                                axis=0)
    committee.teach(X_new, Y_new)

    # accuracy on (X, Y) after this round; note that `x` is reused here
    x = committee.score(X, Y)
    accuracy_list.append(x)

    # progress report every 15th iteration
    if ((i + 1) % 15 == 0):
        print("Accuracy after", i + 1, "iterations :", x)

print("Number of unlabelled data points left are : ", X_unlab.shape[0])

# In[70]:

plt.figure(figsize=(7, 5))
plt.plot(list(range(0,
                    int(queries) + 1)),
         accuracy_list,
# Committee for the random-sampling baseline, built from five members that
# are defined elsewhere.
committee_random = Committee(
    learner_list=[member1r, member2r, member3r, member4r, member5r])

# In[10]:

# accuracy history, starting with the score before any extra sample is taught
random_acc_list = [committee_random.score(X, Y)]

print("Accuracy after", 0, "iterations :", committee_random.score(X, Y))

# fixed seed so the sequence of randomly chosen samples is repeatable
random.seed(99)
for randompts in range(60):
    # NOTE(review): the chosen index is not removed from unlabelled_idx in
    # this loop, so the same sample can be drawn and taught more than once.
    index = random.choice(unlabelled_idx)
    # single-row batch and single-element label vector for teach()
    X_new = X[index].reshape(1, -1)
    Y_new = Y[index].reshape(1, )

    committee_random.teach(X_new, Y_new)
    random_x = committee_random.score(X, Y)
    random_acc_list.append(random_x)

    # progress report every 15th iteration
    if ((randompts + 1) % 15 == 0):
        print("Accuracy after", randompts + 1, "iterations :", random_x)

# In[11]:

import math
unlab_length = X_unlab.shape[0]
disagreement = np.zeros(unlab_length * 2).reshape(unlab_length, 2)

for i in range(unlab_length):
    index = [i]
    predict = [-1, -1, -1, -1, -1]
Exemple #13
0
        # NOTE(review): this is the interior of an enclosing function and loop
        # whose headers are not present here; `xtrain`, `ytrain`, `xtest`,
        # `ytest`, `qs`, `i` and `committee_flag` are defined elsewhere.
        X_initial, y_initial, X_pool, y_pool = init_data_sampling(xtrain, ytrain)

        # Create three learners. If committee_flag is set, build a committee
        # from the first and third learner with vote-entropy sampling;
        # otherwise use the first learner alone. learner2 is not used here.
        learner1, learner2, learner3 = create_learners(qs, X_initial, y_initial)
        if committee_flag:
            learner = Committee(
                learner_list=[learner1, learner3],
                query_strategy=vote_entropy_sampling
            )
        else:
            learner = learner1

        # the active learning loop; results are recorded once before any query
        update_results(ytest, learner.predict(xtest), i)
        n_queries = 10
        for idx in range(n_queries):
            # each round requests a batch of 300 pool instances
            query_idx, query_instance = learner.query(X_pool, n_instances=300)
            learner.teach(
                X=X_pool[query_idx], y=y_pool[query_idx]
            )
            # remove the queried instances from the pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx, axis=0)
            print(idx)
            update_results(ytest, learner.predict(xtest), i)
        plots(i)
        reset_tables()
    f1_plot()