Example #1
    def _ensemble_predictions(
            self, rf: ActiveLearner, lr: ActiveLearner,
            gb: GradientBoostingClassifier, iforest: IsolationForest,
            active_learning_data: ActiveLearningData) -> np.ndarray:
        x_dev = active_learning_data.x_dev
        threshold = sum(self.ensemble_weights.values()) / 2

        return np.vstack([
            rf.predict(x_dev) * self.ensemble_weights['rf'],
            lr.predict(x_dev) * self.ensemble_weights['lr'],
            (iforest.predict(x_dev) == -1) * self.ensemble_weights['iforest'],
            gb.predict(x_dev) * self.ensemble_weights['gb']
        ]).sum(axis=0) >= threshold
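A standalone sketch of the same weighted-vote rule (illustrative weights and predictions, not from the source): each model casts a 0/1 vote scaled by its weight, and the ensemble fires when the weighted votes reach at least half of the total weight.

import numpy as np

# Illustrative weights and binary predictions (assumed for this sketch).
weights = {'rf': 1.0, 'lr': 1.0, 'iforest': 0.5, 'gb': 1.0}
threshold = sum(weights.values()) / 2  # half of the total weight

preds = {
    'rf': np.array([1, 0, 1]),
    'lr': np.array([1, 0, 0]),
    'iforest': np.array([0, 0, 1]),  # IsolationForest's -1/+1 mapped to 1/0
    'gb': np.array([1, 1, 0]),
}
votes = sum(preds[name] * w for name, w in weights.items())
print(votes >= threshold)  # [ True False False]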
Example #2
    def learn(self):
        # seeding
        classes = self.short_df['grades_round'].unique()
        seed_index = []
        for i in classes:
            seed_index.append(self.short_df['grades_round'][
                self.short_df['grades_round'] == i].index[0])

        act_data = self.short_df.copy()
        accuracy_list = []
        f1_total_list = []
        kappa_total_list = []

        # initialising
        train_idx = seed_index
        X_train = self.X[train_idx]
        y_train = self.Y[train_idx]

        # generating the pool
        X_pool = np.delete(self.X, train_idx, axis=0)
        y_pool = np.delete(self.Y, train_idx)

        act_data = act_data.drop(axis=0, index=train_idx)
        act_data.reset_index(drop=True, inplace=True)

        # initializing the active learner

        learner = ActiveLearner(estimator=self.model,
                                X_training=X_train,
                                y_training=y_train,
                                query_strategy=self.query_method)

        # pool-based sampling
        n_queries = int(len(self.X) / (100 / self.percent))
        for idx in range(n_queries):
            query_idx, query_instance = learner.query(X_pool)
            learner.teach(X=X_pool[query_idx].reshape(1, -1),
                          y=y_pool[query_idx].reshape(1, ))

            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            act_data = act_data.drop(axis=0, index=query_idx)
            act_data.reset_index(drop=True, inplace=True)

            accuracy_list.append(learner.score(X_pool, y_pool))
            model_pred = learner.predict(X_pool)
            f1_total_list.append(
                f1_score(y_pool,
                         model_pred,
                         average="weighted",
                         labels=np.unique(model_pred)))
            kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))


        # print('Accuracy after query no. %d: %f' % (idx + 1, learner.score(X_pool, y_pool)))
        # print("By just labelling", round(n_queries * 100.0 / len(self.X), 2),
        #       "% of total data, an accuracy of", round(learner.score(X_pool, y_pool), 3),
        #       "is achieved on the unseen data")
        return accuracy_list, f1_total_list, kappa_total_list
Example #3
def _SVM_loss(multiclass_classifier: ActiveLearner,
              X: modALinput,
              most_certain_classes: Optional[int] = None) -> np.ndarray:
    """
    Utility function for max_loss and mean_max_loss strategies.

    Args:
        multiclass_classifier: sklearn.multiclass.OneVsRestClassifier instance for which the loss
            is to be calculated.
        X: The pool of samples to query from.
        most_certain_classes: optional, indexes of most certainly predicted class for each instance.
            If None, loss is calculated for all classes.

    Returns:
        np.ndarray of shape (n_instances, ), losses for the instances in X.

    """
    predictions = 2 * multiclass_classifier.predict(X) - 1
    n_classes = len(multiclass_classifier.classes_)

    if most_certain_classes is None:
        cls_mtx = 2 * np.eye(n_classes, n_classes) - 1
        loss_mtx = np.maximum(1 - np.dot(predictions, cls_mtx), 0)
        return loss_mtx.mean(axis=1)
    else:
        cls_mtx = -np.ones(shape=(len(X), n_classes))
        for inst_idx, most_certain_class in enumerate(most_certain_classes):
            cls_mtx[inst_idx, most_certain_class] = 1

        cls_loss = np.maximum(1 - np.multiply(cls_mtx, predictions),
                              0).sum(axis=1)
        return cls_loss
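A minimal usage sketch (not from the source): per the docstring, the function expects a fitted sklearn.multiclass.OneVsRestClassifier trained on multilabel indicator targets, so that predict returns an (n_instances, n_classes) matrix of 0/1 values; X_seed, y_seed and X_pool are assumed placeholders.

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# X_seed, y_seed (a 0/1 indicator matrix) and X_pool are assumed placeholders.
ovr = OneVsRestClassifier(LinearSVC()).fit(X_seed, y_seed)
mean_losses = _SVM_loss(ovr, X_pool)  # shape (n_instances,)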
Example #4
def al_Loop(estimator, X_train, Y_train, X, Y, X_test, Y_test, indexs):
    learner = ActiveLearner(estimator=estimator,
                            X_training=X_train,
                            y_training=Y_train)
    X_pool = np.delete(X, indexs, axis=0)
    Y_pool = np.delete(Y, indexs, axis=0)
    index = 0

    accuracy = 0
    while len(X_pool) > 0:
        query_index, _ = learner.query(X_pool)
        x, y = X_pool[query_index].reshape(1, -1), Y_pool[query_index].reshape(
            1, )
        learner.teach(X=x, y=y)
        X_pool, Y_pool = np.delete(X_pool, query_index,
                                   axis=0), np.delete(Y_pool, query_index)
        model_accuracy = 1 - learner.score(X_pool, Y_pool)

        print('Error after query {n}: {acc:0.4f}'.format(n=index + 1,
                                                         acc=model_accuracy))
        accuracy = model_accuracy
        # test-set error rate
        predicts = learner.predict(X_test)
        test_error = 1 - np.mean(predicts == Y_test)
        print(test_error)
        index += 1
    return learner
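A hypothetical call (placeholder data, not from the source): indexs holds the indices of X/Y that were already used to build X_train/Y_train, so those rows are excluded from the pool.

from sklearn.neighbors import KNeighborsClassifier

# X, Y, X_train, Y_train, X_test, Y_test, initial_indices are placeholders.
trained_learner = al_Loop(KNeighborsClassifier(), X_train, Y_train,
                          X, Y, X_test, Y_test, indexs=initial_indices)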
Example #5
    def al_pool(self, data, target, X_train, y_train, X_full, y_full, train_idx):
        acc = []
        X_pool = np.delete(data, train_idx, axis=0)
        y_pool = np.delete(target, train_idx)
        learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            X_training=X_train, y_training=y_train
        )

        n_queries = self.query_number
        # n_queries = 1500
        for idx in range(n_queries):
            query_idx, query_instance = learner.query(X_pool)
            learner.teach(
                X=X_pool[query_idx].reshape(1, -1),
                y=y_pool[query_idx].reshape(1, )
            )
            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)
            learner_score = learner.score(data, target)

            # learner.estimator
            # print('Accuracy after query no. %d: %f' % (idx + 1, learner_score))
            X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.30)
            y_predict = learner.predict(X_test)

            precision, recall, fscore, support = self.performance_measure(learner, X_full, y_full)
            acc.append(learner_score)
            print('%0.3f' % (learner_score), end=",")
        return acc
Example #6
def gaussian_process_max_std(regressor: ActiveLearner,
                             X: np.ndarray,
                             batch_size: int = 10):
    _, std = regressor.predict(X, return_std=True)
    idxs = np.argsort(std)[::-1]
    idxs = idxs[:batch_size]
    return idxs, X[idxs]
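A usage sketch (assumed setup, not from the source): the strategy plugs into a modAL ActiveLearner whose estimator supports predict(..., return_std=True), such as a GaussianProcessRegressor; X_initial, y_initial and X_pool are placeholders.

from modAL.models import ActiveLearner
from sklearn.gaussian_process import GaussianProcessRegressor

regressor = ActiveLearner(estimator=GaussianProcessRegressor(),
                          query_strategy=gaussian_process_max_std,
                          X_training=X_initial, y_training=y_initial)
query_idx, query_samples = regressor.query(X_pool)  # ten highest-std points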
Example #7
    def learn(self):
        # seeding
        classes = self.short_df['grades_round'].unique()
        seed_index = []
        for i in classes:
            seed_index.append(self.short_df['grades_round'][
                self.short_df['grades_round'] == i].index[0])

        act_data = self.short_df.copy()
        accuracy_list = []
        f1_total_list = []
        kappa_total_list = []

        # initialising
        train_idx = seed_index
        X_train = self.X[train_idx]
        y_train = self.Y[train_idx]

        # generating the pool
        X_pool = np.delete(self.X, train_idx, axis=0)
        y_pool = np.delete(self.Y, train_idx)

        act_data = act_data.drop(axis=0, index=train_idx)
        act_data.reset_index(drop=True, inplace=True)

        # initializing the random learner
        learner = ActiveLearner(
            estimator=self.model,
            X_training=X_train,
            y_training=y_train,
        )

        # pool-based sampling
        n_queries = int(len(self.X) / (100 / self.percent))
        for idx in range(n_queries):
            query_idx = np.random.choice(range(len(X_pool)))
            learner.teach(X=X_pool[query_idx].reshape(1, -1),
                          y=y_pool[query_idx].reshape(1, ))

            # remove queried instance from pool
            X_pool = np.delete(X_pool, query_idx, axis=0)
            y_pool = np.delete(y_pool, query_idx)

            act_data = act_data.drop(axis=0, index=query_idx)
            act_data.reset_index(drop=True, inplace=True)

            accuracy_list.append(learner.score(X_pool, y_pool))

            model_pred = learner.predict(X_pool)
            f1_total_list.append(
                f1_score(y_pool,
                         model_pred,
                         average="weighted",
                         labels=np.unique(model_pred)))
            kappa_total_list.append(cohen_kappa_score(y_pool, model_pred))
        return accuracy_list, f1_total_list, kappa_total_list
Example #8
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    performance_history = [[] for i in range(n_queries)]
    for i in range(rep_times):
        # print('exp:', i)
        # print('exp:', i, file=fd)

        n_labeled_examples = X.shape[0]
        X_trn_all, X_tst, y_trn_all, y_tst = train_test_split(
            X, y, test_size=test_size, stratify=y)
        X_trn_all = X_trn_all[:, 1:]
        y_tst = X_tst[:, 0]
        X_tst = X_tst[:, 1:]
        y_tst = y_tst.astype('int32')

        X_trn_min, y_trn_min, X_trn, y_trn = get_init_train(
            X_trn_all, y_trn_all)
        # print('ground truth:', y_tst, file=fd)

        learner = ActiveLearner(estimator=estimator,
                                X_training=X_trn_min,
                                y_training=y_trn_min)

        # prediction with no query
        predictions_0 = learner.predict(X_tst)
        err_0 = error_calculation(predictions_0, y_tst)

        for j in range(n_queries):
            query_index, query_instance = learner.query(X_trn)
            X_qry, y_qry = X_trn[query_index].reshape(
                1, -1), y_trn[query_index].reshape(1, )
            learner.teach(X=X_qry, y=y_qry)
            X_trn, y_trn = np.delete(X_trn, query_index,
                                     axis=0), np.delete(y_trn, query_index)
            predictions = learner.predict(X_tst)
            err = error_calculation(predictions, y_tst)
            performance_history[j].append(err)

    avg_err = []
    sd = []
    for i in range(n_queries):
        avg_err.append(np.mean(performance_history[i]))
        sd.append(np.std(performance_history[i]) / np.sqrt(rep_times))

    return avg_err, sd
Example #9
def run(X_initial, y_initial, n_samples_for_initial, n_queries, estimator):
    np.random.seed(0)
    start_time = time.time()
    # Isolate our examples for our labeled dataset.
    n_labeled_examples = X_initial.shape[0]
    training_indices = np.random.randint(low=0, high=n_labeled_examples, size=n_samples_for_initial)

    X_train = X_initial[training_indices, :]
    y_train = y_initial[training_indices]

    # Isolate the non-training examples we'll be querying.
    X_pool = delete_rows_csr(X_initial, training_indices)
    y_pool = np.delete(y_initial, training_indices)
    # Pre-set our batch sampling to retrieve 3 samples at a time.
    BATCH_SIZE = 3
    preset_batch = partial(uncertainty_batch_sampling, n_instances=BATCH_SIZE)

    # Specify our active learning model.
    learner = ActiveLearner(
        estimator=estimator,
        X_training=X_train,
        y_training=y_train,
        query_strategy=preset_batch
    )

    initial_accuracy = learner.score(X_initial, y_initial)
    print("Initial Accuracy: ", initial_accuracy)
    performance_history = [initial_accuracy]

    f1_score = 0
    index = 0
    while f1_score < 0.65:
        index += 1
        query_index = np.random.choice(y_pool.shape[0], size=1, replace=False)

        # Teach our ActiveLearner model the random record that was sampled.
        X, y = X_pool[query_index, :], y_pool[query_index]
        learner.teach(X=X, y=y)

        # Remove the queried instance from the unlabeled pool.
        X_pool = delete_rows_csr(X_pool, query_index)
        y_pool = np.delete(y_pool, query_index)

        # Calculate and report our model's f1_score.
        y_pred = learner.predict(X_initial)
        f1_score = metrics.f1_score(y_initial, y_pred, average='micro')

        if index % 100 == 0:
            print('F1 score after {n} training samples: {f1:0.4f}'.format(n=index, f1=f1_score))

        # Save our model's performance for plotting.
        performance_history.append(f1_score)

    print("--- %s seconds ---" % (time.time() - start_time))
    return index
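delete_rows_csr is called above but not defined in this snippet; a common implementation (an assumption, not taken from this source) masks out the given rows of a SciPy CSR matrix:

import numpy as np
from scipy.sparse import csr_matrix

def delete_rows_csr(mat, indices):
    """Return a copy of the CSR matrix mat with the given rows removed."""
    if not isinstance(mat, csr_matrix):
        raise ValueError('works only for CSR format')
    mask = np.ones(mat.shape[0], dtype=bool)
    mask[indices] = False
    return mat[mask]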
Example #10
def run_model(X, y, test_size, rep_times, n_queries, estimator, fd):
    performance_history = [[] for i in range(n_queries)]
    for i in range(rep_times):
        print('exp:', i)
        # print('exp:', i, file=fd)
        
        n_labeled_examples = X.shape[0]
        X_trn_all, X_tst, y_trn_all, y_tst = train_test_split(X, y, test_size=test_size, stratify=y)
        # get initial training set, which size = n_class
        X_trn_min, y_trn_min, X_trn, y_trn = get_init_train(X_trn_all, y_trn_all)
        # print('ground truth:', y_tst, file=f_2)

        learner = ActiveLearner(estimator=estimator, X_training=X_trn_min, y_training=y_trn_min)

        # prediction with no query
        predictions_0 = learner.predict(X_tst)
        err_0 = error_calculation(predictions_0, y_tst)

        # print('query no.', 0, file=f_2)
        # print('predictions:', predictions_0, file=f_2)
        # print('MSE:', err_0, file=f_2)

        for j in range(n_queries):
            query_index, query_instance = learner.query(X_trn)
            X_qry, y_qry = X_trn[query_index].reshape(1, -1), y_trn[query_index].reshape(1, )
            learner.teach(X=X_qry, y=y_qry)
            X_trn, y_trn = np.delete(X_trn, query_index, axis=0), np.delete(y_trn, query_index)
            predictions = learner.predict(X_tst)
            err = error_calculation(predictions, y_tst)
            # print('query no.', j+1, file=f_2)
            # print('predictions:', predictions, file=f_2)
            # print('MSE:', err, file=f_2)
            performance_history[j].append(err)

    avg_err = []
    for i in range(n_queries):
        avg_err.append(np.mean(performance_history[i]))

    return avg_err
Example #11
def active_learner(query_stra, N_query):
  knn = KNeighborsClassifier(n_neighbors=8)
  learner = ActiveLearner(estimator=knn, X_training=X_train, y_training=y_train, query_strategy=query_stra)

  predictions = learner.predict(X_test)

  X_pool = X_test.values
  y_pool = y_test.values

  for index in range(N_query):
    query_index, query_instance = learner.query(X_pool)
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)
    X_pool, y_pool = np.delete(X_pool, query_index, axis=0), np.delete(y_pool, query_index)
    
  model_accuracy = learner.score(X_test, y_test)
  print('Accuracy: {acc:0.4f} \n'.format(acc=model_accuracy))
  performance_history.append(model_accuracy)
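A hypothetical call (not in the source): the function relies on module-level X_train, y_train, X_test, y_test and performance_history, with X_test and y_test as pandas objects since it reads their .values.

from modAL.uncertainty import uncertainty_sampling

performance_history = []  # assumed module-level list used by active_learner
active_learner(uncertainty_sampling, N_query=50)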
Example #12
    def _active_learning_update_metrics(
            self, active_learner: ActiveLearner, x_dev: np.ndarray,
            y_dev: Series, stats: Stats, data_for_plotting: List[Stats],
            i: int, elapsed_train: float, elapsed_query: float,
            labeled_indices: List[int],
            semi_sup: bool) -> Tuple[Stats, List[Stats], List[int]]:
        predicted = active_learner.predict(x_dev)
        scores = None if semi_sup else active_learner.predict_proba(x_dev)[:, 1]
        metrics = self._get_metrics(actual=y_dev,
                                    predicted=predicted,
                                    scores=scores)

        data_for_plotting.append(
            self._get_plotting_row(i, metrics, elapsed_train, elapsed_query))
        metrics = util.add_prefix_to_dict_keys(metrics, f'sample_{i+1}_')
        if i + 1 in self.active_learning_log_intervals or i == -1:
            stats = util.merge_dicts(stats, metrics)
        return stats, data_for_plotting, labeled_indices
Example #13
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(
    estimator=KNeighborsClassifier(n_neighbors=3),
    X_training=X_train, y_training=y_train
)

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' % learner.score(iris['data'], iris['target']))
    plt.show()

print('Accuracy before active learning: %f' % learner.score(iris['data'], iris['target']))

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
Example #14
def active_learning(data,
                    n_queries,
                    y_column,
                    estimator=RandomForestClassifier(),
                    limit_cols=None,
                    mode=paths.dataset_version):
    line = False
    if y_column in [
            'Marginal', 'Heading'
    ]:  # covers marginal_lines, heading_id_toc, heading_id_intext
        line = True  # determines whether a line or a page is to be displayed
    classes = pd.unique(data[y_column].values)  #todo: check type
    classes = sorted(filter(lambda v: v == v, classes))
    X_initial, Y_initial, X_pool, y_pool, refs = al_data_prep(
        data, y_column, limit_cols, mode)
    if mode == paths.production:
        test_percentage = 0
    else:
        test_percentage = 0.2
    if 'lstm' in estimator.named_steps:
        test_size = int(X_initial.shape[0] * test_percentage)
        X_train, y_train = X_initial[:-test_size], Y_initial[:-test_size]
        X_test, y_test = X_initial[-test_size:], Y_initial[-test_size:]
    else:
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X_initial, Y_initial, test_size=test_percentage)
    learner = ActiveLearner(
        estimator=estimator,  #ensemble.RandomForestClassifier(),
        query_strategy=uncertainty_sampling,
        X_training=X_train.values,
        y_training=y_train.astype(int))
    accuracy_scores = [learner.score(X_test, y_test.astype(int))]
    if 'boreholes' not in mode:
        query_idx, query_inst = learner.query(X_pool, n_instances=n_queries)
        query_idx = np.asarray([refs['idx'][i] for i in query_idx])
    else:
        query_idx, query_inst = borehole_sample(X_pool, n_queries)
    y_new = np.zeros(n_queries, dtype=int)
    time.sleep(5)
    for i in range(n_queries):
        idx = query_idx[i]
        #page=int(query_inst[i][0])
        if 'boreholes' not in mode:
            page = refs['pagenums'].loc[idx]
        if line:
            line = refs['linenums'].loc[idx]
        if 'boreholes' in mode:
            page = refs['Tables'].loc[idx]
        y = al_input_loop(learner,
                          query_inst[i],
                          refs['docids'].loc[idx],
                          n_queries,
                          classes,
                          page=page,
                          line=line,
                          mode=mode)
        y_new[i] = y
        #print("index: ", idx)
        #print("x: ", data.at[idx, 'Columns'])
        data.at[idx, y_column] = y  # save value to copy of data
        data.at[idx, 'TagMethod'] = 'manual'

    learner.teach(query_inst, y_new)  # reshape 1, -1
    accuracy_scores.append(learner.score(X_test, y_test.astype(int)))
    preds = learner.predict(X_test)
    #print("End of annotation. Samples, predictions, annotations: ")
    #print(ref_docids.iloc[query_idx].values,
    #      np.concatenate((query_inst, np.array([preds]).T, y_new.reshape(-1, 1)), axis=1))
    print(sklearn.metrics.confusion_matrix(preds, y_test.astype(int)))
    accuracy = accuracy_scores[-1]
    print(accuracy)
    return data, accuracy, learner
Example #15
def active_learning(vectorizer_method, X_train, y_train, X_test, y_test, orig_df, X_new, model, qstrategy, n_queries, model_filename, df_filename):

    classifier = None
    strategy = None

    if model == 'LR':
        classifier = LogisticRegression()
    elif model == 'NB':
        classifier = MultinomialNB()
    elif model == 'SVM':
        classifier = SVC(kernel='linear', probability=True)
    elif model == 'RF':
        classifier = RandomForestClassifier()

    if qstrategy == 'CE':
        strategy = classifier_entropy
    elif qstrategy == 'CM':
        strategy = classifier_margin
    elif qstrategy == 'CU':
        strategy = classifier_uncertainty
    elif qstrategy == 'ES':
        strategy = entropy_sampling
    elif qstrategy == 'MS':
        strategy = margin_sampling
    elif qstrategy == 'US':
        strategy = uncertainty_sampling

    learner = ActiveLearner(
                 estimator=classifier,
                 query_strategy=strategy,
                 X_training=X_train, y_training=y_train
             )

    accuracy_scores = [learner.score(X_test, y_test)]
    recall_scores = [recall_score(y_test, learner.predict(X_test))]

    for i in range(n_queries):
        #print(X_train.shape)
        #print(X_new.shape)
        #print(orig_df.iloc[0])
        query_idx, query_inst = learner.query(X_new)
        #print(query_inst)
        #print(query_idx)
        print(orig_df['text'].iloc[query_idx[0]])
        print("Is this a data reuse statement or not (1=yes, 0=no)?")
        try:
            y_new = np.array([int(input())], dtype=int)
            if y_new in [0,1]:
                orig_df.loc[query_idx[0], 'data_reuse'] = y_new
                learner.teach(query_inst.reshape(1, -1), y_new)

                X_new = csr_matrix(np.delete(X_new.toarray(), query_idx, axis=0))

                accuracy_scores.append(learner.score(X_test, y_test))
                recall_scores.append(recall_score(y_test, learner.predict(X_test)))
                #print(accuracy_scores)
                #print(recall_scores)
                print()
            else:
                print("Input not accepted. Type '1' for yes or '0' for no. Skipping.")
                print()
        except Exception:
            print("Encountered Error: " + str(sys.exc_info()))
            print()
            return

    # Performance of classifier
    with plt.style.context('seaborn-white'):
        plt.figure(figsize=(10, 5))
        plt.title('Performance of the classifier during the active learning')
        #plt.plot(range(n_queries+1), accuracy_scores)
        #plt.scatter(range(n_queries+1), accuracy_scores)
        plt.plot(range(n_queries+1), recall_scores)
        plt.scatter(range(n_queries+1), recall_scores)
        plt.xlabel('Number of queries')
        plt.ylabel('Performance')
        plt.savefig('/Users/G/Loyola/Spring2020/DS796/active_model_' + vectorizer_method + '_' + model + '_performance.png')
        print("Graph saved: /Users/G/Loyola/Spring2020/DS796/active_model_" + vectorizer_method + '_' + model + "_performance.png")
        print()
        #plt.show()
        plt.close()

    fd = open(model_filename, 'wb')
    pickle.dump(learner, fd)
    fd.close()
    print("Model saved: ", model_filename)
    print()

    orig_df.to_csv(df_filename, index=False)
    print("Dataframe saved: ", df_filename)
    print()
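A hypothetical call (placeholder data and paths): per the branches above, 'LR' selects LogisticRegression and 'US' selects uncertainty_sampling.

active_learning('tfidf', X_train, y_train, X_test, y_test, orig_df, X_new,
                model='LR', qstrategy='US', n_queries=5,
                model_filename='active_model.pkl', df_filename='labeled.csv')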
Example #16
for _ in range(n_learners):
    learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=10),
                            X_training=X_pool[initial_idx],
                            y_training=y_pool[initial_idx],
                            bootstrap_init=True)
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# visualizing every learner in the Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7 * n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx + 1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()

# visualizing the Committee's predictions per learner
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    plt.imshow(committee.predict(X_pool).reshape(im_height, im_width))
    plt.title('Committee consensus predictions')
    plt.show()

# rebagging the data
committee.rebag()

# visualizing the learners in the retrained Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7 * n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx + 1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()
Example #17
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=10),
        X_training=X_pool[initial_idx], y_training=y_pool[initial_idx],
        bootstrap_init=True
    )
    learner_list.append(learner)

# assembling the Committee
committee = Committee(learner_list)

# visualizing every learner in the Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7*n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx+1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()

# visualizing the Committee's predictions per learner
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    plt.imshow(committee.predict(X_pool).reshape(im_height, im_width))
    plt.title('Committee consensus predictions')
    plt.show()

# rebagging the data
committee.rebag()

# visualizing the learners in the retrained Committee
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7*n_learners, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_learners, learner_idx+1)
        plt.imshow(learner.predict(X_pool).reshape(im_height, im_width))
        plt.title('Learner no. %d' % (learner_idx + 1))
    plt.show()
Example #18
def activeLearning(method, X_train, Y_train, X_test, Y_test, K):

    iterations = 101
    random.seed(0)

    # Define initial labels indexs to train classifier
    if method in ["RDS", "MST-BE"]:
        idx, root_idx, X_initial, Y_initial, X_pool, Y_pool = activeLearningLib_Object.get_samples(
            X_train,
            Y_train,
            n_clusters=int(len(np.unique(Y_train)) * 2),
            strategy=method)
        labeled_idx = np.empty(0, int)
    else:
        idx = np.asarray(random.sample(range(0, len(X_train)), k=K))
        X_initial, Y_initial = X_train[idx], Y_train[idx]
        X_pool, Y_pool = np.delete(X_train, idx, axis=0), np.delete(Y_train,
                                                                    idx,
                                                                    axis=0)

    # Initialize Active Learning Methods
    t = time.time()
    if method == "Entropy Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=entropy_sampling,
                                X_training=X_initial,
                                y_training=Y_initial)
    elif method == "Margin Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=margin_sampling,
                                X_training=X_initial,
                                y_training=Y_initial)
    elif method == "Uncertainty Sampling":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=uncertainty_sampling,
                                X_training=X_initial,
                                y_training=Y_initial)
    elif method == "Average Confidence":
        learner = ActiveLearner(estimator=SVC(probability=True),
                                query_strategy=avg_confidence,
                                X_training=X_initial,
                                y_training=Y_initial)
    elif method == "RDS":
        learner = ActiveLearner(
            estimator=SVC(probability=True),
            # estimator = SupervisedOPF(distance = "log_squared_euclidean", pre_computed_distance = None),
            query_strategy=root_distance_based_selection_strategy,
            X_training=X_initial,
            y_training=Y_initial)
    elif method == "MST-BE":
        learner = ActiveLearner(
            estimator=SVC(probability=True),
            # estimator = SupervisedOPF(distance = "log_squared_euclidean", pre_computed_distance = None),
            query_strategy=disagree_labels_edges_idx_query_strategy,
            X_training=X_initial,
            y_training=Y_initial)
    timeToTrain = time.time() - t

    results = []

    labeledData_X = X_initial
    labeledData_Y = Y_initial

    for run in range(iterations):

        if K > len(idx): break

        if method in ["RDS", "MST-BE"]:

            kwargs = dict(idx=idx, labeled_idx=labeled_idx, y_root=Y_initial)

            t = time.time()
            query_idx, idx = learner.query(X_pool, n_instances=K, **kwargs)
            timeToSelect = time.time() - t

            if query_idx is None or len(query_idx) < K: break
            labeled_idx = np.append(labeled_idx, query_idx)

            predsCorrecteds = learner.predict(X_pool[query_idx])
            counter = 0
            for (x, y) in zip(predsCorrecteds, Y_pool[query_idx].flatten()):
                if x != y:
                    counter += 1

            t = time.time()
            learner.teach(X=X_pool[query_idx], y=Y_pool[query_idx])
            timeToTrain = time.time() - t

            labeledData_X = np.vstack((labeledData_X, X_pool[query_idx]))
            labeledData_Y = np.vstack((labeledData_Y, Y_pool[query_idx]))
            t = time.time()
            # model = SupervisedOPF(distance = "log_squared_euclidean", pre_computed_distance = None)
            # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
            preds = learner.predict(X_test.values)
            timeToTest = time.time() - t

            acc = accuracy_score(Y_test, preds)
            f1score = f1_score(Y_test, preds, average='macro')
            precision = precision_score(Y_test, preds, average='macro')
            recall = recall_score(Y_test, preds, average='macro')
            knowClasses = len(set(preds.tolist()))

            print("Run {}: Acc: {}".format(run + 1, acc))
            print("Know Classes: {}".format(knowClasses))
            print("Corrected Labels: {}".format(counter))
            print("Time to Select: {}".format(timeToSelect))
        else:
            if run == 0:

                t = time.time()
                # model = SupervisedOPF(distance = "log_squared_euclidean", pre_computed_distance = None)
                # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
                preds = learner.predict(X_test.values)
                timeToTest = time.time() - t

                acc = accuracy_score(Y_test, preds)
                f1score = f1_score(Y_test, preds, average='macro')
                precision = precision_score(Y_test, preds, average='macro')
                recall = recall_score(Y_test, preds, average='macro')
                knowClasses = len(set(preds.tolist()))
                counter = len(Y_initial)
                timeToSelect = 0

                print("Run {}: Acc: {}".format(run + 1, acc))
                print("Know Classes: {}".format(knowClasses))
                print("Corrected Labels: {}".format(counter))
                print("Time to Select: {}".format(timeToSelect))
            else:
                try:
                    t = time.time()
                    query_idx, idx = learner.query(X_pool, n_instances=K)
                    timeToSelect = time.time() - t
                except Exception:
                    timeToSelect = 0
                    print("query selection failed")
                    break

                predsCorrecteds = learner.predict(X_pool[query_idx])
                counter = 0
                for (x, y) in zip(predsCorrecteds,
                                  Y_pool[query_idx].flatten()):
                    if x != y:
                        counter += 1

                t = time.time()
                learner.teach(X=X_pool[query_idx], y=Y_pool[query_idx])
                # X_pool, Y_pool = np.delete(X_pool, query_idx, axis=0), np.delete(Y_pool, query_idx, axis=0)
                timeToTrain = time.time() - t

                # t = time.time()
                # preds = learner.predict(X_test)
                # timeToTest = time.time() - t

                labeledData_X = np.vstack((labeledData_X, X_pool[query_idx]))
                labeledData_Y = np.vstack((labeledData_Y, Y_pool[query_idx]))
                t = time.time()
                # model = SupervisedOPF(distance = "log_squared_euclidean", pre_computed_distance = None)
                # trained_model = model.fit(labeledData_X, labeledData_Y.flatten().astype("int"))
                preds = learner.predict(X_test.values)
                X_pool, Y_pool = np.delete(X_pool, query_idx,
                                           axis=0), np.delete(Y_pool,
                                                              query_idx,
                                                              axis=0)
                timeToTest = time.time() - t

                acc = accuracy_score(Y_test, preds)
                f1score = f1_score(Y_test, preds, average='macro')
                precision = precision_score(Y_test, preds, average='macro')
                recall = recall_score(Y_test, preds, average='macro')
                knowClasses = len(set(preds.tolist()))

                print("Run {}: Acc: {}".format(run + 1, acc))
                print("Know Classes: {}".format(knowClasses))
                print("Corrected Labels: {}".format(counter))
                print("Time to Select: {}".format(timeToSelect))

        results.append([
            run + 1, K,
            np.round(timeToTrain, 2),
            np.round(timeToTest, 2),
            np.round(timeToSelect, 2),
            np.round(acc * 100, 2),
            np.round(f1score * 100, 2),
            np.round(precision * 100, 2),
            np.round(recall * 100, 2), knowClasses, counter
        ])

    results_df = pd.DataFrame(results,
                              columns=[
                                  "iteration", "k-value", "time-to-train",
                                  "time-to-test", "time-to-select", "accuracy",
                                  "f1-score", "precision", "recall",
                                  "knowClasses", "correctedLabels"
                              ])

    return results_df
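A hypothetical call (data splits assumed): X_test is expected to be a pandas DataFrame, since the code above reads X_test.values.

results_df = activeLearning("Uncertainty Sampling",
                            X_train, Y_train, X_test, Y_test, K=10)
print(results_df.head())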
Example #19
X_train = data[train_idx]
y_train = target[train_idx]

# generating the pool
X_pool = np.delete(data, train_idx, axis=0)
y_pool = np.delete(target, train_idx)

# initializing the active learner
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train,
                        y_training=y_train)

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(data)
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' % learner.score(data, target))
    plt.show()

print('Accuracy before active learning: %f' % learner.score(data, target))

# pool-based sampling
n_queries = 30
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(X=X_pool[query_idx].reshape(1, -1),
                  y=y_pool[query_idx].reshape(1, ))
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
Example #20
    # initializing learner
    learner = ActiveLearner(
        estimator=RandomForestClassifier(),
        X_training=X_train, y_training=y_train
    )
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)

# visualizing the initial predictions
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members*7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(x=pca[:, 0], y=pca[:, 1], c=learner.predict(iris['data']), cmap='viridis', s=50)
        plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
    plt.show()

# visualizing the Committee's predictions per learner
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = committee.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Committee initial predictions')
    plt.show()

# query by committee
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = committee.query(X_pool)
Example #21
class SentenceClassifier:
    def __init__(self, PATH):
        self.user_data_path = PATH + '/data/text/user-classifier-data.txt'
        self.synthetic_data_path = PATH + '/data/text/synthetic-classifier-data.txt'
        self.setup_model()

    def setup_model(self):
        '''
        Define active learner and train
        with synthetic + stored examples
        '''
        # Read in training data
        with open(self.user_data_path, encoding='utf-8') as f:
            data = f.read().split('\n')

        with open(self.synthetic_data_path, encoding='utf-8') as f:
            data += f.read().split('\n')

        # Remove duplicates
        data = set(data)

        # Setup vectorizer and prepare training data
        self.vectorizer = CountVectorizer()
        self.X, self.y = [], []
        for row in data:
            row = row.split('\t')
            if len(row) == 2:
                self.X.append(row[0].strip())
                self.y.append(int(row[1]))
        self.X = self.vectorizer.fit_transform(self.X)

        self.learner = ActiveLearner(estimator=RandomForestClassifier(),
                                     query_strategy=uncertainty_sampling,
                                     X_training=self.X,
                                     y_training=self.y)

    def get_target_sentences(self, text, annotations):
        '''
        Return sentences that contain
        a prescription
        '''
        sentences = self.text_to_sentences(text)

        target_sentences = []
        for sentence in sentences:
            classification = self.learner.predict(
                self.vectorizer.transform([sentence]))
            if classification[0] == 1 and self.convert_to_export_format(
                    sentence) not in annotations:
                target_sentences.append(sentence)
        return target_sentences

    def convert_to_export_format(self, sentence):
        return '-'.join(sentence.split(' '))

    def text_to_sentences(self, text):
        '''
        Convert body of text into individual sentences
        '''
        sentences = re.split(delimiters, text)
        sentences = map(self.clean_sentence, sentences)
        return list(filter(self.is_valid_sentence, sentences))

    def clean_sentence(self, sentence):
        return sentence.strip()

    def is_valid_sentence(self, sentence):
        return sentence != '' and sentence not in stopwords and len(
            sentence.split(' ')) >= 3

    def teach(self, sentence, label):
        '''
        Save training data and update model
        '''
        # Store local data
        sentence = sentence.lower().strip()
        with open(self.user_data_path, 'a', encoding='utf-8') as f:
            f.write(sentence + '\t' + str(label) + '\n')

        # Setup learner with new data (to-do: train incrementally)
        self.setup_model()
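A hypothetical driver (not from the source; PATH, the two data files, and the module-level delimiters and stopwords are assumed to exist):

clf = SentenceClassifier(PATH='.')
hits = clf.get_target_sentences(document_text, annotations=set())  # document_text is a placeholder
clf.teach('take two tablets daily', 1)  # store the label and retrain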
Example #22
# defining the kernel for the Gaussian process
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
         + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# initializing the active learner
regressor = ActiveLearner(estimator=GaussianProcessRegressor(kernel=kernel),
                          query_strategy=GP_regression_std,
                          X_training=X_initial.reshape(-1, 1),
                          y_training=y_initial.reshape(-1, 1))

# plotting the initial estimation
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x,
                     pred.reshape(-1, ) - std,
                     pred.reshape(-1, ) + std,
                     alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Initial estimation based on %d points' % n_initial)
    plt.show()

# active learning
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
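GP_regression_std is referenced above but not defined in this snippet; the standard version from the modAL documentation (reproduced here as an assumption about what the author used) queries the pool point with the largest predictive standard deviation:

import numpy as np

def GP_regression_std(regressor, X):
    _, std = regressor.predict(X, return_std=True)
    query_idx = np.argmax(std)
    return query_idx, X[query_idx]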
Example #23
X_train = iris['data'][train_idx]
y_train = iris['target'][train_idx]

# generating the pool
X_pool = np.delete(iris['data'], train_idx, axis=0)
y_pool = np.delete(iris['target'], train_idx)

# initializing the active learner
learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=3),
                        X_training=X_train,
                        y_training=y_train)

# visualizing initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Initial accuracy: %f' %
              learner.score(iris['data'], iris['target']))
    plt.show()

print('Accuracy before active learning: %f' %
      learner.score(iris['data'], iris['target']))

# pool-based sampling
n_queries = 20
for idx in range(n_queries):
    query_idx, query_instance = learner.query(X_pool)
    learner.teach(X=X_pool[query_idx].reshape(1, -1),
                  y=y_pool[query_idx].reshape(1, ))
    # remove queried instance from pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)
Example #24
                        query_strategy=entropy_sampling,
                        X_training=train_set_x_mean_vectors,
                        y_training=np.asarray(train_set_y))

for i in range(queries_number):
    print('\n\n', i + 1, 'from', queries_number)
    print_classes()

    query_idx, query_inst = learner.query(pool_x_mean_vectors)
    message = pool_x[int(query_idx)]
    print('MESSAGE:', utils.regex_preprocessing(message))

    new_label = np.array([utils.get_new_label_from_user()], dtype=int)
    new_data_set.append({
        'message': pool_x[int(query_idx)],
        'purpose': encoder.inverse_transform(new_label)[0]
    })
    learner.teach(query_inst, new_label)

    pool_x_mean_vectors = np.delete(pool_x_mean_vectors, query_idx, axis=0)
    pool_x = np.delete(pool_x, query_idx, axis=0)

predictions = learner.predict(pool_x_mean_vectors)

predicted_set = [{
    'message': pool_x[i],
    'purpose': predictions[i]
} for i in range(len(pool_x))]
predicted_set += new_data_set
data_set = utils.write_to_csv(predicted_set, eclipse_output_file)
Example #25
x_new = x[training_indices]
y_new = y[training_indices]

# Isolate the non-training examples we'll be querying.
x_pool = np.delete(x, training_indices, axis=0)
y_pool = np.delete(y, training_indices, axis=0)


classifier1 = RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=50)
classifier2 = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=classifier1,
                        X_training=x_train,
                        y_training=y_train)

predictions = learner.predict(x)
is_correct = (predictions == y)
unqueried_score = learner.score(x, y)
print('Accuracy after first 1000 random rows: {acc:0.4f}%'.format(
    acc=unqueried_score * 100))
performance_history = [unqueried_score]

count = 1
while performance_history[-1] * 100 < 90:
    queryList = []
    query_index, query_instance = learner.query(x_pool, n_instances=1000)
    training_indices = np.concatenate([training_indices, query_index])
    x_temp, y_temp = x_pool[query_index], y_pool[query_index]
    x_new = np.concatenate([x_new, x_temp])
    y_new = np.concatenate([y_new, y_temp])
    learner.teach(X=x_temp, y=y_temp)
    x_pool = np.delete(x_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index, axis=0)
    performance_history.append(learner.score(x, y))
    count += 1
Example #26
# defining the kernel for the Gaussian process
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) \
         + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))

# initializing the active learner
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_initial.reshape(-1, 1), y_training=y_initial.reshape(-1, 1)
)

# plotting the initial estimation
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
    x = np.linspace(0, 20, 1000)
    pred, std = regressor.predict(x.reshape(-1, 1), return_std=True)
    plt.plot(x, pred)
    plt.fill_between(x, pred.reshape(-1, ) - std, pred.reshape(-1, ) + std, alpha=0.2)
    plt.scatter(X, y, c='k')
    plt.title('Initial estimation based on %d points' % n_initial)
    plt.show()

# active learning
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))

# plotting after active learning
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(14, 7))
Example #27
class ActiveKNN:
    """A KNN machine learning model using active learning with modAL package

    Attributes:
        amine:          A string representing the amine that the KNN model is used for predictions.
        n_neighbors:    An integer representing the number of neighbors to classify using KNN model.
        model:          A KNeighborClassifier object as the classifier model given the number of neighbors to classify
                            with.
        metrics:        A dictionary to store the performance metrics locally. It has the format of
                            {'metric_name': [metric_value]}.
        verbose:        A boolean representing whether the model prints additional information to the terminal.
        pool_data:      A numpy array representing all the data from the dataset.
        pool_labels:    A numpy array representing all the labels from the dataset.
        x_t:            A numpy array representing the training data used for model training.
        y_t:            A numpy array representing the training labels used for model training.
        x_v:            A numpy array representing the testing data used for active learning.
        y_v:            A numpy array representing the testing labels used for active learning.
        learner:        An ActiveLearner to conduct active learning with. See modAL documentation for more details.
    """
    def __init__(self, amine=None, n_neighbors=2, verbose=True):
        """Initialize the ActiveKNN object."""
        self.amine = amine
        self.n_neighbors = n_neighbors
        self.model = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        self.metrics = {
            'accuracies': [],
            'precisions': [],
            'recalls': [],
            'bcrs': [],
            'confusion_matrices': []
        }
        self.verbose = verbose

    def load_dataset(self, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            x_t:                A 2-D numpy array representing the training data.
            y_t:                A 2-D numpy array representing the training labels.
            x_v:                A 2-D numpy array representing the validation data.
            y_v:                A 2-D numpy array representing the validation labels.
            all_data:           A 2-D numpy array representing all the data in the active learning pool.
            all_labels:         A 2-D numpy array representing all the labels in the active learning pool.

        Returns:
            N/A
        """

        self.x_t, self.x_v, self.y_t, self.y_v = x_t, y_t, x_v, y_v

        self.pool_data = all_data
        self.pool_labels = all_labels

        if self.verbose:
            print(f'The training data has dimension of {self.x_t.shape}.')
            print(f'The training labels have dimension of {self.y_t.shape}.')
            print(f'The testing data has dimension of {self.x_v.shape}.')
            print(f'The testing labels have dimension of {self.y_v.shape}.')

    def train(self):
        """Train the KNN model by setting up the ActiveLearner."""

        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate()

    def active_learning(self, num_iter=None, to_params=True):
        """ The active learning loop

        This is the active learning model that loops around the KNN model
        to look for the most uncertain point and give the model the label to train

        Args:
            num_iter:   An integer that is the number of iterations.
                        Default = None
            to_params:  A boolean that decide if to store the metrics to the dictionary,
                        detail see "store_metrics_to_params" function.
                        Default = True

        return: N/A
        """
        num_iter = num_iter if num_iter else self.x_v.shape[0]

        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data, uncertain_label = self.x_v[query_index].reshape(
                1, -1), self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)

            self.evaluate()

            # Remove the queried instance from the unlabeled pool.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.pool_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

        if to_params:
            self.store_metrics_to_params()

    def evaluate(self, store=True):
        """Evaluation of the model

        Args:
            store:  A boolean that decides if to store the metrics of the performance of the model.
                    Default = True

        return: N/A
        """

        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.pool_data, self.pool_labels)

        preds = self.learner.predict(self.pool_data)
        cm = confusion_matrix(self.pool_labels, preds)

        # To prevent nan value for precision, we set it to 1 and send out a warning message
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            print('WARNING: zero division during precision calculation')

        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics

        The metrics are specifically the confusion matrices, accuracies,
        precisions, recalls and balanced classification rates.

        Args:
            cm:             A numpy array representing the confusion matrix given our predicted labels and the actual
                                corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:       A float representing the accuracy rate of the model: the rate of correctly predicted
                                reactions out of all reactions.
            precision:      A float representing the precision rate of the model: the rate of the number of actually
                                successful reactions out of all the reactions predicted to be successful.
            recall:         A float representing the recall rate of the model: the rate of the number of reactions
                                predicted to be successful out of all the actual successful reactions.
            bcr:            A float representing the balanced classification rate of the model. It's the average value
                                of recall rate and true negative rate.

        return: N/A
        """

        self.metrics['confusion_matrices'].append(cm)
        self.metrics['accuracies'].append(accuracy)
        self.metrics['precisions'].append(precision)
        self.metrics['recalls'].append(recall)
        self.metrics['bcrs'].append(bcr)

        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def store_metrics_to_params(self):
        """Store the metrics results to the model's parameters dictionary

        Use the same logic of saving the metrics for each model.
        Dump the cross validation statistics to a pickle file.
        """

        model = 'KNN'

        with open(os.path.join("./data", "cv_statistics.pkl"), "rb") as f:
            stats_dict = pickle.load(f)

        stats_dict[model]['accuracies'].append(self.metrics['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            self.metrics['confusion_matrices'])
        stats_dict[model]['precisions'].append(self.metrics['precisions'])
        stats_dict[model]['recalls'].append(self.metrics['recalls'])
        stats_dict[model]['bcrs'].append(self.metrics['bcrs'])

        # Save this dictionary in case we need it later
        with open(os.path.join("./data", "cv_statistics.pkl"), "wb") as f:
            pickle.dump(stats_dict, f)

    def save_model(self, k_shot, n_way, meta):
        """Save the data used to train, validate and test the model to designated folder

        Args:
            k_shot:                 An integer representing the number of training samples per class.
            n_way:                  An integer representing the number of classes per task.
            meta:                   A boolean representing if it will be trained under option 1 or option 2.
                                        Option 1 is train with observations of other tasks and validate on the
                                        task-specific observations.
                                        Option 2 is to train and validate on the task-specific observations.

        Returns:
            N/A
        """

        # Indicate which option we used the data for
        option = 2 if meta else 1

        # Set up the main destination folder for the model
        dst_root = './KNN_few_shot/option_{0:d}'.format(option)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print('No folder for KNN model storage found')
            print('Making folder to store KNN model at')

        # Set up the model specific folder
        model_folder = '{0:s}/KNN_{1:d}_shot_{2:d}_way_option_{3:d}_{4:s}'.format(
            dst_root, k_shot, n_way, option, self.amine)
        if not os.path.exists(model_folder):
            os.makedirs(model_folder)
            print('No folder for KNN model storage found')
            print(f'Making folder to store KNN model of amine {self.amine} at')
        else:
            print(
                f'Found existing folder. Model of amine {self.amine} will be stored at'
            )
        print(model_folder)

        # Dump the model into the designated folder
        file_name = "KNN_{0:s}_option_{1:d}.pkl".format(self.amine, option)
        with open(os.path.join(model_folder, file_name), "wb") as f:
            pickle.dump([self], f, -1)

    def __str__(self):
        return 'A {0:d}-neighbor KNN model for amine {1:s} using active learning'.format(
            self.n_neighbors, self.amine)
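A hypothetical driver for the class above (not from the source; the data arrays are placeholders, and to_params=False avoids the pickle file that store_metrics_to_params expects):

model = ActiveKNN(amine='example_amine', n_neighbors=2, verbose=False)
model.load_dataset(x_t, y_t, x_v, y_v, all_data, all_labels)  # assumed arrays
model.train()                                        # fit + zero-point evaluation
model.active_learning(num_iter=10, to_params=False)
print(model.metrics['accuracies'])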
Example #28
0
# (The definitions of the committee members are truncated in the source;
# the surviving tail shows each one is built like this, so member5's
# statement is reconstructed as a plausible form.)
member5 = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=10),
                        query_strategy=strategy)

committee = Committee(
    learner_list=[member1, member2, member3, member4, member5])

import math

# For each unlabeled point, record whether the five committee members
# disagree (column 0) and the vote entropy of their predictions (column 1).
unlab_length = X_unlab.shape[0]
disagreement = np.zeros((unlab_length, 2))

for i in range(unlab_length):
    index = [i]
    predict = [
        member.predict(X_unlab[index])[0]
        for member in (member1, member2, member3, member4, member5)
    ]

    if not all(p == predict[0] for p in predict):
        disagreement[i][0] = 1
        # Vote entropy: -sum_c (V_c / 5) * log(V_c / 5) over the 3 classes
        count = [0, 0, 0]
        for p in predict:
            count[p] += 1
        for j in range(3):
            if count[j]:
                disagreement[i][1] -= (count[j] / 5) * math.log(count[j] / 5)
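
# modAL ships the same disagreement measure, so the manual loop above can be
# cross-checked against the library. A minimal sketch, assuming `committee`
# is the five-member Committee assembled above (vote_entropy uses the
# natural log, matching math.log):
from modAL.disagreement import vote_entropy

entropies = vote_entropy(committee, X_unlab)  # should match disagreement[:, 1]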
Example #29
0
# (The matching X_pool line is truncated in the source and restored here.)
X_pool = np.delete(X_raw, training_indices, axis=0)
y_pool = np.delete(y_raw, training_indices, axis=0)



from sklearn.neighbors import KNeighborsClassifier
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

# Specify our core estimator along with its active learning model.
knn = KNeighborsClassifier(n_neighbors=3)
learner = ActiveLearner(estimator=knn,
                        query_strategy=uncertainty_sampling,
                        X_training=X_train, y_training=y_train)


# Isolate the data we'll need for plotting.
predictions = learner.predict(X_raw)
is_correct = (predictions == y_raw)


# Record our learner's score on the raw data.
unqueried_score = learner.score(X_raw, y_raw)

# Plot our classification results.
'''
fig, ax = plt.subplots(figsize=(8.5, 6), dpi=130)
ax.scatter(x=x_component[is_correct],  y=y_component[is_correct],  c='g', marker='+', label='Correct',   alpha=8/10)
ax.scatter(x=x_component[~is_correct], y=y_component[~is_correct], c='r', marker='x', label='Incorrect', alpha=8/10)
ax.legend(loc='lower right')
ax.set_title("ActiveLearner class predictions (Accuracy: {score:.3f})".format(score=unqueried_score))
plt.show()
'''
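
# The example truncates before the sampling loop; here is a minimal sketch of
# the pool-based loop it presumably continues with (N_QUERIES and
# performance_history are assumed names, not from the source):
N_QUERIES = 20
performance_history = [unqueried_score]

for _ in range(N_QUERIES):
    # Ask the learner for its most uncertain pool instance, then label it
    query_index, query_instance = learner.query(X_pool)
    X_q = X_pool[query_index].reshape(1, -1)
    y_q = y_pool[query_index].reshape(1, )
    learner.teach(X=X_q, y=y_q)

    # Drop the queried instance from the pool and track accuracy
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index)
    performance_history.append(learner.score(X_raw, y_raw))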
Example #30
0
class ActiveLinearSVM:
    """A Linear SVM machine learning model using active learning with modAL package

    Attributes: 
        amine:          A string representing the amine this model is used for.
        model:          A CalibratedClassifierCV + LinearSVC object as the classifier model.
        metrics:        A dictionary to store the performance metrics locally. It has the format of
                            {'metric_name': [metric_value]}.
        verbose:        A boolean representing whether it prints out additional information to the terminal or not.
        stats_path:     A Path object representing the directory of the stats dictionary.
        model_name:     A string representing the name of the model for future plotting.
        all_data:       A numpy array representing all the data from the dataset.
        all_labels:     A numpy array representing all the labels from the dataset.
        x_t:            A numpy array representing the training data used for model training.
        y_t:            A numpy array representing the training labels used for model training.
        x_v:            A numpy array representing the testing data used for active learning.
        y_v:            A numpy array representing the testing labels used for active learning.
        learner:        An ActiveLearner to conduct active learning with. See modAL documentation for more details.
        y_preds:        A numpy array representing the predicted labels given all data input.
    """
    def __init__(self,
                 amine=None,
                 config=None,
                 verbose=True,
                 stats_path=Path('./results/stats.pkl'),
                 model_name='LinearSVM'):
        """Initialization of the ActiveLinearSVM model"""

        self.amine = amine

        # Load a customized model or use the default fine-tuned setting.
        # LinearSVC has no predict_proba, so it is wrapped in
        # CalibratedClassifierCV to supply the probabilities that modAL's
        # default uncertainty sampling relies on.
        if config:
            self.model = CalibratedClassifierCV(LinearSVC(**config))
        else:
            # Fine-tuned default model
            self.model = CalibratedClassifierCV(LinearSVC())

        self.metrics = defaultdict(list)
        self.verbose = verbose
        self.stats_path = stats_path
        self.model_name = model_name

    def load_dataset(self, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            x_t:                A 2-D numpy array representing the training data.
            y_t:                A 2-D numpy array representing the training labels.
            x_v:                A 2-D numpy array representing the validation data.
            y_v:                A 2-D numpy array representing the validation labels.
            all_data:           A 2-D numpy array representing all the data in the active learning pool.
            all_labels:         A 2-D numpy array representing all the labels in the active learning pool.
        """

        self.x_t, self.y_t, self.x_v, self.y_v = x_t, y_t, x_v, y_v

        self.all_data = all_data
        self.all_labels = all_labels

        if self.verbose:
            print(f'The training data has shape {self.x_t.shape}.')
            print(f'The training labels have shape {self.y_t.shape}.')
            print(f'The testing data has shape {self.x_v.shape}.')
            print(f'The testing labels have shape {self.y_v.shape}.')

    def train(self, warning=True):
        """Train the LinearSVM model by setting up the ActiveLearner."""
        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate(warning=warning)

    def active_learning(self, num_iter=None, warning=True, to_params=True):
        """The active learning loop

        This loop repeatedly queries the LinearSVM model for its most
        uncertain point in the pool and teaches the model that point's label.

        Args:
            num_iter:   An integer that is the number of iterations.
                        Default = None
            warning:    A boolean that decides whether to issue a zero-division warning.
                        Default = True.
            to_params:  A boolean that decides whether to store the metrics to the dictionary;
                        for details see the "store_metrics_to_params" function.
                        Default = True
        """

        num_iter = num_iter if num_iter else self.x_v.shape[0]

        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data, uncertain_label = self.x_v[query_index].reshape(
                1, -1), self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)

            self.evaluate(warning=warning)

            # Move the queried instance from the unlabeled pool to the training set.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.all_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

        if to_params:
            self.store_metrics_to_params()

    def evaluate(self, warning=True, store=True):
        """ Evaluation of the model

        Args:
            warning:    A boolean that decides whether to warn about zero division.
                            Default = True
            store:      A boolean that decides whether to store the performance metrics of the model.
                            Default = True
        """
        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.all_data, self.all_labels)

        # Find model predictions
        self.y_preds = self.learner.predict(self.all_data)

        # Calculate the confusion matrix
        cm = confusion_matrix(self.all_labels, self.y_preds)

        # To prevent a NaN precision when there are no positive predictions, set it to 1 and warn
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            if warning:
                print('WARNING: zero division during precision calculation')

        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics

        The metrics are specifically the confusion matrices, accuracies,
        precisions, recalls and balanced classification rates.

        Args:
            cm:             A numpy array representing the confusion matrix given our predicted labels and the actual
                                corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:       A float representing the accuracy rate of the model: the rate of correctly predicted 
                                reactions out of all reactions.
            precision:      A float representing the precision rate of the model: the rate of the number of actually
                                successful reactions out of all the reactions predicted to be successful.
            recall:         A float representing the recall rate of the model: the rate of the number of reactions 
                                predicted to be successful out of all the actual successful reactions.
            bcr:            A float representing the balanced classification rate of the model. It's the average value 
                                of recall rate and true negative rate.
        """

        self.metrics['confusion_matrices'].append(cm)
        self.metrics['accuracies'].append(accuracy)
        self.metrics['precisions'].append(precision)
        self.metrics['recalls'].append(recall)
        self.metrics['bcrs'].append(bcr)

        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def store_metrics_to_params(self):
        """Store the metrics results to the model's parameters dictionary

        Use the same logic of saving the metrics for each model.
        Dump the cross validation statistics to a pickle file.
        """

        model = self.model_name

        if self.stats_path.exists():
            with open(self.stats_path, "rb") as f:
                stats_dict = pickle.load(f)
        else:
            stats_dict = {}

        if model not in stats_dict:
            stats_dict[model] = defaultdict(list)

        stats_dict[model]['amine'].append(self.amine)
        stats_dict[model]['accuracies'].append(self.metrics['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            self.metrics['confusion_matrices'])
        stats_dict[model]['precisions'].append(self.metrics['precisions'])
        stats_dict[model]['recalls'].append(self.metrics['recalls'])
        stats_dict[model]['bcrs'].append(self.metrics['bcrs'])

        # Save this dictionary in case we need it later
        with open(self.stats_path, "wb") as f:
            pickle.dump(stats_dict, f)

    def save_model(self, model_name):
        """Save the data used to train, validate and test the model to designated folder

        Args:
            model_name:         A string representing the name of the model.
        """

        # Set up the main destination folder for the model
        dst_root = './data/LinearSVM/{0:s}'.format(model_name)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print(f'No folder for LinearSVM model {model_name} storage found')
            print(f'Making folder to store model at {dst_root}')

        # Dump the model into the designated folder
        file_name = "{0:s}_{1:s}.pkl".format(model_name, self.amine)
        with open(os.path.join(dst_root, file_name), "wb") as f:
            pickle.dump(self, f)

    def __str__(self):
        return 'A LinearSVM model for {0:s} using active learning'.format(
            self.amine)
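
# As the comment in __init__ notes, the CalibratedClassifierCV wrapper is what
# gives the linear SVM the predict_proba that modAL's default uncertainty
# sampling calls. A minimal, self-contained sketch on toy data
# (make_classification is illustrative, not from the source):
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X_toy, y_toy = make_classification(n_samples=200, random_state=0)

# A bare LinearSVC has no predict_proba; the calibrated wrapper fits the SVM
# plus a probability model on top of its decision function.
clf = CalibratedClassifierCV(LinearSVC()).fit(X_toy, y_toy)
print(clf.predict_proba(X_toy[:3]))  # per-class probabilities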
Example #31
0
# initializing the Committee members (the loop head is truncated in the
# source; a plausible reconstruction following the modAL docs is shown)
n_members = 2
learner_list = list()
for member_idx in range(n_members):
    learner = ActiveLearner(estimator=RandomForestClassifier(),
                            X_training=X_train,
                            y_training=y_train)
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)

# visualizing the Committee's predictions per learner

with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members * 7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(x=pca[:, 0],
                    y=pca[:, 1],
                    c=learner.predict(iris['data']),
                    cmap='viridis',
                    s=50)
        plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
    plt.show()

# visualizing the initial predictions
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = committee.predict(iris['data'])
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Committee initial predictions, accuracy = %1.3f' %
              committee.score(iris['data'], iris['target']))
    plt.show()

# query by committee
Example #32
0
                            X_training=X_train,
                            y_training=y_train)
    learner_list.append(learner)

# assembling the committee
committee = Committee(learner_list=learner_list)

# visualizing the Committee's predictions per learner

with plt.style.context('seaborn-white'):
    plt.figure(figsize=(n_members * 7, 7))
    for learner_idx, learner in enumerate(committee):
        plt.subplot(1, n_members, learner_idx + 1)
        plt.scatter(x=pca[:, 0],
                    y=pca[:, 1],
                    c=learner.predict(data),
                    cmap='viridis',
                    s=50)
        plt.title('Learner no. %d initial predictions' % (learner_idx + 1))
    plt.show()

# visualizing the initial predictions
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = committee.predict(data)
    plt.scatter(x=pca[:, 0], y=pca[:, 1], c=prediction, cmap='viridis', s=50)
    plt.title('Committee initial predictions, accuracy = %1.3f' %
              committee.score(data, target))
    plt.show()

# query by committee
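
# Both committee examples truncate at this comment; a minimal sketch of the
# query-by-committee loop that typically follows, assuming X_pool/y_pool hold
# the unlabeled pool and n_queries is chosen by the caller:
n_queries = 10
for idx in range(n_queries):
    # The Committee queries by member disagreement (vote entropy by default)
    query_idx, query_instance = committee.query(X_pool)
    committee.teach(
        X=X_pool[query_idx].reshape(1, -1),
        y=y_pool[query_idx].reshape(1, )
    )
    # Remove the queried instance from the pool
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)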
Example #33
0
class ActiveLearningClassifier:
    """Base machine learning classifier using active learning with modAL package

    Attributes:
        amine:              A string representing the amine that the classifier makes predictions for.
        config:             A dictionary representing the hyper-parameters of the model
        metrics:            A dictionary to store the performance metrics locally. It has the format of
                                {'metric_name': [metric_value]}.
        verbose:            A boolean representing whether it prints out additional information to the
                                terminal or not.
        stats_path:         A Path object representing the directory of the stats dictionary if we are not running
                                multi-processing.
        result_dict:        A dictionary representing the result dictionary used during multi-thread processing.
        classifier_name:    A string representing the name of the generic classifier.
        model_name:         A string representing the name of the specific model for future plotting.
        all_data:           A numpy array representing all the data from the dataset.
        all_labels:         A numpy array representing all the labels from the dataset.
        x_t:                A numpy array representing the training data used for model training.
        y_t:                A numpy array representing the training labels used for model training.
        x_v:                A numpy array representing the testing data used for active learning.
        y_v:                A numpy array representing the testing labels used for active learning.
        learner:            An ActiveLearner to conduct active learning with. See modAL documentation for more details.
    """
    def __init__(self,
                 amine=None,
                 config=None,
                 verbose=True,
                 stats_path=None,
                 result_dict=None,
                 classifier_name='Base Classifier',
                 model_name='Base Classifier'):
        """initialization of the class"""
        self.amine = amine

        self.config = config

        self.metrics = defaultdict(dict)
        self.verbose = verbose
        self.stats_path = stats_path
        self.result_dict = result_dict
        self.classifier_name = classifier_name
        self.model_name = model_name

    def load_dataset(self, set_id, x_t, y_t, x_v, y_v, all_data, all_labels):
        """Load the input training and validation data and labels into the model.

        Args:
            set_id:             An integer representing the id of the random draw that we are loading.
            x_t:                A 2-D numpy array representing the training data.
            y_t:                A 2-D numpy array representing the training labels.
            x_v:                A 2-D numpy array representing the validation data.
            y_v:                A 2-D numpy array representing the validation labels.
            all_data:           A 2-D numpy array representing all the data in the active learning pool.
            all_labels:         A 2-D numpy array representing all the labels in the active learning pool.
        """
        self.draw_id = set_id
        self.metrics[self.draw_id] = defaultdict(list)

        self.x_t, self.y_t, self.x_v, self.y_v = x_t, y_t, x_v, y_v

        self.all_data = all_data
        self.all_labels = all_labels

        if self.verbose:
            print(f'The training data has shape {self.x_t.shape}.')
            print(f'The training labels have shape {self.y_t.shape}.')
            print(f'The testing data has shape {self.x_v.shape}.')
            print(f'The testing labels have shape {self.y_v.shape}.')

    def train(self, warning=True):
        """Train the KNN model by setting up the ActiveLearner."""

        self.learner = ActiveLearner(estimator=self.model,
                                     X_training=self.x_t,
                                     y_training=self.y_t)
        # Evaluate zero-point performance
        self.evaluate(warning=warning)

    def active_learning(self, num_iter=None, warning=True):
        """The active learning loop

        This loop repeatedly queries the underlying model for its most
        uncertain point in the pool and teaches the model that point's label.

        Args:
            num_iter:   An integer that is the number of iterations.
                        Default = None
            warning:    A boolean that decides whether to issue a zero-division warning.
                        Default = True.
        """

        num_iter = num_iter if num_iter else self.x_v.shape[0]

        for _ in range(num_iter):
            # Query the most uncertain point from the active learning pool
            query_index, query_instance = self.learner.query(self.x_v)

            # Teach our ActiveLearner model the record it has requested.
            uncertain_data, uncertain_label = self.x_v[query_index].reshape(
                1, -1), self.y_v[query_index].reshape(1, )
            self.learner.teach(X=uncertain_data, y=uncertain_label)

            self.evaluate(warning=warning)

            # Move the queried instance from the unlabeled pool to the training set.
            self.x_t = np.append(self.x_t, uncertain_data).reshape(
                -1, self.all_data.shape[1])
            self.y_t = np.append(self.y_t, uncertain_label)
            self.x_v = np.delete(self.x_v, query_index, axis=0)
            self.y_v = np.delete(self.y_v, query_index)

    def evaluate(self, warning=True, store=True):
        """Evaluation of the model

        Args:
            warning:    A boolean that decides whether to warn about zero division.
                            Default = True
            store:      A boolean that decides whether to store the performance metrics of the model.
                            Default = True
        """

        # Calculate and report our model's accuracy.
        accuracy = self.learner.score(self.all_data, self.all_labels)

        self.y_preds = self.learner.predict(self.all_data)

        cm = confusion_matrix(self.all_labels, self.y_preds)

        # To prevent a NaN precision when there are no positive predictions, set it to 1 and warn
        if cm[1][1] + cm[0][1] != 0:
            precision = cm[1][1] / (cm[1][1] + cm[0][1])
        else:
            precision = 1.0
            if warning:
                print('WARNING: zero division during precision calculation')

        recall = cm[1][1] / (cm[1][1] + cm[1][0])
        true_negative = cm[0][0] / (cm[0][0] + cm[0][1])
        bcr = 0.5 * (recall + true_negative)

        if store:
            self.store_metrics_to_model(cm, accuracy, precision, recall, bcr)

    def store_metrics_to_model(self, cm, accuracy, precision, recall, bcr):
        """Store the performance metrics

        The metrics are specifically the confusion matrices, accuracies,
        precisions, recalls and balanced classification rates.

        Args:
            cm:             A numpy array representing the confusion matrix given our predicted labels and the actual
                                corresponding labels. It's a 2x2 matrix for the drp_chem model.
            accuracy:       A float representing the accuracy rate of the model: the rate of correctly predicted
                                reactions out of all reactions.
            precision:      A float representing the precision rate of the model: the rate of the number of actually
                                successful reactions out of all the reactions predicted to be successful.
            recall:         A float representing the recall rate of the model: the rate of the number of reactions
                                predicted to be successful out of all the actual successful reactions.
            bcr:            A float representing the balanced classification rate of the model. It's the average value
                                of recall rate and true negative rate.
        """

        self.metrics[self.draw_id]['confusion_matrices'].append(cm)
        self.metrics[self.draw_id]['accuracies'].append(accuracy)
        self.metrics[self.draw_id]['precisions'].append(precision)
        self.metrics[self.draw_id]['recalls'].append(recall)
        self.metrics[self.draw_id]['bcrs'].append(bcr)

        if self.verbose:
            print(cm)
            print('accuracy for model is', accuracy)
            print('precision for model is', precision)
            print('recall for model is', recall)
            print('balanced classification rate for model is', bcr)

    def find_inner_avg(self):
        """Find the average across all random draws"""
        metric_names = ['accuracies', 'precisions', 'recalls', 'bcrs']
        rand_draws = list(self.metrics.keys())

        for metric in metric_names:
            lst_of_metrics = []
            for set_id in rand_draws:
                lst_of_metrics.append(self.metrics[set_id][metric])
            self.metrics['average'][metric] = list(
                np.average(lst_of_metrics, axis=0))

        lst_of_confusion_matrices = []
        for set_id in rand_draws:
            lst_of_confusion_matrices.append(
                self.metrics[set_id]['confusion_matrices'])
        self.metrics['average'][
            'confusion_matrices'] = lst_of_confusion_matrices

    def store_metrics_to_file(self):
        """Store the metrics results to the model's parameters dictionary

        Use the same logic of saving the metrics for each model.
        Dump the cross validation statistics to a pickle file.
        """
        self.find_inner_avg()

        model = self.model_name

        # Check whether we are running a multi-thread
        # or a single-thread process
        if self.result_dict:
            # Store to the existing multi-processing dictionary
            stats_dict = self.result_dict
        else:
            # Store to a simple dictionary; guard against stats_path=None
            if self.stats_path and self.stats_path.exists():
                with open(self.stats_path, "rb") as f:
                    stats_dict = pickle.load(f)
            else:
                stats_dict = {}

        if model not in stats_dict:
            stats_dict[model] = defaultdict(list)

        stats_dict[model]['amine'].append(self.amine)
        stats_dict[model]['accuracies'].append(
            self.metrics['average']['accuracies'])
        stats_dict[model]['confusion_matrices'].append(
            self.metrics['average']['confusion_matrices'])
        stats_dict[model]['precisions'].append(
            self.metrics['average']['precisions'])
        stats_dict[model]['recalls'].append(self.metrics['average']['recalls'])
        stats_dict[model]['bcrs'].append(self.metrics['average']['bcrs'])

        # Save this dictionary in case we need it later
        if not self.result_dict and self.stats_path:
            with open(self.stats_path, "wb") as f:
                pickle.dump(stats_dict, f)

    def save_model(self):
        """Save the data used to train, validate and test the model to designated folder"""

        # Set up the main destination folder for the model
        dst_root = './data/{}/{}'.format(self.classifier_name, self.model_name)
        if not os.path.exists(dst_root):
            os.makedirs(dst_root)
            print(
                f'No folder for {self.classifier_name} model {self.model_name} storage found'
            )
            print(f'Making folder to store model at {dst_root}')

        # Dump the model into the designated folder
        file_name = "{0:s}_{1:s}.pkl".format(self.model_name, self.amine)
        with open(os.path.join(dst_root, file_name), "wb") as f:
            pickle.dump(self, f)
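
# The base class never assigns self.model, so train() only works through a
# subclass that supplies one. A minimal sketch of such a subclass (the
# ActiveKNN name and its default hyper-parameters are assumptions, not from
# the source):
from sklearn.neighbors import KNeighborsClassifier

class ActiveKNN(ActiveLearningClassifier):
    """KNN flavor of the base active learning classifier."""

    def __init__(self, amine=None, config=None, **kwargs):
        super().__init__(amine=amine, config=config, **kwargs)
        # Use the supplied hyper-parameters, or an assumed default
        params = config if config else {'n_neighbors': 3}
        self.model = KNeighborsClassifier(**params)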
Example #34
0
# Plotting the data in an understandable form (k-means)
import random

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from modAL.models import ActiveLearner
from sklearn.cluster import KMeans

f, ax = plt.subplots(figsize=(12, 8))
corr = train_k.corr()
hm = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap="summer", fmt='.2f')
f.subplots_adjust(top=.94)
t = f.suptitle('Zoo animals Heatmap', fontsize=16)

kmeans = KMeans(n_clusters=7, max_iter=10000)

X = np.array(train_k.drop(columns=["class_type"]).astype(float))
Y = np.array(train_k["class_type"])

# Note: KMeans ignores the supplied labels when fitting, and its
# predictions are cluster indices rather than the class_type values.
learner = ActiveLearner(estimator=kmeans, X_training=X, y_training=Y)

X_pool = np.array(test_k.drop(columns=["class_type"]).astype(float))
y_pool = np.array(test_k["class_type"])

predictions = learner.predict(X_pool)

# Label points drawn uniformly at random from the pool (this loop bypasses
# the learner's query strategy; N_Queries is defined elsewhere)
for _ in range(N_Queries[0]):
    query_index = random.randrange(0, len(X_pool))
    x, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=x, y=y)
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index)

# Caveat: with a KMeans estimator, score() returns the negative inertia,
# not a classification accuracy.
model_accuracy = learner.score(X, Y)
print('Accuracy: {acc:0.4f}\n'.format(acc=model_accuracy))

print(predictions)
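
# For contrast with the random loop above, a short sketch of letting the
# learner pick the point itself. A probabilistic classifier such as
# KNeighborsClassifier stands in for KMeans here, since the default
# uncertainty sampling needs predict_proba, which KMeans lacks:
from sklearn.neighbors import KNeighborsClassifier

clf_learner = ActiveLearner(estimator=KNeighborsClassifier(n_neighbors=3),
                            X_training=X, y_training=Y)
query_index, query_instance = clf_learner.query(X_pool)
print('Most uncertain pool index:', query_index)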