Example #1
    def _active_learning_update_metrics(
            self, active_learner: ActiveLearner, x_dev: np.ndarray,
            y_dev: Series, stats: Stats, data_for_plotting: List[Stats],
            i: int, elapsed_train: float, elapsed_query: float,
            labeled_indices: List[int],
            semi_sup: bool) -> Tuple[Stats, List[Stats], List[int]]:
        predicted = active_learner.predict(x_dev)
        scores = (None if semi_sup else
                  active_learner.predict_proba(x_dev)[:, 1])
        metrics = self._get_metrics(actual=y_dev,
                                    predicted=predicted,
                                    scores=scores)

        data_for_plotting.append(
            self._get_plotting_row(i, metrics, elapsed_train, elapsed_query))
        metrics = util.add_prefix_to_dict_keys(metrics, f'sample_{i+1}_')
        if i + 1 in self.active_learning_log_intervals or i == -1:
            stats = util.merge_dicts(stats, metrics)
        return stats, data_for_plotting, labeled_indices
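# The `util` helpers above are not part of this snippet. A minimal sketch of
# what they plausibly do, inferred only from how they are called here
# (hypothetical implementations, not the original module):

def add_prefix_to_dict_keys(d, prefix):
    """Return a copy of d with prefix prepended to every key."""
    return {f'{prefix}{key}': value for key, value in d.items()}

def merge_dicts(first, second):
    """Return a new dict with the entries of second layered over first."""
    return {**first, **second}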
Example #2
import numpy as np
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.ensemble import RandomForestClassifier


class LearnerD:
    def __init__(self):
        self.is_ready_to_predict = False
        self.learner = ActiveLearner(
            estimator=RandomForestClassifier(n_estimators=100),
            query_strategy=uncertainty_sampling,
        )

    def predict_prob(self, point):
        if not self.is_ready_to_predict:
            print('predict prob abort, learner is not ready to predict')
            return False, 0

        # predict_proba returns one row per sample: [P(negative), P(positive)]
        positive_prob = self.learner.predict_proba(point)[0][1]
        return True, positive_prob

    def update(self, positive_points, negative_points):
        if len(positive_points) == 0 or len(negative_points) == 0:
            print('update abort, not enough data to update')
            self.is_ready_to_predict = False
            return False

        # stack the (1, n_features) points into one design matrix,
        # with label 1 for positives and 0 for negatives
        X = np.concatenate(list(positive_points) + list(negative_points),
                           axis=0)
        y = np.concatenate((np.ones(len(positive_points)),
                            np.zeros(len(negative_points))))

        self.learner.fit(X, y)
        self.is_ready_to_predict = True
        return True
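# A short usage sketch for LearnerD, assuming each point is a (1, n_features)
# array, which matches how update() stacks them row-wise:

learner_d = LearnerD()
positives = [np.array([[1.0, 2.0]]), np.array([[1.5, 2.5]])]
negatives = [np.array([[-1.0, -2.0]])]

if learner_d.update(positives, negatives):
    ok, prob = learner_d.predict_prob(np.array([[1.2, 2.1]]))
    print('P(positive) =', prob)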
Example #3
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [
    0, im_height - 1, im_height * (im_height - 1), -1,
    im_width // 2 + im_height // 2 * im_height
]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_train,
                        y_training=y_train)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1),
                  y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(
    im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(
    np.random.choice(range(len(X_full)), n_queries, replace=False))
Example #4
# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing the initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one; if an instance's uncertainty
is above a threshold, its label is requested and shown to the learner. The
process continues until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
Example #5
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = braid_AL.get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = braid_AL.get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    length = len(rec_api_test)
    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 40
        for idx in range(n_queries):
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx[0]) // 10  # each query owns a block of 10 rows
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = braid_AL.get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        estimator=KNeighborsClassifier(n_neighbors=4),
        # estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # use the model trained on the feedback data to predict the test data
    for query_idx in range(length):
        try:
            y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        except ValueError:
            # the learner cannot score this instance: fall back to all-zero
            # scores and stop predicting
            predict = [0.0 for n in range(length)]
            break
        else:
            predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
Example #6
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: float = 1.0, n_instances: int = 1,
                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
    """
    Expected error reduction query strategy.

    References:
        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)

    Args:
        learner: The ActiveLearner object for which the expected error
            is to be estimated.
        X: The samples.
        loss: The loss function to be used. Can be 'binary' or 'log'.
        p_subsample: Probability of keeping a sample from the pool when
            calculating expected error. Significantly improves runtime
            for large sample pools.
        n_instances: The number of instances to be sampled.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.


    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """

    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
    assert loss in ['binary', 'log'], 'loss must be \'binary\' or \'log\''

    expected_error = np.zeros(shape=(len(X), ))
    possible_labels = np.unique(learner.y_training)

    try:
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
        return 0, X[0]

    cloned_estimator = clone(learner.estimator)

    for x_idx, x in enumerate(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
            # estimate the expected error
            for y_idx, y in enumerate(possible_labels):
                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))

                cloned_estimator.fit(X_new, y_new)
                refitted_proba = cloned_estimator.predict_proba(X)
                if loss == 'binary':
                    refitted_loss = _proba_uncertainty(refitted_proba)
                elif loss == 'log':
                    refitted_loss = _proba_entropy(refitted_proba)

                expected_error[x_idx] += np.sum(refitted_loss)*X_proba[x_idx, y_idx]

        else:
            expected_error[x_idx] = np.inf

    # the smallest expected error is best, so maximize the negated scores
    # (subsampled-out points carry np.inf and are never selected)
    if not random_tie_break:
        query_idx = multi_argmax(-expected_error, n_instances)
    else:
        query_idx = shuffled_argmax(-expected_error, n_instances)

    return query_idx, X[query_idx]
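# A minimal usage sketch for the strategy above, on synthetic scikit-learn
# data. modAL's ActiveLearner accepts any query strategy with this
# (classifier, X, **kwargs) -> (indices, instances) signature and forwards
# extra keyword arguments such as p_subsample through learner.query():

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from modAL.models import ActiveLearner

X_raw, y_raw = make_classification(n_samples=200, n_features=5, random_state=0)
learner = ActiveLearner(estimator=LogisticRegression(),
                        query_strategy=expected_error_reduction,
                        X_training=X_raw[:10], y_training=y_raw[:10])
query_idx, query_inst = learner.query(X_raw[10:], p_subsample=0.2)
learner.teach(X_raw[10:][query_idx], y_raw[10:][query_idx])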
Example #7
class Review(ABC):
    """Base class for Systematic Review"""
    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=None,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=None,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 settings={},
                 verbose=1):
        super(Review, self).__init__()

        self.X = X
        self.y = y
        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn

        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = settings['fit_kwargs']
        self.balance_kwargs = settings['balance_kwargs']
        self.query_kwargs = settings['query_kwargs']

        self._logger = Logger()

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _classify(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""

        pass

    def _stop_iter(self, query_i, pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """

        stop_iter = False

        # if the pool is empty, always stop
        if len(pool) == 0:
            stop_iter = True

        # stop once the query limit is reached (n_queries=None means no limit)
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def review(self):

        # create the pool and training indices.
        n_samples = self.X.shape[0]
        pool_idx = np.arange(n_samples)

        # add prior knowledge
        init_idx, init_labels = self._prior_knowledge()
        self.y[init_idx] = init_labels

        # remove the initial sample from the pool
        pool_idx = np.delete(pool_idx, init_idx)

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)
        query_i = 0
        train_idx = init_idx.copy()
        query_idx = train_idx
        self._logger.add_labels(self.y)

        while not self._stop_iter(query_i - 1, pool_idx):
            self._logger.add_training_log(query_idx, self.y[query_idx])

            # Get the training data.
            X_train, y_train = self.train_data(self.X, self.y, train_idx,
                                               **self.balance_kwargs)

            # Train the model on the training data.
            self.learner.teach(X=X_train,
                               y=y_train,
                               only_new=True,
                               **self.fit_kwargs)

            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=min(
                                                  self.n_instances,
                                                  len(pool_idx)),
                                              query_kwargs=self.query_kwargs)

            # Log the probabilities of samples in the pool being included.
            pred_proba = self.query_kwargs.get('pred_proba', [])
            if len(pred_proba) == 0:
                pred_proba = self.learner.predict_proba(self.X[pool_idx])
            self._logger.add_proba(pool_idx, pred_proba)

            # Log the probabilities of samples that were trained.
            pred_proba_train = self.learner.predict_proba(self.X[train_idx])
            self._logger.add_proba(train_idx,
                                   pred_proba_train,
                                   logname="train_proba")

            # Classify the queried papers.
            self.y[query_idx] = self._classify(query_idx)
            self._logger.add_labels(self.y)

            # Update training/pool indices
            train_idx = np.append(train_idx, query_idx)
            pool_idx = np.delete(np.arange(n_samples), train_idx, axis=0)

            # update the query counter
            query_i += 1

            # Save the result to a file
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""

        self._logger.save(*args, **kwargs)
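# Review is abstract. A minimal, hypothetical subclass sketch showing the two
# hooks a concrete reviewer must provide; this "oracle" variant simply reads
# labels from a known ground truth, purely for illustration:

class OracleReview(Review):
    def _prior_knowledge(self):
        # seed with the indices passed in as prior knowledge
        init_idx = np.array(self.prior_included + self.prior_excluded)
        return init_idx, self.y[init_idx]

    def _classify(self, ind):
        # a real review would ask a human here; the oracle reads the labels
        return self.y[ind]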
Example #8
def get_AL_predict(test_feature, choose_feature, unlabel_feature, test_query, choose_query, choose_answer, unlabel_query, unlabel_answer, rec_api_test, rec_api_choose, rec_api_unlabel, w2v, idf):
    unlabel_feedback_info = feedback.get_feedback_inf(unlabel_query, choose_query, choose_answer, rec_api_unlabel, w2v, idf)
    label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    X_train, y_train = get_active_data(unlabel_feedback_info, unlabel_feature)
    X_feedback, y_feedback = get_active_data(label_feedback_info, choose_feature)

    # initializing the active learner
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=X_feedback, y_training=y_feedback
    )

    predict, sel_query, add_unlabel_feature = [], [], []
    if len(unlabel_query) > 0:
        # pool-based sampling
        n_queries = 100
        sel_idx, sel_label = [], []
        for idx in range(n_queries):
            # query_idx, query_instance = learner.query(X=X_train)
            query_idx, query_instance = uncertainty_sampling(classifier=learner, X=X_train)
            idx = int(query_idx[0]) // 10  # each query owns a block of 10 rows
            learner.teach(
                X=X_train[query_idx].reshape(1, -1),
                y=y_train[query_idx].reshape(1, )
            )

            # add queried instance into FR
            choose_query.append(unlabel_query[idx])
            choose_answer.append(unlabel_answer[idx])
            rec_api_choose.extend(rec_api_unlabel[idx*10:idx*10+10])
            choose_feature.extend(unlabel_feature[idx*10:idx*10+10])

            # remove queried instance from pool
            for i in range(10):
                X_train = np.delete(X_train, idx*10, axis=0)
                y_train = np.delete(y_train, idx*10)
            del unlabel_query[idx]
            del unlabel_answer[idx]
            del rec_api_unlabel[idx*10:idx*10+10]
            del unlabel_feature[idx*10:idx*10+10]
            if len(X_train) == 0:
                break

    add_label_feedback_info = feedback.get_feedback_inf(choose_query, choose_query, choose_answer, rec_api_choose, w2v, idf)
    new_X_feedback, new_y_feedback = get_active_data(add_label_feedback_info, choose_feature)
    learner = ActiveLearner(
        # estimator=KNeighborsClassifier(n_neighbors=4),
        estimator=LogisticRegression(penalty='l1', solver='liblinear'),
        X_training=new_X_feedback, y_training=new_y_feedback
    )
    feedback_info = feedback.get_feedback_inf(test_query, choose_query, choose_answer, rec_api_test, w2v, idf)
    X = split_data.get_test_feature_matrix(feedback_info, test_feature)

    X_test = np.array(X)
    # use the model trained on the feedback data to predict the test data
    for query_idx in range(len(X_test)):
        y_pre = learner.predict_proba(X=X_test[query_idx].reshape(1, -1))
        predict.append(float(y_pre[0, 1]))

    return predict, X, new_X_feedback, new_y_feedback
Example #9
class BaseReview(ABC):
    """Base class for Systematic Review"""
    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=max_sampling,
                 train_data_fn=full_sample,
                 n_instances=1,
                 n_queries=1,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 fit_kwargs={},
                 balance_kwargs={},
                 query_kwargs={},
                 logger=None,
                 verbose=1):
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)

        # Default to Naive Bayes model
        if model is None:
            print("Warning: using naive Bayes model as default."
                  "If you experience bad performance, read the documentation"
                  " in order to implement a RNN based solution.")
            from asreview.models import create_nb_model
            model = create_nb_model()

        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn

        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = fit_kwargs
        self.balance_kwargs = balance_kwargs
        self.query_kwargs = query_kwargs

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        self.query_kwargs["src_query_idx"] = {}

        if logger is None:
            self._logger = Logger()
            self.start_from_logger = False
        else:
            self._logger = logger
            self._prepare_with_logger()
            self.start_from_logger = True

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

    @classmethod
    def from_logger(cls, *args, **kwargs):
        reviewer = cls(*args, **kwargs)
        reviewer._prepare_with_logger()
        return reviewer

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _get_labels(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""

        pass

    def _stop_iter(self, query_i, n_pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """

        stop_iter = False

        # if the pool is empty, always stop
        if n_pool == 0:
            stop_iter = True

        # stop once the query limit is reached (n_queries=None means no limit)
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def _prepare_with_logger(self):
        """ If we start the reviewer from a log file, we need to do some
            preparation work. The final result should be a log dictionary in
            a state where the labelled papers are one step ahead of the
            probabilities. Any excess probabilities (pool_proba and
            train_proba) are thrown away and recomputed.
        """
        query_i = 0
        train_idx = []
        if "labels" in self._logger._log_dict:
            self.y = np.array(self._logger._log_dict["labels"])
        qk = query_key(query_i)

        # Capture the labelled indices from the log file.
        while qk in self._logger._log_dict:
            new_labels = self._logger._log_dict[qk]["labelled"]
            label_idx = [x[0] for x in new_labels]
            inclusions = [x[1] for x in new_labels]
            self.y[label_idx] = inclusions
            train_idx.extend(label_idx)
            query_i += 1
            qk = query_key(query_i)
        query_i -= 1

        # Throw away the last probabilities if they have the same key
        # as the query. These values should be overwritten, since we're
        # starting out by training the model again.
        if query_i >= 0:
            qk = query_key(query_i)
            self._logger._log_dict[qk].pop("pool_proba", None)
            self._logger._log_dict[qk].pop("train_proba", None)

        self.train_idx = np.array(train_idx, dtype=int)
        self.query_i = query_i
        self.query_kwargs["src_query_idx"] = self._logger.get_src_query_idx()

    def review(self, stop_after_class=True):
        """ Do the systematic review, writing the results to the log file. """

        if not self.start_from_logger:
            # add prior knowledge
            init_idx, init_labels = self._prior_knowledge()
            self.y[init_idx] = init_labels

            self.query_i = 0
            self.train_idx = init_idx.copy()

            self._logger.add_labels(self.y)
            self.query_kwargs['last_bounds'] = [("random", 0, len(init_idx))]
            self.log_query(init_idx)

        # train the algorithm with prior knowledge
        self.train()
        if self.model_trained:
            self.log_probabilities()
        n_pool = self.X.shape[0] - len(self.train_idx)

        while not self._stop_iter(self.query_i - 1, n_pool):

            # STEP 1: Make a new query
            query_idx = self.query(n_instances=min(self.n_instances, n_pool))

            # STEP 2: Classify the queried papers.
            self.y[query_idx] = self._get_labels(query_idx)
            self._logger.add_labels(self.y)

            # STEP 3: Run inference (if necessary) and log the probabilities of
            # the model.
            self.train_idx = np.append(self.train_idx, query_idx)
            self.log_query(query_idx)

            # Option to stop after the classification set instead of training.
            if stop_after_class and self._stop_iter(self.query_i, n_pool):
                return

            # STEP 4: Train the algorithm with new data
            # Update the training data and pool afterwards
            self.train()
            if self.model_trained:
                self.log_probabilities()

            # STEP 5: Write all results to the logger
            # Update the query counter
            self.query_i += 1
            n_pool = self.X.shape[0] - len(self.train_idx)

            # Save the result to a file
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")

    def log_probabilities(self):
        pool_idx = get_pool_idx(self.X, self.train_idx)

        # Log the probabilities of samples in the pool being included.
        pred_proba = self.query_kwargs.get('pred_proba', np.array([]))
        if len(pred_proba) == 0:
            pred_proba = self.learner.predict_proba(self.X)
        self._logger.add_proba(pool_idx, pred_proba[pool_idx])

        # Log the probabilities of samples that were trained.
        self._logger.add_proba(self.train_idx,
                               pred_proba[self.train_idx],
                               logname="train_proba")

    def log_query(self, query_idx):
        self._logger.add_training_log(query_idx, self.y[query_idx])
        self._logger.add_query_info(self.query_kwargs)

    def query(self, n_instances):
        """Query new results."""

        pool_idx = get_pool_idx(self.X, self.train_idx)

        n_instances = min(n_instances, len(pool_idx))
        if not self.model_trained:
            query_idx = pool_idx[np.random.choice(len(pool_idx), n_instances,
                                                  replace=False)]
        else:
            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=n_instances,
                                              query_kwargs=self.query_kwargs)
        return query_idx

    def classify(self, query_idx, inclusions):
        self.y[query_idx] = inclusions
        self.train_idx = np.unique(np.append(self.train_idx, query_idx))
        self._logger.add_training_log(query_idx, inclusions)

    def train(self):
        """Teach the algorithm with new data."""

        num_zero = np.count_nonzero(self.y == 0)
        num_one = np.count_nonzero(self.y == 1)
        if num_zero == 0 or num_one == 0:
            return
        # Get the training data.
        X_train, y_train = self.train_data(self.X, self.y, self.train_idx,
                                           **self.balance_kwargs)

        # Train the model on the training data.
        self.learner.teach(X=X_train,
                           y=y_train,
                           only_new=True,
                           **self.fit_kwargs)
        self.query_kwargs["pred_proba"] = self.learner.predict_proba(self.X)
        self.model_trained = True

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""

        self._logger.save(*args, **kwargs)

    def to_pickle(self, pickle_fp):
        try:
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
        except TypeError:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            self.model.model.save(model_fp)
            current_model = self.model.__dict__.pop("model", None)
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
            setattr(self.model, "model", current_model)

    @classmethod
    def from_pickle(cls, pickle_fp):
        with open(pickle_fp, "rb") as fp:
            my_instance = dill.load(fp)
        try:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            current_model = load_model(model_fp)
            setattr(my_instance.model, "model", current_model)
        except Exception:
            pass
        return my_instance
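# The Logger and log-file layout are not shown in this snippet. A rough sketch
# of the _log_dict structure that _prepare_with_logger appears to assume
# (hypothetical; key names inferred from the lookups above, with query_key(i)
# presumably mapping the query counter to a string key):

example_log_dict = {
    "labels": [1, 0, -1, -1],          # -1 ~ NOT_AVAILABLE (unlabelled)
    "0": {                             # query_key(0)
        "labelled": [(0, 1), (1, 0)],  # (paper index, inclusion) pairs
        "pool_proba": [],              # recomputed on resume, so discarded
        "train_proba": [],
    },
}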
Example #10
# assembling initial training set
n_initial = 5
initial_idx = np.random.choice(range(len(X_full)), size=n_initial, replace=False)
X_train, y_train = X_full[initial_idx], y_full[initial_idx]

# initialize the learner
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
print('Initial prediction accuracy: %f' % learner.score(X_full, y_full))

# visualizing the initial prediction
with plt.style.context('seaborn-white'):
    plt.figure(figsize=(7, 7))
    prediction = learner.predict_proba(X_full)[:, 1]
    plt.imshow(prediction.reshape(im_width, im_height))
    plt.title('Initial prediction accuracy: %f' % learner.score(X_full, y_full))
    plt.show()

"""
The instances are randomly selected one by one; if an instance's uncertainty
is above a threshold, its label is requested and shown to the learner. The
process continues until the learner reaches a previously defined accuracy.
"""

# learning until the accuracy reaches a given threshold
while learner.score(X_full, y_full) < 0.90:
    stream_idx = np.random.choice(range(len(X_full)))
    if classifier_uncertainty(learner, X_full[stream_idx].reshape(1, -1)) >= 0.4:
        learner.teach(X_full[stream_idx].reshape(1, -1), y_full[stream_idx].reshape(-1, ))
Example #11
)
# map the intensity values against the grid
y_full = np.asarray([data[P[0], P[1]] for P in X_full])
X_pool = deepcopy(X_full)
y_pool = deepcopy(y_full)

# assembling initial training set
initial_idx = [0, im_height-1, im_height*(im_height-1), -1, im_width//2 + im_height//2*im_height]
X_train, y_train = X_pool[initial_idx], y_pool[initial_idx]

# create an ActiveLearner instance
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    X_training=X_train, y_training=y_train
)
initial_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

n_queries = 100
for round_idx in range(n_queries):
    query_idx, query_inst = learner.query(X_pool)
    learner.teach(X_pool[query_idx].reshape(1, -1), y_pool[query_idx].reshape(-1, ))
    X_pool = np.delete(X_pool, query_idx, axis=0)
    y_pool = np.delete(y_pool, query_idx)

final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_width)

# learning with randomly selected queries instead of active learning
random_idx = initial_idx + list(np.random.choice(range(len(X_full)), n_queries, replace=False))
X_train, y_train = X_full[initial_idx], y_full[initial_idx]
random_learner = ActiveLearner(
    estimator=RandomForestClassifier(),
Example #12
for index in range(N_QUERIES):
    query_index, query_instance = learner.query(X_pool)
    X, y = X_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    learner.teach(X=X, y=y)
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index)
    model_accuracy = learner.score(X_raw, y_raw)
    print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1,
                                                        acc=model_accuracy))
    performance_history.append(model_accuracy)
    q_index.append(int(query_index))

result = []
for i in range(len(X_pool)):
    data_for_prediction_array = (X_pool[i].reshape(1, -1))
    result.append(learner.predict_proba(data_for_prediction_array))
con = []    # the larger of the two class probabilities per sample
con_1 = []  # the smaller of the two class probabilities per sample
for i in range(len(result)):
    if result[i][0][0] > result[i][0][1]:
        con.append(result[i][0][0])
        con_1.append(result[i][0][1])
    else:
        con.append(result[i][0][1])
        con_1.append(result[i][0][0])
# `all` is an index collection defined earlier in the (truncated) script
remaining = [i for i in list(all) if i not in list(training_indices)]
for i in q_index:
    del remaining[i]
data = {
    'Attack-Stage': df.iloc[remaining, 0],
    'Port-Service': df.iloc[remaining, 1],
Example #13
class BaseReview(ABC):
    """Base class for Systematic Review"""
    def __init__(self,
                 X,
                 y=None,
                 model=None,
                 query_strategy=max_sampling,
                 train_data_fn=full_sample,
                 n_papers=None,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_queries=None,
                 prior_included=[],
                 prior_excluded=[],
                 log_file=None,
                 fit_kwargs={},
                 balance_kwargs={},
                 query_kwargs={},
                 logger=None,
                 verbose=1):
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)
        self.y = np.array(self.y, dtype=int)
        # Default to Naive Bayes model
        if model is None:
            print("Warning: using naive Bayes model as default."
                  "If you experience bad performance, read the documentation"
                  " in order to implement a RNN based solution.")
            from asreview.models import create_nb_model
            model = create_nb_model()

        self.model = model
        self.query_strategy = query_strategy
        self.train_data = train_data_fn

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.fit_kwargs = fit_kwargs
        self.balance_kwargs = balance_kwargs
        self.query_kwargs = query_kwargs

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        self.query_kwargs["query_src"] = {}
        self.query_kwargs["current_queries"] = {}

        if logger is None:
            self._logger = Logger()
            self.start_from_logger = False
        else:
            self._logger = logger
            self._prepare_with_logger()
            self.start_from_logger = True

        # Initialize learner, but don't start training yet.
        self.learner = ActiveLearner(estimator=self.model,
                                     query_strategy=self.query_strategy)

        if not self.start_from_logger:
            # add prior knowledge
            init_idx, init_labels = self._prior_knowledge()
            self.query_i = 0
            self.train_idx = np.array([], dtype=np.int)
            self.classify(init_idx, init_labels, method="initial")

    @classmethod
    def from_logger(cls, *args, **kwargs):
        reviewer = cls(*args, **kwargs)
        reviewer._prepare_with_logger()
        return reviewer

    @abstractmethod
    def _prior_knowledge(self):
        pass

    @abstractmethod
    def _get_labels(self, ind):
        """Classify the provided indices."""
        pass

    def _prior_teach(self):
        """Function called before training model."""
        pass

    def _stop_iter(self, query_i, n_pool):
        """Criteria for stopping iteration.

        Stop iterating if:
            - n_queries is reached
            - the pool is empty
        """

        stop_iter = False
        n_train = self.X.shape[0] - n_pool

        # if the pool is empty, always stop
        if n_pool == 0:
            stop_iter = True

        # If we are exceeding the number of papers, stop.
        if self.n_papers is not None and n_train >= self.n_papers:
            stop_iter = True

        # stop once the query limit is reached (n_queries=None means no limit)
        if self.n_queries is not None and query_i >= self.n_queries:
            stop_iter = True

        return stop_iter

    def n_pool(self):
        return self.X.shape[0] - len(self.train_idx)

    def _next_n_instances(self):  # Could be merged with _stop_iter someday.
        """ Get the batch size for the next query. """
        n_instances = self.n_instances
        n_pool = self.n_pool()

        n_instances = min(n_instances, n_pool)
        if self.n_papers is not None:
            papers_left = self.n_papers - len(self.train_idx)
            n_instances = min(n_instances, papers_left)
        return n_instances

    def _prepare_with_logger(self):
        """ If we start the reviewer from a log file, we need to do some
            preparation work. The final result should be a log dictionary in
            a state where the labelled papers are one step ahead of the
            probabilities. Any excess probabilities (pool_proba and
            train_proba) are thrown away and recomputed.
        """
        query_i = 0
        train_idx = []
        if "labels" in self._logger._log_dict:
            self.y = np.array(self._logger._log_dict["labels"], dtype=int)
        qk = query_key(query_i)

        # Capture the labelled indices from the log file.
        while qk in self._logger._log_dict:
            if "labelled" not in self._logger._log_dict[qk]:
                query_i += 1
                qk = query_key(query_i)
                continue
            new_labels = self._logger._log_dict[qk]["labelled"]
            label_methods = self._logger._log_dict[qk]["label_methods"]
            label_idx = [x[0] for x in new_labels]
            inclusions = [x[1] for x in new_labels]
            self.y[label_idx] = inclusions
            train_idx.extend(label_idx)

            # Update the internal query sources.
            start_idx = 0
            for method in label_methods:
                if method[0] not in self.query_kwargs["query_src"]:
                    self.query_kwargs["query_src"][method[0]] = []
                self.query_kwargs["query_src"][method[0]].extend(
                    label_idx[start_idx:start_idx + method[1]])
                start_idx += method[1]
            query_i += 1
            qk = query_key(query_i)

        query_i -= 1
        if query_i > 0:
            qk = query_key(query_i)
            if "labelled" not in self._logger._log_dict[qk]:
                query_i -= 1

        self.train_idx = np.array(train_idx, dtype=int)
        self.query_i = query_i

    def review(self, stop_after_class=True, instant_save=False):
        """ Do the systematic review, writing the results to the log file. """

        if self._stop_iter(self.query_i, self.n_pool()):
            return

        # train the algorithm with prior knowledge
        self.train()
        if self.model_trained:
            self.log_probabilities()
            if self.log_file:
                self.save_logs(self.log_file)

        n_pool = self.X.shape[0] - len(self.train_idx)

        while not self._stop_iter(self.query_i - 1, n_pool):

            # STEP 1: Make a new query
            query_idx = self.query(n_instances=self._next_n_instances())

            # STEP 2: Classify the queried papers.
            if instant_save:
                for idx in query_idx:
                    idx_array = np.array([idx], dtype=int)
                    self.classify(idx_array, self._get_labels(idx_array))
            else:
                self.classify(query_idx, self._get_labels(query_idx))

            # Option to stop after the classification set instead of training.
            if stop_after_class and self._stop_iter(self.query_i,
                                                    self.n_pool()):
                if self.log_file:
                    self.save_logs(self.log_file)
                    if self.verbose:
                        print(f"Saved results in log file: {self.log_file}")
                return

            # STEP 3: Train the algorithm with new data
            # Update the training data and pool afterwards
            self.train()
            if self.model_trained:
                self.log_probabilities()

            # STEP 4: Save the logs.
            if self.log_file:
                self.save_logs(self.log_file)
                if self.verbose:
                    print(f"Saved results in log file: {self.log_file}")

    def log_probabilities(self):
        """ Store the modeling probabilities of the training indices and
            pool indices. """
        pool_idx = get_pool_idx(self.X, self.train_idx)

        # Log the probabilities of samples in the pool being included.
        pred_proba = self.query_kwargs.get('pred_proba', np.array([]))
        if len(pred_proba) == 0:
            pred_proba = self.learner.predict_proba(self.X)
        self._logger.add_proba(pool_idx,
                               pred_proba[pool_idx],
                               logname="pool_proba",
                               i=self.query_i)

        # Log the probabilities of samples that were trained.
        self._logger.add_proba(self.train_idx,
                               pred_proba[self.train_idx],
                               logname="train_proba",
                               i=self.query_i)

    def query(self, n_instances):
        """Query new results."""

        pool_idx = get_pool_idx(self.X, self.train_idx)

        n_instances = min(n_instances, len(pool_idx))

        # If the model is not trained, choose random papers.
        if not self.model_trained:
            query_idx, _ = random_sampling(None,
                                           X=self.X,
                                           pool_idx=pool_idx,
                                           n_instances=n_instances,
                                           query_kwargs=self.query_kwargs)

        else:
            # Make a query from the pool.
            query_idx, _ = self.learner.query(X=self.X,
                                              pool_idx=pool_idx,
                                              n_instances=n_instances,
                                              query_kwargs=self.query_kwargs)
        return query_idx

    def classify(self, query_idx, inclusions, method=None):
        """ Classify new papers and update the training indices. """
        query_idx = np.array(query_idx, dtype=int)
        self.y[query_idx] = inclusions
        query_idx = query_idx[np.isin(query_idx, self.train_idx, invert=True)]
        self.train_idx = np.append(self.train_idx, query_idx)
        if method is None:
            methods = []
            for idx in query_idx:
                method = self.query_kwargs["current_queries"].pop(idx, None)
                if method is None:
                    method = "unknown"
                methods.append([idx, method])
                if method in self.query_kwargs["query_src"]:
                    self.query_kwargs["query_src"][method].append(idx)
                else:
                    self.query_kwargs["query_src"][method] = [idx]
        else:
            methods = [[idx, method] for idx in query_idx]
            if method in self.query_kwargs["query_src"]:
                self.query_kwargs["query_src"][method].extend(
                    query_idx.tolist())
            else:
                self.query_kwargs["query_src"][method] = query_idx.tolist()

        self._logger.add_classification(query_idx,
                                        inclusions,
                                        methods=methods,
                                        i=self.query_i)
        self._logger.add_labels(self.y)

    def train(self):
        """ Train the model. """

        num_zero = np.count_nonzero(self.y[self.train_idx] == 0)
        num_one = np.count_nonzero(self.y[self.train_idx] == 1)
        if num_zero == 0 or num_one == 0:
            return

        # Get the training data.
        X_train, y_train = self.train_data(self.X, self.y, self.train_idx,
                                           **self.balance_kwargs)

        # Train the model on the training data.
        self.learner.teach(X=X_train,
                           y=y_train,
                           only_new=True,
                           **self.fit_kwargs)
        self.query_kwargs["pred_proba"] = self.learner.predict_proba(self.X)
        self.model_trained = True
        self.query_i += 1

    def statistics(self):
        n_initial = 0
        try:
            initial_meth = self._logger._log_dict["0"]["label_methods"][0]
            if initial_meth[0] == "initial":
                n_initial = initial_meth[1]
        except (IndexError, KeyError):
            pass

        try:
            if np.count_nonzero(self.y[self.train_idx[n_initial:]] == 1) == 0:
                last_inclusion = len(self.train_idx[n_initial:])
            else:
                last_inclusion = np.nonzero(
                    self.y[self.train_idx[n_initial:]][::-1] == 1)[0][0]
        except ValueError:
            last_inclusion = 0
        stats = {
            "n_included": np.count_nonzero(self.y[self.train_idx] == 1),
            "n_excluded": np.count_nonzero(self.y[self.train_idx] == 0),
            "n_papers": len(self.y),
            "n_reviewed": len(self.train_idx),
            "n_pool": self.n_pool(),
            "last_inclusion": last_inclusion,
            "n_initial": n_initial,
        }
        return stats

    def save_logs(self, *args, **kwargs):
        """Save the logs to a file."""

        self._logger.save(*args, **kwargs)

    def save(self, pickle_fp):
        """
        Dump the object to a pickle file (using dill). Keras models
        cannot be dumped, so they are written to a separate h5 file. The
        model is briefly popped out of the object to allow the rest to be
        written to a file. Do not rely on this method for long-term storage
        of the class, since library changes could easily break it. In those
        cases, use the log + h5 file instead.
        """
        if isinstance(self.model, KerasClassifier) and self.model_trained:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            self.model.model.save(model_fp)
            current_model = self.model.__dict__.pop("model", None)
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)
            setattr(self.model, "model", current_model)
        else:
            with open(pickle_fp, "wb") as fp:
                dill.dump(self, fp)

    @classmethod
    def load(cls, pickle_fp):
        """
        Create a BaseReview object from a pickle file, and optiona h5 file.
        """
        with open(pickle_fp, "rb") as fp:
            my_instance = dill.load(fp)
        try:
            model_fp = os.path.splitext(pickle_fp)[0] + ".h5"
            current_model = load_model(model_fp)
            setattr(my_instance.model, "model", current_model)
        except Exception:
            pass
        return my_instance
Example #15
x_seed_vec = vec.transform(x_seed)
x_pool_vec = vec.transform(x_pool)

# Build learner
# pipe = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 1), tokenizer=tokenize)),
#                  ('model', LogisticRegression(C=1))])

learner = ActiveLearner(estimator=LogisticRegression(C=10, solver='lbfgs'),
                        X_training=x_seed_vec,
                        y_training=np.array(y_seed))

for n in range(5):
    
    query_idx, query_inst = learner.query(x_pool_vec)
    
    recipe = x_pool[query_idx].index.values[0]
    print(recipes[recipe]['recipe'])
    print('0/1?')
    response = int(input())
    print('\n')
    learner.teach(x_pool_vec[query_idx].reshape(1, -1), np.array([response]))
    
# Return the recommended recipe

pred = learner.predict_proba(x_pool_vec)[:,1]
max_pred = np.argmax(pred)
rec = x_pool.index[max_pred]

print('Your recommended recipe is:')
print(recipes[rec]['recipe'])