Example #1
    def __init__(self,
                 cluster_size=350,
                 update_interval=200,
                 random_state=None,
                 **kwargs):
        """Initialize the clustering strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the pool is too small to form
            more than one cluster, fall back to max sampling.
        update_interval: int
            Update the clustering every x instances.
        random_state: int, RandomState
            State/seed of the RNG.
        **kwargs: dict
            Keyword arguments for the doc2vec feature model.
        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
Example #2
    def __init__(self,
                 cluster_size=350,
                 update_interval=200,
                 random_state=None):
        """Initialize the clustering strategy.

        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
Example #3
    def __init__(self,
                 cluster_size=350,
                 update_interval=200,
                 random_state=None):
        """Initialize the clustering strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the pool is too small to form
            more than one cluster, fall back to max sampling.
        update_interval: int
            Update the clustering every x instances.
        random_state: int, RandomState
            State/seed of the RNG.
        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
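The constructor above only stores these settings; how they interact is visible in the _query method shown in Example #5 below. As a minimal standalone sketch of the cluster-count rule used there (n_clusters = round(pool_size / cluster_size); the pool sizes are made-up toy values):

def n_clusters_for(pool_size, cluster_size=350):
    # Same rounding rule as ClusterQuery._query (see Example #5).
    return round(pool_size / cluster_size)

for pool_size in (200, 600, 2000):
    n = n_clusters_for(pool_size)
    if n <= 1:
        # Too small to form more than one cluster: fall back to max sampling.
        print(f"pool={pool_size}: fall back to max sampling")
    else:
        print(f"pool={pool_size}: {n} clusters")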
Example #4
    def __init__(
        self,
        X,
        y=None,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        prior_included=[],
        prior_excluded=[],
        log_file=None,
        final_labels=None,
        verbose=1,
        data_fp=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        X: np.array
            The feature matrix for the current dataset.
        y: np.array
            Labels of each paper, 1 for included, 0 for excluded.
            Can be set to None, to indicate inclusion data is not available.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. They might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        prior_included: list
            List of papers (ids) that are included a priori.
        prior_excluded: list
            List of papers (ids) that are excluded a priori.
        log_file: str
            Path to log file.
        final_labels: np.array
            Final labels if we're using a two step inclusion process.
            For example, if at one step a paper is considered after reading the
            abstract and then at the second step, a final decision is made on
            the basis of the full text.
        verbose: int
            Verbosity level.
        data_fp: str
            Path to the data file.
        """
        super(BaseReview, self).__init__()

        self.X = X
        self.y = y
        if y is None:
            self.y = np.full(X.shape[0], NOT_AVAILABLE)
        self.y = np.array(self.y, dtype=int)
        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            raise ValueError("Supply feature model!")

        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.log_file = log_file
        self.verbose = verbose

        self.prior_included = prior_included
        self.prior_excluded = prior_excluded

        self.query_i = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.data_fp = data_fp

        with open_logger(log_file) as logger:
            if not logger.is_empty():
                y, train_idx, query_src, query_i = logger.review_state()
                if X.shape[0] != len(y):
                    raise ValueError("The log file does not correspond to the "
                                     "given data file, please use another log "
                                     "file or dataset.")
                self.y = y
                self.train_idx = train_idx
                self.shared["query_src"] = query_src
                self.query_i = query_i
            else:
                if final_labels is not None:
                    logger.set_final_labels(final_labels)
                logger.set_labels(self.y)
                logger.add_settings(self.settings)
                self._prior_knowledge(logger)
                self.query_i = 0
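Note how the constructor hands one and the same shared dictionary to the classification, query, and balance models, so all three can exchange query provenance without an explicit messaging layer. A minimal standalone sketch of this pattern (Component is a hypothetical stand-in, not an asreview class):

class Component:
    """Hypothetical stand-in for a model holding a reference to shared state."""
    def __init__(self):
        self.shared = None  # wired up by the owner, as in BaseReview

shared = {"query_src": {}, "current_queries": {}}
a, b = Component(), Component()
a.shared = shared
b.shared = shared

# A write through one component is immediately visible to the other.
a.shared["query_src"]["initial"] = [0, 1, 2]
assert b.shared["query_src"]["initial"] == [0, 1, 2]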
Example #5
class ClusterQuery(ProbaQueryStrategy):
    "Query strategy using clustering algorithms."
    name = "cluster"

    def __init__(self, cluster_size=350, update_interval=200, **kwargs):
        """Initialize the clustering strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the pool is too small to form
            more than one cluster, fall back to max sampling.
        update_interval: int
            Update the clustering every x instances.
        **kwargs: dict
            Keyword arguments for the doc2vec feature model.
        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()

    def _query(self, X, pool_idx, n_instances, proba):
        n_samples = X.shape[0]
        if pool_idx is None:
            pool_idx = np.arange(n_samples)

        last_update = self.last_update
        if (last_update is None or self.update_interval is None
                or last_update - len(pool_idx) >= self.update_interval):
            n_clusters = round(len(pool_idx) / self.cluster_size)
            if n_clusters <= 1:
                return self.fallback_model._query(X,
                                                  pool_idx=pool_idx,
                                                  n_instances=n_instances,
                                                  proba=proba)
            model = KMeans(n_clusters=n_clusters, n_init=1)
            self.clusters = model.fit_predict(X)
            self.last_update = len(pool_idx)

        clusters = {}
        for idx in pool_idx:
            cluster_id = self.clusters[idx]
            if cluster_id in clusters:
                clusters[cluster_id].append((idx, proba[idx, 1]))
            else:
                clusters[cluster_id] = [(idx, proba[idx, 1])]

        # Sort each cluster by inclusion probability, ascending, so that
        # pop() below returns the most probable remaining instance.
        for cluster_id in clusters:
            clusters[cluster_id].sort(key=lambda x: x[1])

        clust_idx = []
        cluster_ids = list(clusters)
        for _ in range(n_instances):
            cluster_id = np.random.choice(cluster_ids, 1)[0]
            clust_idx.append(clusters[cluster_id].pop()[0])
            if len(clusters[cluster_id]) == 0:
                del clusters[cluster_id]
                cluster_ids = list(clusters)

        clust_idx = np.array(clust_idx, dtype=int)

        return clust_idx, X[clust_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "qry_cluster_size": hp.quniform('qry_cluster_size', 50, 1000, 1),
            "qry_update_cluster": hp.quniform('qry_update_cluster', 100, 300,
                                              1),
            "qry_max_frac": hp.uniform('qry_max_frac', 0, 1),
        }
        return parameter_space, {}
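To see the selection logic of _query in isolation, here is a self-contained sketch on synthetic data: cluster the pool with KMeans, sort each cluster by inclusion probability, then pop the best remaining instance from a randomly chosen cluster. All sizes are toy values; real probabilities would come from a trained classifier.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(60, 5))        # toy feature matrix
proba = rng.uniform(size=(60, 2))   # stand-in for classifier probabilities

labels = KMeans(n_clusters=3, n_init=1).fit_predict(X)

clusters = {}
for idx in range(X.shape[0]):
    clusters.setdefault(labels[idx], []).append((idx, proba[idx, 1]))
for cid in clusters:
    clusters[cid].sort(key=lambda x: x[1])  # ascending, so pop() is the max

picked = []
for _ in range(5):
    cid = rng.choice(list(clusters))        # random cluster
    picked.append(clusters[cid].pop()[0])   # best remaining instance
    if not clusters[cid]:
        del clusters[cid]                   # drop exhausted clusters
print(picked)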
Example #6
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """Initialize base class for systematic reviews."""
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not yet in the training set,
                # add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
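The try/except KeyError block above implements a cache-or-compute pattern: the feature matrix is looked up in the state file under the dataset hash and only recomputed (and stored) on a miss. A stripped-down sketch of the same idea, with a plain dict standing in for the state file:

import numpy as np

feature_cache = {}  # stand-in for the state file's feature-matrix store

def get_features(texts, data_hash):
    try:
        return feature_cache[data_hash]          # cache hit
    except KeyError:
        X = np.array([[len(t)] for t in texts])  # toy "feature extraction"
        feature_cache[data_hash] = X             # store for next time
        return X

X1 = get_features(["abc", "de"], data_hash="h1")  # computed
X2 = get_features(["abc", "de"], data_hash="h1")  # served from cache
assert X1 is X2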
Example #7
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
        verbose=1,
        data_fp=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        X: np.array
            The feature matrix for the current dataset.
        y: np.array
            Labels of each paper, 1 for included, 0 for excluded.
            Can be set to None, to indicate inclusion data is not available.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. They might either resample or undersample
            specific papers.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        prior_included: list
            List of papers (ids) that are included a priori.
        prior_excluded: list
            List of papers (ids) that are excluded a priori.
        state_file: str
            Path to state file. Replaces log_file argument.
        final_labels: np.array
            Final labels if we're using a two step inclusion process.
            For example, if at one step a paper is considered after reading the
            abstract and then at the second step, a final decision is made on
            the basis of the full text.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file
        self.verbose = verbose

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.data_fp = data_fp

        with open_state(self.state_file) as state:
            if not state.is_empty():
                startup = state.startup_vals()
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
Example #8
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        as_data: asreview.ASReviewData
            The data object which contains the text, labels, etc.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. They might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        start_idx: numpy.array
            Start the simulation/review with these indices. They are assumed to
            be already labeled. Failing to do so might result in bad behaviour.
        state_file: str
            Path to state file. Replaces log_file argument.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not yet in the training set,
                # add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
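Because start_idx records are assumed to be labeled already, a simulation typically seeds it with at least one included and one excluded paper. A short sketch of picking such priors from a label vector (the labels and index choices here are arbitrary toy values):

import numpy as np

y = np.array([0, 1, 0, 0, 1, 0])  # toy labels: 1 = included, 0 = excluded

# One prior of each class, as a review simulation usually requires.
start_idx = np.array([np.where(y == 1)[0][0],
                      np.where(y == 0)[0][0]])
print(start_idx)  # [1 0]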
Example #9
class ClusterQuery(ProbaQueryStrategy):
    """Query strategy using clustering algorithms.

    Use clustering after feature extraction on the dataset. Then the highest
    probabilities within random clusters are sampled.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the pool is too small to form
        more than one cluster, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        State/seed of the RNG.
    """

    name = "cluster"

    def __init__(self,
                 cluster_size=350,
                 update_interval=200,
                 random_state=None):
        """Initialize the clustering strategy.

        """
        super(ClusterQuery, self).__init__()
        self.cluster_size = cluster_size
        self.update_interval = update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)

    def _query(self, X, pool_idx, n_instances, proba):
        n_samples = X.shape[0]
        if pool_idx is None:
            pool_idx = np.arange(n_samples)

        last_update = self.last_update
        if (last_update is None or self.update_interval is None
                or last_update - len(pool_idx) >= self.update_interval):
            n_clusters = round(len(pool_idx) / self.cluster_size)
            if n_clusters <= 1:
                return self.fallback_model._query(X,
                                                  pool_idx=pool_idx,
                                                  n_instances=n_instances,
                                                  proba=proba)
            model = KMeans(n_clusters=n_clusters,
                           n_init=1,
                           random_state=self._random_state)
            self.clusters = model.fit_predict(X)
            self.last_update = len(pool_idx)

        clusters = {}
        for idx in pool_idx:
            cluster_id = self.clusters[idx]
            if cluster_id in clusters:
                clusters[cluster_id].append((idx, proba[idx, 1]))
            else:
                clusters[cluster_id] = [(idx, proba[idx, 1])]

        # Sort each cluster by inclusion probability, ascending, so that
        # pop() below returns the most probable remaining instance.
        for cluster_id in clusters:
            clusters[cluster_id].sort(key=lambda x: x[1])

        clust_idx = []
        cluster_ids = list(clusters)
        for _ in range(n_instances):
            cluster_id = self._random_state.choice(cluster_ids, 1)[0]
            clust_idx.append(clusters[cluster_id].pop()[0])
            if len(clusters[cluster_id]) == 0:
                del clusters[cluster_id]
                cluster_ids = list(clusters)

        clust_idx = np.array(clust_idx, dtype=int)

        return clust_idx, X[clust_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "qry_cluster_size": hp.quniform('qry_cluster_size', 50, 1000, 1),
            "qry_update_interval": hp.quniform('qry_update_interval', 100, 300,
                                               1),
        }
        return parameter_space, {}
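Putting Example #9 together, a minimal direct use of the strategy could look as follows. The import path is an assumption about the asreview version these examples come from, and the probabilities are random stand-ins for a trained classifier's output:

import numpy as np
# Assumed import path; ClusterQuery may live elsewhere in other versions.
from asreview.query_strategies import ClusterQuery

rng = np.random.default_rng(42)
X = rng.normal(size=(1200, 20))      # toy feature matrix
proba = rng.uniform(size=(1200, 2))  # stand-in for classifier probabilities

query = ClusterQuery(cluster_size=300, update_interval=100, random_state=42)
idx, _ = query._query(X, pool_idx=None, n_instances=10, proba=proba)
print(idx)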