import pandas as pd

# Assumed context for this snippet: `open_state` is asreview's state-file
# context manager (its exact import path depends on the asreview version).
def probability_matrix_from_h5_state(state_fp):
    """Get the probability matrix from an .h5 state file.

    Arguments
    ---------
    state_fp: str
        Path to state file.

    Returns
    -------
    pandas.DataFrame:
        A dataframe of shape (num_papers, num_queries), where entry (i, j) is
        the probability that paper i is relevant according to the model at
        query j. Note that the row index starts at 0, but the column index
        starts at 1.
    """
    proba_dict = {}
    with open_state(state_fp, read_only=True) as state:
        queries = [int(num) for num in state.f['results'].keys()]
        total_queries = max(queries)

        for i in range(1, total_queries+1):
            proba_dict[i] = state.f[f'results/{i}/proba'][:]

    proba_matrix = pd.DataFrame.from_dict(proba_dict)
    return proba_matrix
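
A minimal usage sketch for this helper; the 'project.h5' path is a
hypothetical placeholder, not a file from the source:

proba_matrix = probability_matrix_from_h5_state('project.h5')

# Columns are query numbers starting at 1; rows are paper indices from 0.
first_query = proba_matrix[1]        # probabilities after query 1
top_paper = first_query.idxmax()     # paper the model ranked highest
print(f"Query 1: paper {top_paper} scored {first_query.max():.3f}")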
Example #2
from datetime import datetime

def create_time_list(state_fp):
    """Create a list of creation times from a state file.

    Arguments
    ---------
    state_fp: str
        Path to state file

    Returns
    -------
    list
        List of creation time of each query in the state_file results, in
        datetime format '%Y-%m-%d %H:%M:%S.%f'. Note that the first entry is
        the creation time of the 0'th query, not the 'start_time' of the state
        file. Similarly, the last entry is the creation time of the last query,
        not the 'end_time' of the state file.
    """
    time_list = []
    with open_state(state_fp) as state:
        for i in range(len(state.f['results'])):
            time = state.f[f'results/{i}'].attrs['creation_time']
            time = time.decode('UTF-8')
            time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S.%f')
            time_list.append(time)
    return time_list
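
A short sketch of what the list is good for, again with 'project.h5' as a
placeholder path: consecutive creation times give the wall-clock duration of
each query.

times = create_time_list('project.h5')
durations = [(b - a).total_seconds() for a, b in zip(times, times[1:])]
for i, seconds in enumerate(durations):
    print(f"query {i} -> query {i + 1}: {seconds:.1f}s")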
Example #3
    def review(self, *args, **kwargs):
        """Do the systematic review, writing the results to the state file.

        Arguments
        ---------
        stop_after_class: bool
            When to stop; if True stop after classification step, otherwise
            stop after training step.
        instant_save: bool
            If True, save results after each single classification.
        """
        with open_state(self.state_file) as state:
            self._do_review(state, *args, **kwargs)
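
Since the wrapper forwards everything to _do_review, the documented options
are passed as keyword arguments. A hedged sketch, assuming `reviewer` was
constructed elsewhere (e.g. via get_reviewer):

reviewer.review(stop_after_class=True, instant_save=True)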
Example #4
def get_reviewer(dataset,
                 mode="simulate",
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_idx=None,
                 prior_record_id=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 state_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 seed=None,
                 included_dataset=[],
                 excluded_dataset=[],
                 prior_dataset=[],
                 new=False,
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.
    """
    as_data = create_as_data(dataset,
                             included_dataset,
                             excluded_dataset,
                             prior_dataset,
                             new=new)

    if len(as_data) == 0:
        raise ValueError("Supply at least one dataset"
                         " with at least one record.")

    cli_settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_papers=n_papers,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    feature_extraction=feature_extraction,
                                    mode=mode,
                                    data_fp=None)
    cli_settings.from_file(config_file)

    if state_file is not None:
        with open_state(state_file) as state:
            if state.is_empty():
                state.settings = cli_settings
            settings = state.settings
    else:
        settings = cli_settings

    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers

    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    # Initialize models.
    random_state = get_random_state(seed)
    train_model = get_classifier(settings.model,
                                 **settings.model_param,
                                 random_state=random_state)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param,
                                  random_state=random_state)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param,
                                      random_state=random_state)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param,
                                      random_state=random_state)

    # LSTM models need embedding matrices.
    if train_model.name.startswith("lstm-"):
        texts = as_data.texts
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # prior knowledge
    if prior_idx is not None and prior_record_id is not None and \
            len(prior_idx) > 0 and len(prior_record_id) > 0:
        raise ValueError(
            "Not possible to provide both prior_idx and prior_record_id")
    if prior_record_id is not None and len(prior_record_id) > 0:
        prior_idx = convert_id_to_idx(as_data, prior_record_id)

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(as_data,
                                  model=train_model,
                                  query_model=query_model,
                                  balance_model=balance_model,
                                  feature_model=feature_model,
                                  n_papers=settings.n_papers,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  prior_idx=prior_idx,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  state_file=state_file,
                                  **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(as_data,
                                 model=train_model,
                                 query_model=query_model,
                                 balance_model=balance_model,
                                 feature_model=feature_model,
                                 n_papers=settings.n_papers,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 state_file=state_file,
                                 **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
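
A hedged example of driving this factory in simulation mode; the file names
and settings below are illustrative placeholders, not values from the source:

reviewer = get_reviewer(
    "papers.csv",        # any dataset format asreview can read
    mode="simulate",
    n_instances=10,      # papers queried per active-learning step
    n_prior_included=5,
    n_prior_excluded=5,
    state_file="simulation.h5",
)
reviewer.review()        # writes results to the state file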
Example #5
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file,
                    blocking=False,
                    lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info("Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id) as lock:
            # Get all labels since the last run. If there are no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                "Project {project_id} - No new labels since last run.")
            return

        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx,
                              inclusions,
                              state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        with SQLiteLock(lock_file,
                        blocking=True,
                        lock_name="active",
                        project_id=project_id) as lock:
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
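
The guard at the top works because SQLiteLock in non-blocking mode always
constructs successfully, and locked() reports whether the lock was actually
acquired. A stripped-down sketch of that pattern (arguments are placeholders):

def train_once(lock_file, project_id):
    with SQLiteLock(lock_file, blocking=False, lock_name="training",
                    project_id=project_id) as lock:
        if not lock.locked():
            return False  # another trainer holds the lock; skip this run
        # ... exclusive training work goes here ...
        return True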
Example #6
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """Initialize base class for systematic reviews."""
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NaiveBayesClassifier()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)  # np.int was removed in NumPy 1.24
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not yet in the training set, add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
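
The restore branch hinges on a superset test: set(train_idx) >= set(start_idx)
only fails when priors were requested that the state file has not seen yet,
and exactly those are then classified with method="initial". In plain Python:

train_idx = [0, 3, 7]    # already recorded in the state file
start_idx = [0, 3, 9]    # priors requested for this run

if not set(train_idx) >= set(start_idx):        # 9 is missing
    new_idx = list(set(start_idx) - set(train_idx))
    print(new_idx)  # [9] -> would be classified with method="initial"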
Example #7
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
        #                  final_labels=None,
        verbose=1,
        data_fp=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        as_data: asreview.ASReviewData
            The data object which contains the text, labels, etc.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. They might either resample or undersample
            specific papers.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        start_idx: numpy.array
            Start the simulation/review with these indices. They are assumed
            to be already labeled. Failing to do so might result in bad
            behaviour.
        state_file: str
            Path to state file. Replaces log_file argument.
        final_labels: np.array
            Final labels if we're using a two step inclusion process.
            For example, if at one step a paper is considered after reading the
            abstract and then at the second step, a final decision is made on
            the basis of the full text.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file
        self.verbose = verbose

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False
        self.data_fp = data_fp

        with open_state(self.state_file) as state:
            if not state.is_empty():
                startup = state.startup_vals()
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
Example #8
def open_logger(*args, **kwargs):
    warnings.warn("open_logger will be replaced by open_state.",
                  category=FutureWarning)
    return open_state(*args, **kwargs)
Example #9
File: base.py Project: openefsa/asreview
    def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
    ):
        """ Initialize base class for systematic reviews.

        Arguments
        ---------
        as_data: asreview.ASReviewData
            The data object which contains the text, labels, etc.
        model: BaseModel
            Initialized model to fit the data during active learning.
            See asreview.models.utils.py for possible models.
        query_model: BaseQueryModel
            Initialized model to query new instances for review, such as random
            sampling or max sampling.
            See asreview.query_strategies.utils.py for query models.
        balance_model: BaseBalanceModel
            Initialized model to redistribute the training data during the
            active learning process. They might either resample or undersample
            specific papers.
        feature_model: BaseFeatureModel
            Feature extraction model that converts texts and keywords to
            feature matrices.
        n_papers: int
            Number of papers to review during the active learning process,
            excluding the number of initial priors. To review all papers, set
            n_papers to None.
        n_instances: int
            Number of papers to query at each step in the active learning
            process.
        n_queries: int
            Number of steps/queries to perform. Set to None for no limit.
        start_idx: numpy.array
            Start the simulation/review with these indices. They are assumed to
            be already labeled. Failing to do so might result in bad behaviour.
        state_file: str
            Path to state file. Replaces log_file argument.
        """
        super(BaseReview, self).__init__()

        # Default to Naive Bayes model
        if model is None:
            model = NBModel()
        if query_model is None:
            query_model = MaxQuery()
        if balance_model is None:
            balance_model = SimpleBalance()
        if feature_model is None:
            feature_model = Tfidf()

        self.as_data = as_data
        self.y = as_data.labels
        if self.y is None:
            self.y = np.full(len(as_data), LABEL_NA)
        self.model = model
        self.balance_model = balance_model
        self.query_model = query_model
        self.feature_model = feature_model

        self.shared = {"query_src": {}, "current_queries": {}}
        self.model.shared = self.shared
        self.query_model.shared = self.shared
        self.balance_model.shared = self.shared

        self.n_papers = n_papers
        self.n_instances = n_instances
        self.n_queries = n_queries
        self.start_idx = start_idx

        if log_file is not None:
            warnings.warn(
                "The log_file argument for BaseReview will be"
                " replaced by state_file.",
                category=FutureWarning)
            self.state_file = log_file
        else:
            self.state_file = state_file

        self.query_i = 0
        self.query_i_classified = 0
        self.train_idx = np.array([], dtype=int)
        self.model_trained = False

        # Restore the state from a file or initialize said file.
        with open_state(self.state_file) as state:
            # From file
            if not state.is_empty():
                startup = state.startup_vals()
                # If there are start indices not yet in the training set, add them.
                if not set(startup["train_idx"]) >= set(start_idx):
                    new_idx = list(set(start_idx) - set(startup["train_idx"]))
                    self.classify(new_idx,
                                  self.y[new_idx],
                                  state,
                                  method="initial")
                    startup = state.startup_vals()
                self.train_idx = startup["train_idx"]
                self.y = startup["labels"]
                self.shared["query_src"] = startup["query_src"]
                self.query_i = startup["query_i"]
                self.query_i_classified = startup["query_i_classified"]
            # From scratch
            else:
                state.set_labels(self.y)
                state.settings = self.settings
                self.classify(start_idx,
                              self.y[start_idx],
                              state,
                              method="initial")
                self.query_i_classified = len(start_idx)

            # Try to retrieve feature matrix from the state file.
            try:
                self.X = state.get_feature_matrix(as_data.hash())
            except KeyError:
                self.X = feature_model.fit_transform(as_data.texts,
                                                     as_data.headings,
                                                     as_data.bodies,
                                                     as_data.keywords)
                state._add_as_data(as_data, feature_matrix=self.X)
            if self.X.shape[0] != len(self.y):
                raise ValueError("The state file does not correspond to the "
                                 "given data file, please use another state "
                                 "file or dataset.")
            self.load_current_query(state)
Example #10
def train_model(project_id, label_method=None):
    """Add the new labels to the review and do the modeling.

    It uses a lock to ensure only one model is running at the same time.
    Old results directories are deleted after 4 iterations.

    It has one argument on the CLI, which is the base project directory.
    """

    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info("Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing and
        # reading to the same files at the same time.
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:
            # Get all labels since the last run. If there are no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        try:
            del asr_kwargs["abstract_only"]
        except KeyError:
            pass

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                "Project {project_id} - No new labels since last run.")
            return

        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and output the proba's
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file,
                blocking=True,
                lock_name="active",
                project_id=project_id) as lock:

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_ind
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
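
Compared with example #5, this version keeps the pool in record-id space and
drops to row indices only for the intersection, so the model's ranking order
is preserved. A self-contained sketch with hypothetical dicts standing in for
the convert_id_to_idx / convert_idx_to_id helpers:

record_ids = [101, 102, 103, 104]                 # stable ids, in row order
id_to_idx = {rid: i for i, rid in enumerate(record_ids)}
idx_to_id = {i: rid for rid, i in id_to_idx.items()}

new_query_idx = [2, 0, 3, 1]   # model's ranking, best first (row indices)
current_pool = [101, 103]      # record ids still unlabeled

current_pool_idx = frozenset(id_to_idx[rid] for rid in current_pool)
new_pool_idx = [i for i in new_query_idx if i in current_pool_idx]
new_pool = [idx_to_id[i] for i in new_pool_idx]
print(new_pool)  # [103, 101] -- pool stays sorted by query rank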