def probability_matrix_from_h5_state(state_fp):
    """Build the per-query probability matrix stored in an .h5 state file.

    Arguments
    ---------
    state_fp: str
        Path to state file.

    Returns
    -------
    pandas.DataFrame:
        A dataframe of shape (num_papers, num_queries), with in (i,j) the
        probability that paper i was relevant according to the model at
        query j. Note that the row index starts at 0, but the column index
        starts at 1.
    """
    with open_state(state_fp, read_only=True) as state:
        # The group names under 'results' are the query numbers.
        last_query = max(int(key) for key in state.f['results'].keys())

        # The range starts at 1, so 'results/0' is never read. Every dataset
        # is sliced with [:] while the state file is still open.
        columns = {
            query: state.f[f'results/{query}/proba'][:]
            for query in range(1, last_query + 1)
        }

    return pd.DataFrame.from_dict(columns)
def create_time_list(state_fp):
    """Create a list of creation times from a state file.

    Arguments
    ---------
    state_fp: str
        Path to state file

    Returns
    -------
    list
        List of creation time of each query in the state_file results,
        parsed with the format '%Y-%m-%d %H:%M:%S.%f'.
        Note that the first entry is the creation time of the 0'th query,
        not the 'start_time' of the state file. Similarly, the last entry is
        the creation time of the last query, not the 'end_time' of the state
        file.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    time_list = []

    # Nothing is written here, so open the state read-only.
    with open_state(state_fp, read_only=True) as state:
        # Queries are looked up by number (results/0, results/1, ...) so the
        # list is in query order rather than in key-iteration order.
        for i in range(len(state.f['results'])):
            raw_time = state.f[f'results/{i}'].attrs['creation_time']
            # The attribute may come back as bytes or as str; only bytes
            # need decoding before parsing.
            if isinstance(raw_time, bytes):
                raw_time = raw_time.decode('UTF-8')
            time_list.append(datetime.strptime(raw_time, time_format))

    return time_list
def review(self, *args, **kwargs):
    """Run the systematic review and write its results to the state file.

    The state file at ``self.state_file`` is opened for the duration of the
    review; all positional and keyword arguments are forwarded unchanged to
    ``self._do_review``.

    Arguments
    ---------
    stop_after_class: bool
        When to stop; if True stop after classification step, otherwise
        stop after training step.
    instant_save: bool
        If True, save results after each single classification.
    """
    state_context = open_state(self.state_file)
    with state_context as state:
        self._do_review(state, *args, **kwargs)
def get_reviewer(dataset,
                 mode="simulate",
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_idx=None,
                 prior_record_id=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 state_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 seed=None,
                 included_dataset=[],
                 excluded_dataset=[],
                 prior_dataset=[],
                 new=False,
                 **kwargs):
    """Construct a reviewer object from command-line style arguments.

    See __main__.py for a description of the arguments.

    The steps are: load the data, assemble the settings (from the arguments,
    the config file and, when given, the state file), build the four models
    and finally instantiate the review class that belongs to ``mode``.
    """
    as_data = create_as_data(dataset, included_dataset, excluded_dataset,
                             prior_dataset, new=new)
    if len(as_data) == 0:
        raise ValueError("Supply at least one dataset"
                         " with at least one record.")

    cli_settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_papers=n_papers,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    feature_extraction=feature_extraction,
                                    mode=mode,
                                    data_fp=None)
    cli_settings.from_file(config_file)

    # Without a state file the CLI settings are used directly. With one, an
    # empty state is seeded with the CLI settings and the settings are then
    # read back from the state.
    if state_file is None:
        settings = cli_settings
    else:
        with open_state(state_file) as state:
            if state.is_empty():
                state.settings = cli_settings
            settings = state.settings

    # Arguments that were given explicitly override the loaded settings.
    explicit_overrides = (
        ("n_queries", n_queries),
        ("n_papers", n_papers),
        ("model_param", model_param),
        ("query_param", query_param),
        ("balance_param", balance_param),
        ("feature_param", feature_param),
    )
    for setting_name, value in explicit_overrides:
        if value is not None:
            setattr(settings, setting_name, value)

    # Reject unknown modes before any model is built.
    if mode not in AVAILABLE_REVIEW_CLASSES:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.info(f"Start review in '{mode}' mode.")

    logging.debug(settings)

    # All models share a single random state derived from the seed.
    random_state = get_random_state(seed)
    train_model = get_classifier(settings.model,
                                 **settings.model_param,
                                 random_state=random_state)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param,
                                  random_state=random_state)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param,
                                      random_state=random_state)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param,
                                      random_state=random_state)

    # Classifiers whose name starts with "lstm-" get an embedding matrix.
    if train_model.name.startswith("lstm-"):
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            as_data.texts, embedding_fp)

    # Prior knowledge may be given as indices or as record ids, not both.
    has_prior_ids = prior_record_id is not None and len(prior_record_id) > 0
    if has_prior_ids and prior_idx is not None and len(prior_idx) > 0:
        raise ValueError(
            "Not possible to provide both prior_idx and prior_record_id")
    if has_prior_ids:
        prior_idx = convert_id_to_idx(as_data, prior_record_id)

    # Keyword arguments that both review classes receive.
    shared_kwargs = dict(model=train_model,
                         query_model=query_model,
                         balance_model=balance_model,
                         feature_model=feature_model,
                         n_papers=settings.n_papers,
                         n_instances=settings.n_instances,
                         n_queries=settings.n_queries,
                         state_file=state_file)

    if mode == "simulate":
        return ReviewSimulate(as_data,
                              prior_idx=prior_idx,
                              n_prior_included=settings.n_prior_included,
                              n_prior_excluded=settings.n_prior_excluded,
                              **shared_kwargs,
                              **kwargs)
    if mode == "minimal":
        return MinimalReview(as_data, **shared_kwargs, **kwargs)

    raise ValueError("Error finding mode, should never come here...")
def train_model(project_id, label_method=None):
    """Add the new labels to the review and retrain the model.

    A non-blocking "training" lock ensures that only one training run is
    active at a time: if the lock cannot be acquired the function logs this
    and returns. The blocking "active" lock is held only while the label
    history is read and, at the end, while the pool and the probabilities
    are written.

    Arguments
    ---------
    project_id:
        Identifier of the project; used to locate the project's files and
        locks and included in the log messages.
    label_method:
        Passed as ``method`` to ``reviewer.classify`` for the new labels.

    Returns
    -------
    None
        Returns early when the training lock is taken or when there are no
        new labels since the last run.
    """
    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(lock_file, blocking=False, lock_name="training",
                    project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id):
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)
        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp,
                                mode="minimal",
                                **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = get_label_train_history(state)

        diff_history = get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        # Each history entry is indexed as (index, inclusion).
        query_idx = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(query_idx, inclusions, state,
                              method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)
            proba = state.pred_proba.tolist()

        with SQLiteLock(lock_file, blocking=True, lock_name="active",
                        project_id=project_id):
            # Keep only queried records that are still in the stored pool,
            # in the order the query returned them.
            current_pool = read_pool(project_id)
            in_current_pool = np.zeros(len(as_data))
            in_current_pool[current_pool] = 1
            new_pool = [x for x in new_query_idx if in_current_pool[x]]
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data:
        Data object; its labels, texts, headings, bodies, keywords and
        hash are used here.
    model:
        Classifier; defaults to NaiveBayesClassifier().
    query_model:
        Query model; defaults to MaxQuery().
    balance_model:
        Balance model; defaults to SimpleBalance().
    feature_model:
        Feature extraction model; defaults to Tfidf().
    n_papers:
        Stored on the instance as ``self.n_papers``.
    n_instances:
        Stored on the instance as ``self.n_instances``.
    n_queries:
        Stored on the instance as ``self.n_queries``.
    start_idx:
        Indices that are classified with method "initial" if they are not
        yet part of the training set stored in the state.
        NOTE: the default list is shared between calls and is stored on
        the instance as-is; never mutate it in place.
    state_file:
        Path to the state file.
    log_file:
        Deprecated alias of state_file. When given, a FutureWarning is
        emitted and it takes precedence over state_file.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NaiveBayesClassifier()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object shared by the classifier, query and balance
    # models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    # The builtin int is used because the np.int alias no longer exists in
    # current NumPy releases.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                # Re-read so the values include the new classifications.
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            # Not stored yet: compute it and add it to the state.
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
        verbose=1,
        data_fp=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data:
        Data object; its labels, texts, headings, bodies, keywords and
        hash are used here. If its labels are None, every record gets the
        label LABEL_NA.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
        Defaults to NBModel().
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
        Defaults to MaxQuery().
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers. Defaults to SimpleBalance().
    feature_model:
        Model whose fit_transform produces the feature matrix when the
        state file does not contain one. Defaults to Tfidf().
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx:
        Indices that are classified with method "initial" if they are not
        yet part of the training set stored in the state.
        NOTE: the default list is shared between calls and is stored on
        the instance as-is; never mutate it in place.
    state_file: str
        Path to state file. Replaces log_file argument.
    log_file: str
        Deprecated alias of state_file. When given, a FutureWarning is
        emitted and it takes precedence over state_file.
    verbose:
        Stored on the instance as ``self.verbose``.
    data_fp:
        Stored on the instance as ``self.data_fp``.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object shared by the classifier, query and balance
    # models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file
    self.verbose = verbose

    self.query_i = 0
    self.query_i_classified = 0
    # The builtin int is used because the np.int alias no longer exists in
    # current NumPy releases.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False
    self.data_fp = data_fp

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        if not state.is_empty():
            # Existing state: resume from the stored values.
            startup = state.startup_vals()
            # Classify start indices that are not in the training set yet.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                # Re-read so the values include the new classifications.
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        else:
            # Empty state: store labels and settings, then classify the
            # start indices.
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Use the feature matrix from the state file when it has one for
        # this dataset; otherwise compute it and add it to the state.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)
def open_logger(*args, **kwargs):
    """Deprecated alias of open_state.

    Emits a FutureWarning and then forwards all positional and keyword
    arguments unchanged to open_state, returning its result.
    """
    deprecation_message = "open_logger will be replaced by open_state."
    warnings.warn(deprecation_message, category=FutureWarning)
    state_context = open_state(*args, **kwargs)
    return state_context
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=[],
        state_file=None,
        log_file=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data: asreview.ASReviewData
        The data object which contains the text, labels, etc.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
        Defaults to NBModel().
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
        Defaults to MaxQuery().
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers. Defaults to SimpleBalance().
    feature_model: BaseFeatureModel
        Feature extraction model that converts texts and keywords to
        feature matrices. Defaults to Tfidf().
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx: numpy.array
        Start the simulation/review with these indices. They are assumed to
        be already labeled. Failing to do so might result in bad
        behaviour.
        NOTE: the default list is shared between calls and is stored on
        the instance as-is; never mutate it in place.
    state_file: str
        Path to state file. Replaces log_file argument.
    log_file: str
        Deprecated alias of state_file. When given, a FutureWarning is
        emitted and it takes precedence over state_file.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)
    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object shared by the classifier, query and balance
    # models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    # The builtin int is used because the np.int alias no longer exists in
    # current NumPy releases.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                # Re-read so the values include the new classifications.
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            # Not stored yet: compute it and add it to the state.
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)
def train_model(project_id, label_method=None):
    """Add the new labels to the review and retrain the model.

    A non-blocking "training" lock ensures that only one training run is
    active at a time: if the lock cannot be acquired the function logs this
    and returns. The blocking "active" lock is held only while the label
    history is read and, at the end, while the pool and the probabilities
    are written.

    Arguments
    ---------
    project_id:
        Identifier of the project; used to locate the project's files and
        locks and included in the log messages.
    label_method:
        Passed as ``method`` to ``reviewer.classify`` for the new labels.

    Returns
    -------
    None
        Returns early when the training lock is taken or when there are no
        new labels since the last run.
    """
    logging.info(f"Project {project_id} - Train a new model for project")

    # get file locations
    asr_kwargs_file = get_kwargs_path(project_id)
    lock_file = get_lock_path(project_id)

    # Lock so that only one training run is running at the same time.
    # It doesn't lock the flask server/client.
    with SQLiteLock(
            lock_file, blocking=False, lock_name="training",
            project_id=project_id) as lock:

        # If the lock is not acquired, another training instance is running.
        if not lock.locked():
            logging.info(f"Project {project_id} - "
                         "Cannot acquire lock, other instance running.")
            return

        # Lock the current state. We want to have a consistent active state.
        # This does communicate with the flask backend; it prevents writing
        # and reading to the same files at the same time.
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):
            # Get the all labels since last run. If no new labels, quit.
            new_label_history = read_label_history(project_id)

        data_fp = str(get_data_file_path(project_id))
        as_data = read_data(project_id)
        state_file = get_state_path(project_id)

        # collect command line arguments and pass them to the reviewer
        with open(asr_kwargs_file, "r") as fp:
            asr_kwargs = json.load(fp)

        # "abstract_only" is removed, if present, before the kwargs are
        # forwarded to get_reviewer.
        asr_kwargs.pop("abstract_only", None)

        asr_kwargs['state_file'] = str(state_file)
        reviewer = get_reviewer(dataset=data_fp, mode="minimal", **asr_kwargs)

        with open_state(state_file) as state:
            old_label_history = _get_label_train_history(state)

        diff_history = _get_diff_history(new_label_history, old_label_history)

        if len(diff_history) == 0:
            logging.info(
                f"Project {project_id} - No new labels since last run.")
            return

        # Each history entry is indexed as (record_id, inclusion).
        query_record_ids = np.array([x[0] for x in diff_history], dtype=int)
        inclusions = np.array([x[1] for x in diff_history], dtype=int)

        # The reviewer works with row indices, not record ids.
        query_idx = convert_id_to_idx(as_data, query_record_ids)

        # Classify the new labels, train and store the results.
        with open_state(state_file) as state:
            reviewer.classify(
                query_idx, inclusions, state, method=label_method)
            reviewer.train()
            reviewer.log_probabilities(state)
            new_query_idx = reviewer.query(reviewer.n_pool()).tolist()
            reviewer.log_current_query(state)

            # write the proba to a pandas dataframe with record_ids as index
            proba = pd.DataFrame(
                {"proba": state.pred_proba.tolist()},
                index=pd.Index(as_data.record_ids, name="record_id")
            )

        # update the pool and output the proba's
        # important: pool is sorted on query
        with SQLiteLock(
                lock_file, blocking=True, lock_name="active",
                project_id=project_id):

            # read the pool
            current_pool = read_pool(project_id)

            # diff pool and new_query_idx; the frozenset gives constant-time
            # membership tests while the query order is preserved
            current_pool_idx = convert_id_to_idx(as_data, current_pool)
            current_pool_idx = frozenset(current_pool_idx)
            new_pool_idx = [x for x in new_query_idx if x in current_pool_idx]

            # convert new_pool_idx back to record_ids
            new_pool = convert_idx_to_id(as_data, new_pool_idx)

            # write the pool and proba
            write_pool(project_id, new_pool)
            write_proba(project_id, proba)