def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0):
    """Store the balancing hyperparameters and create the fallback model.

    Arguments
    ---------
    a: float
        Stored as ``self.a``.
    alpha: float
        Stored as ``self.alpha``.
    b: float
        Stored as ``self.b``.
    beta: float
        Stored as ``self.beta``.
    """
    super(DoubleBalance, self).__init__()

    # The four hyperparameters are kept on the instance unchanged.
    self.a, self.alpha = a, alpha
    self.b, self.beta = b, beta

    # A SimpleBalance instance is kept as the fallback strategy.
    self.fallback_model = SimpleBalance()
def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0,
             random_state=None):
    """Store the balancing hyperparameters, fallback model and RNG.

    Arguments
    ---------
    a: float
        Stored as ``self.a``.
    alpha: float
        Stored as ``self.alpha``.
    b: float
        Stored as ``self.b``.
    beta: float
        Stored as ``self.beta``.
    random_state:
        Passed to ``get_random_state``; the result is stored as
        ``self._random_state``.
    """
    super(DoubleBalance, self).__init__()

    # The four hyperparameters are kept on the instance unchanged.
    self.a, self.alpha = a, alpha
    self.b, self.beta = b, beta

    # A SimpleBalance instance is kept as the fallback strategy.
    self.fallback_model = SimpleBalance()

    # Normalise whatever the caller supplied through the project helper.
    self._random_state = get_random_state(random_state)
def __init__(
        self,
        X,
        y=None,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        prior_included=None,
        prior_excluded=None,
        log_file=None,
        final_labels=None,
        verbose=1,
        data_fp=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    X: np.array
        The feature matrix for the current dataset.
    y: np.array
        Labels of each paper, 1 for included, 0 for excluded.
        Can be set to None, to indicate inclusion data is not available.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
        Defaults to a Naive Bayes model.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
        Defaults to MaxQuery.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers. Defaults to SimpleBalance.
    feature_model:
        Feature model; required, there is no default.
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    prior_included: list
        List of papers (ids) that are included a priori.
        None (the default) means an empty list.
    prior_excluded: list
        List of papers (ids) that are excluded a priori.
        None (the default) means an empty list.
    log_file: str
        Path to log file.
    final_labels: np.array
        Final labels if we're using a two step inclusion process.
        For example, if at one step a paper is considered after reading the
        abstract and then at the second step, a final decision is made on
        the basis of the full text.
    verbose: int
        Stored as ``self.verbose``.
    data_fp:
        Stored as ``self.data_fp``.

    Raises
    ------
    ValueError
        If no feature model is supplied, or if a non-empty log file holds
        a different number of labels than X has rows.
    """
    super(BaseReview, self).__init__()

    # Use fresh lists per instance instead of shared mutable defaults.
    if prior_included is None:
        prior_included = []
    if prior_excluded is None:
        prior_excluded = []

    self.X = X
    self.y = y
    if y is None:
        self.y = np.full(X.shape[0], NOT_AVAILABLE)
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype.
    self.y = np.array(self.y, dtype=int)

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        raise ValueError("Supply feature model!")

    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object is shared by the three models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.log_file = log_file
    self.verbose = verbose

    self.prior_included = prior_included
    self.prior_excluded = prior_excluded

    self.query_i = 0
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False
    self.data_fp = data_fp

    with open_logger(log_file) as logger:
        if not logger.is_empty():
            # Continue from the state stored in the log file.
            y, train_idx, query_src, query_i = logger.review_state()
            if X.shape[0] != len(y):
                raise ValueError("The log file does not correspond to the "
                                 "given data file, please use another log "
                                 "file or dataset.")
            self.y = y
            self.train_idx = train_idx
            self.shared["query_src"] = query_src
            self.query_i = query_i
        else:
            # Start from scratch: initialize the log file.
            if final_labels is not None:
                logger.set_final_labels(final_labels)
            logger.set_labels(self.y)
            logger.add_settings(self.settings)
            self._prior_knowledge(logger)
            self.query_i = 0
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=None,
        state_file=None,
        log_file=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data:
        Data object; this method reads its ``labels``, ``texts``,
        ``headings``, ``bodies`` and ``keywords`` attributes and calls
        ``len()`` and ``hash()`` on it.
    model:
        Model to fit the data. Defaults to NBModel().
    query_model:
        Model to query new instances. Defaults to MaxQuery().
    balance_model:
        Model to rebalance the training data. Defaults to SimpleBalance().
    feature_model:
        Model used to build the feature matrix when the state file does
        not hold one for this dataset. Defaults to Tfidf().
    n_papers:
        Stored as ``self.n_papers``.
    n_instances:
        Stored as ``self.n_instances``.
    n_queries:
        Stored as ``self.n_queries``.
    start_idx:
        Indices to classify with method "initial" at start-up.
        None (the default) means no start indices.
    state_file: str
        Path to state file.
    log_file: str
        Deprecated alias of state_file; when given it takes precedence
        and a FutureWarning is issued.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Use a fresh list per instance instead of a shared mutable default.
    if start_idx is None:
        start_idx = []

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)

    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object is shared by the three models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)
class DoubleBalance(BaseBalance):
    """Class for the double balance strategy.

    Class to get the two way rebalancing function and arguments.
    It super samples ones depending on the number of 0's and total number
    of samples in the training data.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling of the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values means that larger samples are more
        strongly penalizing zeros.
    """

    name = "double"

    def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0):
        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        # Used by sample() when the training data holds a single class.
        self.fallback_model = SimpleBalance()

    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X:
            Feature matrix, indexed by the sampled indices.
        y:
            Labels, indexed by the training indices; entries equal to 1 are
            treated as ones and entries equal to 0 as zeros.
        train_idx:
            Training indices.
        shared:
            Forwarded to the fallback model only.

        Returns
        -------
        X[all_idx], y[all_idx] for the resampled and shuffled indices, or
        whatever the fallback model returns when the training data contains
        only ones or only zeros.
        """
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_idx = train_idx[np.where(y[train_idx] == 0)]

        # Fall back to simple sampling if we have only ones or zeros.
        # The result has to be returned: the weighting below needs at least
        # one sample of each class.
        if len(one_idx) == 0 or len(zero_idx) == 0:
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero = len(zero_idx)
        n_train = n_one + n_zero

        # Compute the weights.
        one_weight = _one_weight(n_one, n_zero, self.a, self.alpha)
        zero_weight = _zero_weight(n_one + n_zero, self.b, self.beta)
        tot_zo_weight = one_weight * n_one + zero_weight * n_zero

        # Number of ones to sample, clamped to the range [1, n_train - 2]
        # (the lower bound wins when the range is empty).
        n_one_train = random_round(
            one_weight * n_one * n_train / tot_zo_weight)
        n_one_train = max(1, min(n_train - 2, n_one_train))
        n_zero_train = n_train - n_one_train

        # Get random ones and zeros.
        one_train_idx = fill_training(one_idx, n_one_train)
        zero_train_idx = fill_training(zero_idx, n_zero_train)

        all_idx = np.concatenate([one_train_idx, zero_train_idx])
        np.random.shuffle(all_idx)

        return X[all_idx], y[all_idx]

    def full_hyper_space(self):
        """Return the hyperopt search space and an empty dictionary."""
        from hyperopt import hp
        parameter_space = {
            "bal_a": hp.lognormal("bal_a", 0, 1),
            "bal_alpha": hp.uniform("bal_alpha", 0, 2),
            "bal_b": hp.uniform("bal_b", 0, 1),
            # "bal_beta": hp.uniform("bal_beta", 0, 2),
        }
        return parameter_space, {}
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=None,
        state_file=None,
        log_file=None,
        verbose=1,
        data_fp=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data:
        Data object; this method reads its ``labels``, ``texts``,
        ``headings``, ``bodies`` and ``keywords`` attributes and calls
        ``len()`` and ``hash()`` on it.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
        Defaults to a Naive Bayes model.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
        Defaults to MaxQuery.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers. Defaults to SimpleBalance.
    feature_model:
        Model used to build the feature matrix when the state file does
        not hold one for this dataset. Defaults to Tfidf().
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx:
        Indices to classify with method "initial" at start-up.
        None (the default) means no start indices.
    state_file: str
        Path to state file. Replaces log_file argument.
    log_file: str
        Deprecated alias of state_file; when given it takes precedence
        and a FutureWarning is issued.
    verbose: int
        Stored as ``self.verbose``.
    data_fp:
        Stored as ``self.data_fp``.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Use a fresh list per instance instead of a shared mutable default.
    if start_idx is None:
        start_idx = []

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)

    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object is shared by the three models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file
    self.verbose = verbose

    self.query_i = 0
    self.query_i_classified = 0
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False
    self.data_fp = data_fp

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        if not state.is_empty():
            startup = state.startup_vals()
            # Classify start indices the stored training set lacks.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Reuse the stored feature matrix when the state file has one for
        # this dataset; otherwise compute it and store it.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)
class DoubleBalance(BaseBalance):
    """Dynamic Resampling balance strategy.

    Class to get the two way rebalancing function and arguments.
    It super samples ones depending on the number of 0's and total number
    of samples in the training data.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling of the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values means that larger samples are more
        strongly penalizing zeros.
    random_state:
        Passed to ``get_random_state``; the resulting object is used for
        rounding, sampling and shuffling in ``sample``.
    """

    name = "double"

    def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0,
                 random_state=None):
        super(DoubleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        # Used by sample() when the training data holds a single class.
        self.fallback_model = SimpleBalance()
        self._random_state = get_random_state(random_state)

    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X: np.array
            Complete feature matrix.
        y: np.array
            Labels for all papers.
        train_idx: np.array
            Training indices, that is all papers that have been reviewed.
        shared: dict
            Dictionary to share data between balancing models and other
            models.

        Returns
        -------
        np.array, np.array:
            X_train, y_train: the resampled matrix, labels. When the
            training data contains only inclusions or only exclusions, the
            result of the fallback model is returned instead.
        """
        # Get inclusions and exclusions
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_idx = train_idx[np.where(y[train_idx] == 0)]

        # Fall back to simple sampling if we have only ones or zeroes.
        # The result has to be returned: the weighting below needs at least
        # one sample of each class.
        if len(one_idx) == 0 or len(zero_idx) == 0:
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero = len(zero_idx)
        n_train = n_one + n_zero

        # Compute sampling weights.
        one_weight = _one_weight(n_one, n_zero, self.a, self.alpha)
        zero_weight = _zero_weight(n_one + n_zero, self.b, self.beta)
        tot_zo_weight = one_weight * n_one + zero_weight * n_zero

        # Number of inclusions to sample.
        n_one_train = random_round(
            one_weight * n_one * n_train / tot_zo_weight,
            self._random_state)
        # Should be at least 1, and at least two spots should be for
        # exclusions.
        n_one_train = max(1, min(n_train - 2, n_one_train))
        # Number of exclusions to sample
        n_zero_train = n_train - n_one_train

        # Sample records of ones and zeroes
        one_train_idx = fill_training(one_idx, n_one_train,
                                      self._random_state)
        zero_train_idx = fill_training(zero_idx, n_zero_train,
                                       self._random_state)

        # Merge and shuffle.
        all_idx = np.concatenate([one_train_idx, zero_train_idx])
        self._random_state.shuffle(all_idx)

        # Return resampled feature matrix and labels.
        return X[all_idx], y[all_idx]

    def full_hyper_space(self):
        """Return the hyperopt search space and an empty dictionary."""
        from hyperopt import hp
        parameter_space = {
            "bal_a": hp.lognormal("bal_a", 0, 1),
            "bal_alpha": hp.uniform("bal_alpha", 0, 2),
            "bal_b": hp.uniform("bal_b", 0, 1),
            # "bal_beta": hp.uniform("bal_beta", 0, 2),
        }
        return parameter_space, {}
def __init__(
        self,
        as_data,
        model=None,
        query_model=None,
        balance_model=None,
        feature_model=None,
        n_papers=None,
        n_instances=DEFAULT_N_INSTANCES,
        n_queries=None,
        start_idx=None,
        state_file=None,
        log_file=None,
):
    """Initialize base class for systematic reviews.

    Arguments
    ---------
    as_data: asreview.ASReviewData
        The data object which contains the text, labels, etc.
    model: BaseModel
        Initialized model to fit the data during active learning.
        See asreview.models.utils.py for possible models.
    query_model: BaseQueryModel
        Initialized model to query new instances for review, such as random
        sampling or max sampling.
        See asreview.query_strategies.utils.py for query models.
    balance_model: BaseBalanceModel
        Initialized model to redistribute the training data during the
        active learning process. They might either resample or undersample
        specific papers.
    feature_model: BaseFeatureModel
        Feature extraction model that converts texts and keywords to
        feature matrices.
    n_papers: int
        Number of papers to review during the active learning process,
        excluding the number of initial priors. To review all papers, set
        n_papers to None.
    n_instances: int
        Number of papers to query at each step in the active learning
        process.
    n_queries: int
        Number of steps/queries to perform. Set to None for no limit.
    start_idx: numpy.array
        Start the simulation/review with these indices. They are assumed to
        be already labeled. Failing to do so might result in bad behaviour.
        None (the default) means no start indices.
    state_file: str
        Path to state file. Replaces log_file argument.
    log_file: str
        Deprecated alias of state_file; when given it takes precedence
        and a FutureWarning is issued.

    Raises
    ------
    ValueError
        If the number of rows of the feature matrix differs from the
        number of labels.
    """
    super(BaseReview, self).__init__()

    # Use a fresh list per instance instead of a shared mutable default.
    if start_idx is None:
        start_idx = []

    # Default to Naive Bayes model
    if model is None:
        model = NBModel()
    if query_model is None:
        query_model = MaxQuery()
    if balance_model is None:
        balance_model = SimpleBalance()
    if feature_model is None:
        feature_model = Tfidf()

    self.as_data = as_data
    self.y = as_data.labels
    if self.y is None:
        self.y = np.full(len(as_data), LABEL_NA)

    self.model = model
    self.balance_model = balance_model
    self.query_model = query_model
    self.feature_model = feature_model

    # One dictionary object is shared by the three models.
    self.shared = {"query_src": {}, "current_queries": {}}
    self.model.shared = self.shared
    self.query_model.shared = self.shared
    self.balance_model.shared = self.shared

    self.n_papers = n_papers
    self.n_instances = n_instances
    self.n_queries = n_queries
    self.start_idx = start_idx

    if log_file is not None:
        warnings.warn(
            "The log_file argument for BaseReview will be"
            " replaced by state_file.",
            category=FutureWarning)
        self.state_file = log_file
    else:
        self.state_file = state_file

    self.query_i = 0
    self.query_i_classified = 0
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype.
    self.train_idx = np.array([], dtype=int)
    self.model_trained = False

    # Restore the state from a file or initialize said file.
    with open_state(self.state_file) as state:
        # From file
        if not state.is_empty():
            startup = state.startup_vals()
            # If there are start indices not in the training add them.
            if not set(startup["train_idx"]) >= set(start_idx):
                new_idx = list(set(start_idx) - set(startup["train_idx"]))
                self.classify(new_idx, self.y[new_idx], state,
                              method="initial")
                startup = state.startup_vals()
            self.train_idx = startup["train_idx"]
            self.y = startup["labels"]
            self.shared["query_src"] = startup["query_src"]
            self.query_i = startup["query_i"]
            self.query_i_classified = startup["query_i_classified"]
        # From scratch
        else:
            state.set_labels(self.y)
            state.settings = self.settings
            self.classify(start_idx, self.y[start_idx], state,
                          method="initial")
            self.query_i_classified = len(start_idx)

        # Try to retrieve feature matrix from the state file.
        try:
            self.X = state.get_feature_matrix(as_data.hash())
        except KeyError:
            self.X = feature_model.fit_transform(as_data.texts,
                                                 as_data.headings,
                                                 as_data.bodies,
                                                 as_data.keywords)
            state._add_as_data(as_data, feature_matrix=self.X)
        if self.X.shape[0] != len(self.y):
            raise ValueError("The state file does not correspond to the "
                             "given data file, please use another state "
                             "file or dataset.")

        self.load_current_query(state)