def __init__(self, strategy_1="max", strategy_2="random", mix_ratio=0.95,
             random_state=None, **kwargs):
    """Initialize the Mixed query strategy.

    Arguments
    ---------
    strategy_1: str
        Name of the first query strategy.
    strategy_2: str
        Name of the second query strategy.
    mix_ratio: float
        Stored as ``self.mix_ratio``; not otherwise used here.
    random_state: int, RandomState
        Handed to ``get_random_state``; the resulting object is shared
        with every sub-strategy that lists "random_state" in its
        ``default_param``.
    **kwargs: dict
        Keyword arguments for the two strategies. A key starting with the
        name of a strategy is forwarded to that strategy, with the name
        plus one separator character stripped (e.g. 'max_x' -> 'x').
        Keys matching neither strategy are ignored with a warning.
    """
    super(MixedQuery, self).__init__()

    # Split the keyword arguments per sub-strategy based on their prefix.
    kwargs_1 = {}
    kwargs_2 = {}
    for key, value in kwargs.items():
        if key.startswith(strategy_1):
            kwargs_1[key[len(strategy_1) + 1:]] = value
        elif key.startswith(strategy_2):
            kwargs_2[key[len(strategy_2) + 1:]] = value
        else:
            logging.warning(
                f"Key {key} is being ignored for the mixed "
                f"({strategy_1}, {strategy_2}) query strategy.")

    self.strategy_1 = strategy_1
    self.strategy_2 = strategy_2

    # Build each model once without a random state so that its
    # default_param can be inspected below.
    self.query_model1 = get_query_model(strategy_1, **kwargs_1)
    self.query_model2 = get_query_model(strategy_2, **kwargs_2)

    self._random_state = get_random_state(random_state)

    # Rebuild only the models that accept a random state, so that they
    # share this strategy's random state object.
    if "random_state" in self.query_model1.default_param:
        self.query_model1 = get_query_model(
            strategy_1, **kwargs_1, random_state=self._random_state)
    if "random_state" in self.query_model2.default_param:
        self.query_model2 = get_query_model(
            strategy_2, **kwargs_2, random_state=self._random_state)

    self.mix_ratio = mix_ratio
def __init__(self, cluster_size=350, update_interval=200, random_state=None,
             **kwargs):
    """Set up the clustering query strategy.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the size of the clusters is
        smaller than the size of the pool, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        Seed or generator handed to ``get_random_state``.
    **kwargs: dict
        Accepted but not used by this constructor.
    """
    super(ClusterQuery, self).__init__()

    # Collaborators are created first, then everything is stored.
    max_sampler = MaxQuery()
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = max_sampler
    self.last_update = None
    self.update_interval = update_interval
    self.cluster_size = cluster_size
def sample_prior_knowledge(labels, n_prior_included=10, n_prior_excluded=10,
                           random_state=None):
    """Randomly pick prelabelled records to use as prior knowledge.

    Arguments
    ---------
    labels: np.ndarray
        Label array; entries equal to 1 are treated as included and
        entries equal to 0 as excluded. Only the first-axis indices of
        the matching entries are used.
    n_prior_included: int
        The number of included records to sample.
    n_prior_excluded: int
        The number of excluded records to sample.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    np.ndarray:
        The sampled included indices followed by the sampled excluded
        indices.

    Raises
    ------
    ValueError
        If more included or excluded records are requested than are
        available in ``labels``.
    """
    rng = get_random_state(random_state)

    # One row per group: (name, candidate indices, number requested).
    # The included group comes first; the order matters for the generator.
    groups = (
        ("included", np.where(labels == 1)[0], n_prior_included),
        ("excluded", np.where(labels == 0)[0], n_prior_excluded),
    )

    # Check both groups before anything is drawn from the generator.
    for kind, candidates, n_requested in groups:
        if len(candidates) < n_requested:
            raise ValueError(
                f"Number of {kind} priors requested ({n_requested})"
                f" is bigger than number of {kind} papers "
                f"({len(candidates)}).")

    # Sample without replacement, included first and excluded second.
    included_sample, excluded_sample = (
        rng.choice(candidates, n_requested, replace=False)
        for _, candidates, n_requested in groups
    )
    return np.append(included_sample, excluded_sample)
def __init__(self, ratio=1.0, random_state=None):
    """Set up the undersampling balance strategy.

    Arguments
    ---------
    ratio: double
        Undersampling ratio of the zero's. With a ratio of 0.25, for
        example, only a quarter of the zeros and all of the ones would be
        sampled.
    random_state: int, RandomState
        Seed or generator handed to ``get_random_state``.
    """
    super(UndersampleBalance, self).__init__()

    rng = get_random_state(random_state)
    self._random_state = rng
    self.ratio = ratio
def __init__(self, *args, embedding_fp=None, random_state=None, **kwargs):
    """Set up the Embedding-Idf model.

    Arguments
    ---------
    *args, **kwargs:
        Forwarded unchanged to the parent class constructor.
    embedding_fp: str
        Path to embedding.
    random_state: int, RandomState
        Seed or generator handed to ``get_random_state``.
    """
    super(EmbeddingIdf, self).__init__(*args, **kwargs)

    rng = get_random_state(random_state)
    self._random_state = rng

    # Only the path is stored here; the embedding attribute starts empty.
    self.embedding = None
    self.embedding_fp = embedding_fp
def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0,
             random_state=None):
    """Set up the double balance strategy.

    Arguments
    ---------
    a, alpha, b, beta: float
        Weighting parameters, stored unchanged on the instance.
    random_state: int, RandomState
        Seed or generator handed to ``get_random_state``.
    """
    super(DoubleBalance, self).__init__()

    # Collaborators are created first, then everything is stored.
    simple_fallback = SimpleBalance()
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = simple_fallback
    self.beta = beta
    self.b = b
    self.alpha = alpha
    self.a = a
def __init__(self, cluster_size=350, update_interval=200, random_state=None):
    """Set up the clustering query strategy.

    All arguments are stored on the instance; ``random_state`` is first
    passed through ``get_random_state``. A ``MaxQuery`` instance is kept
    as ``fallback_model``.
    """
    super(ClusterQuery, self).__init__()

    max_sampler = MaxQuery()
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = max_sampler
    self.last_update = None
    self.update_interval = update_interval
    self.cluster_size = cluster_size
def __init__(self, strategy_1="max", strategy_2="random", mix_ratio=0.95,
             random_state=None, **kwargs):
    """Initialize the Mixed query strategy

    Arguments
    ---------
    strategy_1: str
        Name of the first query strategy.
    strategy_2: str
        Name of the second query strategy.
    mix_ratio: float
        Portion of queries done by the first strategy. So a mix_ratio of
        0.95 means that 95% of the time query strategy 1 is used and 5% of
        the time query strategy 2.
    random_state: int, RandomState
        Handed to ``get_random_state``; the resulting object is shared
        with every sub-strategy that lists "random_state" in its
        ``default_param``.
    **kwargs: dict
        Keyword arguments for the two strategy. To specify which of the
        strategies the argument is for, prepend with the name of the query
        strategy and an underscore, e.g. 'max_' for maximal sampling.
        Keys matching neither strategy are ignored with a warning.
    """
    super(MixedQuery, self).__init__()

    # Split the keyword arguments per sub-strategy based on their prefix;
    # the strategy name plus one separator character is stripped.
    kwargs_1 = {}
    kwargs_2 = {}
    for key, value in kwargs.items():
        if key.startswith(strategy_1):
            kwargs_1[key[len(strategy_1) + 1:]] = value
        elif key.startswith(strategy_2):
            kwargs_2[key[len(strategy_2) + 1:]] = value
        else:
            logging.warning(
                f"Key {key} is being ignored for the mixed "
                f"({strategy_1}, {strategy_2}) query strategy.")

    self.strategy_1 = strategy_1
    self.strategy_2 = strategy_2

    # Build each model once without a random state so that its
    # default_param can be inspected below.
    self.query_model1 = get_query_model(strategy_1, **kwargs_1)
    self.query_model2 = get_query_model(strategy_2, **kwargs_2)

    self._random_state = get_random_state(random_state)

    # Rebuild only the models that accept a random state, so that they
    # share this strategy's random state object.
    if "random_state" in self.query_model1.default_param:
        self.query_model1 = get_query_model(
            strategy_1, **kwargs_1, random_state=self._random_state)
    if "random_state" in self.query_model2.default_param:
        self.query_model2 = get_query_model(
            strategy_2, **kwargs_2, random_state=self._random_state)

    self.mix_ratio = mix_ratio
def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0, c=0.835,
             gamma=2.0, shuffle=True, random_state=None):
    """Set up the triple balance strategy.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more
        1's in your training sample.
    alpha: float
        Governs the scaling the weight of the 1's, as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values means that larger samples are
        more strongly penalizing zeros.
    c: float
        Value between one and zero that governs the weight of samples done
        with maximal sampling. Higher values mean higher weight.
    gamma: float
        Governs the scaling of the weight of the max samples as a function
        of the % of papers read. Higher values mean stronger scaling.
    shuffle: bool
        Stored on the instance; not used by this constructor.
    random_state: int, RandomState
        Passed unchanged to the ``DoubleBalance`` fallback model and,
        through ``get_random_state``, stored on this instance.
    """
    super(TripleBalance, self).__init__()

    # a, alpha, b and beta are shared with the fallback model.
    shared_weights = dict(a=a, alpha=alpha, b=b, beta=beta)
    double_fallback = DoubleBalance(**shared_weights,
                                    random_state=random_state)
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = double_fallback
    self.shuffle = shuffle
    self.gamma = gamma
    self.c = c
    self.beta = beta
    self.b = b
    self.alpha = alpha
    self.a = a
def __init__(self, cluster_size=350, update_interval=200, random_state=None):
    """Set up the clustering query strategy.

    Arguments
    ---------
    cluster_size: int
        Size of the clusters to be made. If the size of the clusters is
        smaller than the size of the pool, fall back to max sampling.
    update_interval: int
        Update the clustering every x instances.
    random_state: int, RandomState
        State/seed of the RNG.
    """
    super(ClusterQuery, self).__init__()

    # Collaborators are created first, then everything is stored.
    max_sampler = MaxQuery()
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = max_sampler
    self.last_update = None
    self.update_interval = update_interval
    self.cluster_size = cluster_size
def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0, c=0.835,
             gamma=2.0, shuffle=True, random_state=None):
    """Set up the triple balance strategy.

    All arguments are stored on the instance. ``a``, ``alpha``, ``b``,
    ``beta`` and the raw ``random_state`` are also handed to the
    ``DoubleBalance`` instance kept as ``fallback_model``.
    """
    super(TripleBalance, self).__init__()

    # a, alpha, b and beta are shared with the fallback model.
    shared_weights = dict(a=a, alpha=alpha, b=b, beta=beta)
    double_fallback = DoubleBalance(**shared_weights,
                                    random_state=random_state)
    rng = get_random_state(random_state)

    self._random_state = rng
    self.fallback_model = double_fallback
    self.shuffle = shuffle
    self.gamma = gamma
    self.c = c
    self.beta = beta
    self.b = b
    self.alpha = alpha
    self.a = a
def get_reviewer(dataset,
                 mode="simulate",
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_idx=None,
                 prior_record_id=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 state_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 seed=None,
                 included_dataset=None,
                 excluded_dataset=None,
                 prior_dataset=None,
                 new=False,
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.

    Notes
    -----
    - ``included_dataset``, ``excluded_dataset`` and ``prior_dataset``
      default to ``None``, which is treated as an empty list.
    - When ``state_file`` is given and the state is not empty, the
      settings stored in the state are used instead of the ones built
      from the arguments. ``n_queries``, ``n_papers`` and the ``*_param``
      arguments still override those settings when they are not None.
    - ``verbose`` is accepted but not used in this function.
    - Remaining ``**kwargs`` are forwarded to the review class.

    Returns
    -------
    ReviewSimulate or MinimalReview:
        The review object for mode "simulate" or "minimal" respectively.

    Raises
    ------
    ValueError
        If no records are supplied, if the mode is unknown, or if both
        ``prior_idx`` and ``prior_record_id`` are non-empty.
    """
    # Use fresh lists per call instead of shared mutable defaults.
    included_dataset = [] if included_dataset is None else included_dataset
    excluded_dataset = [] if excluded_dataset is None else excluded_dataset
    prior_dataset = [] if prior_dataset is None else prior_dataset

    as_data = create_as_data(dataset, included_dataset, excluded_dataset,
                             prior_dataset, new=new)

    if len(as_data) == 0:
        raise ValueError("Supply at least one dataset"
                         " with at least one record.")

    cli_settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_papers=n_papers,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    feature_extraction=feature_extraction,
                                    mode=mode,
                                    data_fp=None)
    cli_settings.from_file(config_file)

    # An empty state is seeded with the settings built above; a non-empty
    # state supplies its own stored settings.
    if state_file is not None:
        with open_state(state_file) as state:
            if state.is_empty():
                state.settings = cli_settings
            settings = state.settings
    else:
        settings = cli_settings

    # Explicitly supplied arguments take precedence over stored settings.
    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers
    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    # Initialize models. The same random state object is handed to all
    # four models.
    random_state = get_random_state(seed)
    train_model = get_classifier(settings.model,
                                 **settings.model_param,
                                 random_state=random_state)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param,
                                  random_state=random_state)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param,
                                      random_state=random_state)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param,
                                      random_state=random_state)

    # LSTM models need embedding matrices.
    if train_model.name.startswith("lstm-"):
        texts = as_data.texts
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # prior knowledge
    if prior_idx is not None and prior_record_id is not None and \
            len(prior_idx) > 0 and len(prior_record_id) > 0:
        raise ValueError(
            "Not possible to provide both prior_idx and prior_record_id")
    if prior_record_id is not None and len(prior_record_id) > 0:
        prior_idx = convert_id_to_idx(as_data, prior_record_id)

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(as_data,
                                  model=train_model,
                                  query_model=query_model,
                                  balance_model=balance_model,
                                  feature_model=feature_model,
                                  n_papers=settings.n_papers,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  prior_idx=prior_idx,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  state_file=state_file,
                                  **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(as_data,
                                 model=train_model,
                                 query_model=query_model,
                                 balance_model=balance_model,
                                 feature_model=feature_model,
                                 n_papers=settings.n_papers,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 state_file=state_file,
                                 **kwargs)
    else:
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
def __init__(self, random_state=None):
    """Set up the random query strategy.

    Arguments
    ---------
    random_state: int, RandomState
        Seed or generator handed to ``get_random_state``; the result is
        stored on the instance.
    """
    super(RandomQuery, self).__init__()

    rng = get_random_state(random_state)
    self._random_state = rng
def __init__(self, ratio=1.0, random_state=None):
    """Set up the undersampling balance strategy.

    ``ratio`` is stored unchanged; ``random_state`` is passed through
    ``get_random_state`` before being stored.
    """
    super(UndersampleBalance, self).__init__()

    rng = get_random_state(random_state)
    self._random_state = rng
    self.ratio = ratio