Beispiel #1
0
    def __init__(self,
                 strategy_1="max",
                 strategy_2="random",
                 mix_ratio=0.95,
                 random_state=None,
                 **kwargs):
        """Initialize the Mixed query strategy.

        Arguments
        ---------
        strategy_1: str
            Name of the first query strategy, resolved via get_query_model.
        strategy_2: str
            Name of the second query strategy, resolved via get_query_model.
        mix_ratio: float
            Stored as `self.mix_ratio`; not used inside the initializer.
        random_state: int, RandomState
            Converted with get_random_state and stored as
            `self._random_state`. The resulting state is also handed to each
            sub-strategy whose `default_param` contains "random_state".
        **kwargs: dict
            Keyword arguments for the two strategies. A key that starts with
            the name of a strategy is forwarded to that strategy with the
            name and the following separator character removed. Keys that
            match neither strategy are ignored with a warning.
        """
        super(MixedQuery, self).__init__()
        kwargs_1 = {}
        kwargs_2 = {}
        for key, value in kwargs.items():
            if key.startswith(strategy_1):
                # "+ 1" also drops the separator that follows the prefix.
                new_key = key[len(strategy_1) + 1:]
                kwargs_1[new_key] = value
            elif key.startswith(strategy_2):
                new_key = key[len(strategy_2) + 1:]
                kwargs_2[new_key] = value
            else:
                logging.warning(
                    f"Key {key} is being ignored for the mixed "
                    f"({strategy_1}, {strategy_2}) query strategy.")

        self.strategy_1 = strategy_1
        self.strategy_2 = strategy_2

        # First build the models without a random state so that their
        # `default_param` can be inspected.
        self.query_model1 = get_query_model(strategy_1, **kwargs_1)
        self.query_model2 = get_query_model(strategy_2, **kwargs_2)

        # Rebuild only those models that accept a random state, sharing a
        # single state object between them.
        self._random_state = get_random_state(random_state)
        if "random_state" in self.query_model1.default_param:
            self.query_model1 = get_query_model(
                strategy_1, **kwargs_1, random_state=self._random_state)
        if "random_state" in self.query_model2.default_param:
            self.query_model2 = get_query_model(
                strategy_2, **kwargs_2, random_state=self._random_state)
        self.mix_ratio = mix_ratio
Beispiel #2
0
    def __init__(self, cluster_size=350, update_interval=200,
                 random_state=None, **kwargs):
        """Set up the clustering query strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the size of the clusters is
            smaller than the size of the pool, fall back to max sampling.
        update_interval: int
            Update the clustering every x instances.
        random_state: int, RandomState
            Seed or generator passed to get_random_state; the result is kept
            as `self._random_state`.
        **kwargs: dict
            Accepted for call compatibility; not used by this initializer.
        """
        super().__init__()
        self.cluster_size, self.update_interval = cluster_size, update_interval
        # NOTE(review): presumably None marks "no clustering done yet".
        self.last_update = None
        # Max sampling is the strategy used as fallback.
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
Beispiel #3
0
def sample_prior_knowledge(labels,
                           n_prior_included=10,
                           n_prior_excluded=10,
                           random_state=None):
    """Draw a random set of prelabelled records.

    Arguments
    ---------
    labels: np.ndarray
        Array of labels. Entries equal to 1 count as included, entries equal
        to 0 as excluded; only the first-axis indices of the matches are used.
        NOTE(review): earlier documentation described this as the 2d output
        of keras.utils.to_categorical -- confirm the expected shape.
    n_prior_included: int
        The number of positive (label 1) indices to draw.
    n_prior_excluded: int
        The number of negative (label 0) indices to draw.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    np.ndarray:
        The sampled included indices followed by the sampled excluded
        indices, each drawn without replacement.

    Raises
    ------
    ValueError
        If more included or excluded priors are requested than are available.
    """
    rng = get_random_state(random_state)

    # Candidate pools for both classes.
    positives = np.where(labels == 1)[0]
    negatives = np.where(labels == 0)[0]

    n_positives = len(positives)
    if n_prior_included > n_positives:
        raise ValueError(
            f"Number of included priors requested ({n_prior_included})"
            f" is bigger than number of included papers "
            f"({n_positives}).")

    n_negatives = len(negatives)
    if n_prior_excluded > n_negatives:
        raise ValueError(
            f"Number of excluded priors requested ({n_prior_excluded})"
            f" is bigger than number of excluded papers "
            f"({n_negatives}).")

    # Draw the included sample first, then the excluded one, so the random
    # stream is consumed in a fixed order.
    drawn_positives = rng.choice(positives, n_prior_included, replace=False)
    drawn_negatives = rng.choice(negatives, n_prior_excluded, replace=False)

    return np.concatenate((drawn_positives, drawn_negatives))
Beispiel #4
0
    def __init__(self, ratio=1.0, random_state=None):
        """Set up the undersampling balance strategy.

        Arguments
        ---------
        ratio: double
            Undersampling ratio of the zero's. If for example we set a ratio of
            0.25, we would sample only a quarter of the zeros and all the ones.
        random_state: int, RandomState
            Seed or generator passed to get_random_state; the result is kept
            as `self._random_state`.
        """
        super().__init__()
        rng = get_random_state(random_state)
        self.ratio = ratio
        self._random_state = rng
Beispiel #5
0
    def __init__(self, *args, embedding_fp=None, random_state=None, **kwargs):
        """Set up the Embedding-Idf model.

        Arguments
        ---------
        *args, **kwargs:
            Forwarded unchanged to the parent class initializer.
        embedding_fp: str
            Path to embedding.
        random_state: int, RandomState
            Seed or generator passed to get_random_state; the result is kept
            as `self._random_state`.
        """
        super().__init__(*args, **kwargs)
        # The embedding attribute starts out as None; only its path is stored.
        self.embedding_fp, self.embedding = embedding_fp, None
        self._random_state = get_random_state(random_state)
Beispiel #6
0
 def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0,
              random_state=None):
     """Set up the double balance strategy.

     The hyperparameters `a`, `alpha`, `b` and `beta` are stored unchanged on
     the instance. A SimpleBalance instance is kept as fallback model and
     `random_state` is converted with get_random_state.
     """
     super().__init__()
     self.a, self.alpha = a, alpha
     self.b, self.beta = b, beta
     self.fallback_model = SimpleBalance()
     self._random_state = get_random_state(random_state)
Beispiel #7
0
    def __init__(self, cluster_size=350, update_interval=200,
                 random_state=None):
        """Set up the clustering query strategy.

        Stores `cluster_size` and `update_interval`, resets `last_update`,
        keeps a MaxQuery instance as fallback model and converts
        `random_state` with get_random_state.
        """
        super().__init__()
        self.cluster_size, self.update_interval = cluster_size, update_interval
        self.last_update = None
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
Beispiel #8
0
    def __init__(self,
                 strategy_1="max",
                 strategy_2="random",
                 mix_ratio=0.95,
                 random_state=None,
                 **kwargs):
        """Initialize the Mixed query strategy

        Arguments
        ---------
        strategy_1: str
            Name of the first query strategy.
        strategy_2: str
            Name of the second query strategy.
        mix_ratio: float
            Portion of queries done by the first strategy. So a mix_ratio of
            0.95 means that 95% of the time query strategy 1 is used and 5% of
            the time query strategy 2.
        random_state: int, RandomState
            Converted with get_random_state and stored as
            `self._random_state`. The resulting state is also handed to each
            sub-strategy whose `default_param` contains "random_state".
        **kwargs: dict
            Keyword arguments for the two strategy. To specify which of the
            strategies the argument is for, prepend with the name of the query
            strategy and an underscore, e.g. 'max_' for maximal sampling.
            Keys matching neither strategy are ignored with a warning.
        """
        super(MixedQuery, self).__init__()
        kwargs_1 = {}
        kwargs_2 = {}
        for key, value in kwargs.items():
            if key.startswith(strategy_1):
                # "+ 1" also drops the underscore that follows the prefix.
                new_key = key[len(strategy_1) + 1:]
                kwargs_1[new_key] = value
            elif key.startswith(strategy_2):
                new_key = key[len(strategy_2) + 1:]
                kwargs_2[new_key] = value
            else:
                logging.warning(
                    f"Key {key} is being ignored for the mixed "
                    f"({strategy_1}, {strategy_2}) query strategy.")

        self.strategy_1 = strategy_1
        self.strategy_2 = strategy_2

        # First build the models without a random state so that their
        # `default_param` can be inspected.
        self.query_model1 = get_query_model(strategy_1, **kwargs_1)
        self.query_model2 = get_query_model(strategy_2, **kwargs_2)

        # Rebuild only those models that accept a random state, sharing a
        # single state object between them.
        self._random_state = get_random_state(random_state)
        if "random_state" in self.query_model1.default_param:
            self.query_model1 = get_query_model(
                strategy_1, **kwargs_1, random_state=self._random_state)
        if "random_state" in self.query_model2.default_param:
            self.query_model2 = get_query_model(
                strategy_2, **kwargs_2, random_state=self._random_state)
        self.mix_ratio = mix_ratio
Beispiel #9
0
    def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0, c=0.835,
                 gamma=2.0, shuffle=True, random_state=None):
        """Set up the triple balance strategy.

        Arguments
        ---------
        a: float
            Governs the weight of the 1's. Higher values mean linearly more 1's
            in your training sample.
        alpha: float
            Governs the scaling the weight of the 1's, as a function of the
            ratio of ones to zeros. A positive value means that the lower the
            ratio of zeros to ones, the higher the weight of the ones.
        b: float
            Governs how strongly we want to sample depending on the total
            number of samples. A value of 1 means no dependence on the total
            number of samples, while lower values mean increasingly stronger
            dependence on the number of samples.
        beta: float
            Governs the scaling of the weight of the zeros depending on the
            number of samples. Higher values means that larger samples are more
            strongly penalizing zeros.
        c: float
            Value between one and zero that governs the weight of samples done
            with maximal sampling. Higher values mean higher weight.
        gamma: float
            Governs the scaling of the weight of the max samples as a function
            of the % of papers read. Higher values mean stronger scaling.
        shuffle: bool
            Stored as `self.shuffle`; not used inside the initializer.
        random_state: int, RandomState
            Given both to the DoubleBalance fallback model and to
            get_random_state, whose result is kept as `self._random_state`.
        """
        super().__init__()
        self.a, self.alpha = a, alpha
        self.b, self.beta = b, beta
        self.c, self.gamma = c, gamma
        self.shuffle = shuffle
        # The fallback model reuses the first four hyperparameters.
        fallback_kwargs = dict(a=a, alpha=alpha, b=b, beta=beta,
                               random_state=random_state)
        self.fallback_model = DoubleBalance(**fallback_kwargs)
        self._random_state = get_random_state(random_state)
Beispiel #10
0
    def __init__(self, cluster_size=350, update_interval=200,
                 random_state=None):
        """Set up the clustering query strategy.

        Arguments
        ---------
        cluster_size: int
            Size of the clusters to be made. If the size of the clusters is
            smaller than the size of the pool, fall back to max sampling.
        update_interval: int
            Update the clustering every x instances.
        random_state: int, RandomState
            State/seed of the RNG.
        """
        super().__init__()
        self.cluster_size, self.update_interval = cluster_size, update_interval
        self.last_update = None
        # Max sampling is the strategy used as fallback.
        self.fallback_model = MaxQuery()
        self._random_state = get_random_state(random_state)
Beispiel #11
0
 def __init__(self, a=2.155, alpha=0.94, b=0.789, beta=1.0, c=0.835,
              gamma=2.0, shuffle=True, random_state=None):
     """Set up the triple balance strategy.

     All hyperparameters are stored unchanged on the instance. A
     DoubleBalance model built from `a`, `alpha`, `b`, `beta` and
     `random_state` is kept as fallback model, and `random_state` is
     converted with get_random_state.
     """
     super().__init__()
     self.a, self.alpha = a, alpha
     self.b, self.beta = b, beta
     self.c, self.gamma = c, gamma
     self.shuffle = shuffle
     fallback_kwargs = dict(a=a, alpha=alpha, b=b, beta=beta,
                            random_state=random_state)
     self.fallback_model = DoubleBalance(**fallback_kwargs)
     self._random_state = get_random_state(random_state)
Beispiel #12
0
def get_reviewer(dataset,
                 mode="simulate",
                 model=DEFAULT_MODEL,
                 query_strategy=DEFAULT_QUERY_STRATEGY,
                 balance_strategy=DEFAULT_BALANCE_STRATEGY,
                 feature_extraction=DEFAULT_FEATURE_EXTRACTION,
                 n_instances=DEFAULT_N_INSTANCES,
                 n_papers=None,
                 n_queries=None,
                 embedding_fp=None,
                 verbose=0,
                 prior_idx=None,
                 prior_record_id=None,
                 n_prior_included=DEFAULT_N_PRIOR_INCLUDED,
                 n_prior_excluded=DEFAULT_N_PRIOR_EXCLUDED,
                 config_file=None,
                 state_file=None,
                 model_param=None,
                 query_param=None,
                 balance_param=None,
                 feature_param=None,
                 seed=None,
                 included_dataset=None,
                 excluded_dataset=None,
                 prior_dataset=None,
                 new=False,
                 **kwargs):
    """Get a review object from arguments.

    See __main__.py for a description of the arguments.

    The function proceeds in these steps:

    1. Combine the datasets with create_as_data.
    2. Build the settings from the arguments and the optional config file.
       When a state file is given, the settings stored in that state are
       used; the argument-based settings are written to it first if the
       state is empty.
    3. Override the settings with `n_queries`, `n_papers` and the four
       `*_param` arguments when those are not None.
    4. Create the classifier, query, balance and feature models, all sharing
       one random state derived from `seed`.
    5. Construct the reviewer for the requested mode.

    `included_dataset`, `excluded_dataset` and `prior_dataset` default to
    None, which is treated as an empty list.

    Returns
    -------
    ReviewSimulate or MinimalReview:
        The reviewer for mode "simulate" or "minimal" respectively.

    Raises
    ------
    ValueError
        If the combined data contains no records, if `mode` is unknown, or
        if both `prior_idx` and `prior_record_id` are non-empty.
    """
    # Use a fresh list per call instead of a shared mutable default.
    if included_dataset is None:
        included_dataset = []
    if excluded_dataset is None:
        excluded_dataset = []
    if prior_dataset is None:
        prior_dataset = []

    as_data = create_as_data(dataset,
                             included_dataset,
                             excluded_dataset,
                             prior_dataset,
                             new=new)

    if len(as_data) == 0:
        raise ValueError("Supply at least one dataset"
                         " with at least one record.")

    cli_settings = ASReviewSettings(model=model,
                                    n_instances=n_instances,
                                    n_queries=n_queries,
                                    n_papers=n_papers,
                                    n_prior_included=n_prior_included,
                                    n_prior_excluded=n_prior_excluded,
                                    query_strategy=query_strategy,
                                    balance_strategy=balance_strategy,
                                    feature_extraction=feature_extraction,
                                    mode=mode,
                                    data_fp=None)
    cli_settings.from_file(config_file)

    # Settings already stored in a non-empty state take precedence over the
    # ones built from the arguments.
    if state_file is not None:
        with open_state(state_file) as state:
            if state.is_empty():
                state.settings = cli_settings
            settings = state.settings
    else:
        settings = cli_settings

    # Explicitly supplied values override whatever the settings contain.
    if n_queries is not None:
        settings.n_queries = n_queries
    if n_papers is not None:
        settings.n_papers = n_papers

    if model_param is not None:
        settings.model_param = model_param
    if query_param is not None:
        settings.query_param = query_param
    if balance_param is not None:
        settings.balance_param = balance_param
    if feature_param is not None:
        settings.feature_param = feature_param

    # Check if mode is valid
    if mode in AVAILABLE_REVIEW_CLASSES:
        logging.info(f"Start review in '{mode}' mode.")
    else:
        raise ValueError(f"Unknown mode '{mode}'.")
    logging.debug(settings)

    # Initialize models; all of them share the same random state object.
    random_state = get_random_state(seed)
    train_model = get_classifier(settings.model,
                                 **settings.model_param,
                                 random_state=random_state)
    query_model = get_query_model(settings.query_strategy,
                                  **settings.query_param,
                                  random_state=random_state)
    balance_model = get_balance_model(settings.balance_strategy,
                                      **settings.balance_param,
                                      random_state=random_state)
    feature_model = get_feature_model(settings.feature_extraction,
                                      **settings.feature_param,
                                      random_state=random_state)

    # LSTM models need embedding matrices.
    if train_model.name.startswith("lstm-"):
        texts = as_data.texts
        train_model.embedding_matrix = feature_model.get_embedding_matrix(
            texts, embedding_fp)

    # Prior knowledge: indices and record ids are mutually exclusive; record
    # ids are converted to indices.
    if prior_idx is not None and prior_record_id is not None and \
            len(prior_idx) > 0 and len(prior_record_id) > 0:
        raise ValueError(
            "Not possible to provide both prior_idx and prior_record_id")
    if prior_record_id is not None and len(prior_record_id) > 0:
        prior_idx = convert_id_to_idx(as_data, prior_record_id)

    # Initialize the review class.
    if mode == "simulate":
        reviewer = ReviewSimulate(as_data,
                                  model=train_model,
                                  query_model=query_model,
                                  balance_model=balance_model,
                                  feature_model=feature_model,
                                  n_papers=settings.n_papers,
                                  n_instances=settings.n_instances,
                                  n_queries=settings.n_queries,
                                  prior_idx=prior_idx,
                                  n_prior_included=settings.n_prior_included,
                                  n_prior_excluded=settings.n_prior_excluded,
                                  state_file=state_file,
                                  **kwargs)
    elif mode == "minimal":
        reviewer = MinimalReview(as_data,
                                 model=train_model,
                                 query_model=query_model,
                                 balance_model=balance_model,
                                 feature_model=feature_model,
                                 n_papers=settings.n_papers,
                                 n_instances=settings.n_instances,
                                 n_queries=settings.n_queries,
                                 state_file=state_file,
                                 **kwargs)
    else:
        # Reached only for a mode that is listed in AVAILABLE_REVIEW_CLASSES
        # but has no branch above.
        raise ValueError("Error finding mode, should never come here...")

    return reviewer
Beispiel #13
0
 def __init__(self, random_state=None):
     """Set up the random query strategy.

     `random_state` is converted with get_random_state and the result is
     kept as `self._random_state`.
     """
     super().__init__()
     rng = get_random_state(random_state)
     self._random_state = rng
Beispiel #14
0
 def __init__(self, ratio=1.0, random_state=None):
     """Set up the undersampling balance strategy.

     Stores `ratio` unchanged and keeps the result of
     get_random_state(random_state) as `self._random_state`.
     """
     super().__init__()
     rng = get_random_state(random_state)
     self.ratio = ratio
     self._random_state = rng