Example #1
File: triple.py  Project: openefsa/asreview
    def __init__(self,
                 a=2.155,
                 alpha=0.94,
                 b=0.789,
                 beta=1.0,
                 c=0.835,
                 gamma=2.0,
                 shuffle=True,
                 random_state=None):
        """Initialize the triple balance strategy.

        Arguments
        ---------
        a: float
            Governs the weight of the 1's. Higher values mean linearly more 1's
            in your training sample.
        alpha: float
            Governs the scaling of the weight of the 1's as a function of the
            ratio of ones to zeros. A positive value means that the lower the
            ratio of zeros to ones, the higher the weight of the ones.
        b: float
            Governs how strongly we want to sample depending on the total
            number of samples. A value of 1 means no dependence on the total
            number of samples, while lower values mean increasingly stronger
            dependence on the number of samples.
        beta: float
            Governs the scaling of the weight of the zeros depending on the
            number of samples. Higher values mean that larger samples penalize
            zeros more strongly.
        c: float
            Value between zero and one that governs the weight of samples done
            with maximal sampling. Higher values mean a higher weight.
        gamma: float
            Governs the scaling of the weight of the max samples as a function
            of the % of papers read. Higher values mean stronger scaling.
        """
        super(TripleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.c = c
        self.gamma = gamma
        self.shuffle = shuffle
        self.fallback_model = DoubleBalance(a=a,
                                            alpha=alpha,
                                            b=b,
                                            beta=beta,
                                            random_state=random_state)
        self._random_state = get_random_state(random_state)
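
The parameters documented above map directly onto the constructor keywords, so a minimal instantiation sketch could look like the following (the import path is an assumption and differs between asreview versions; the values shown are simply the documented defaults):

from asreview.balance_strategies.triple import TripleBalance  # assumed path

# Instantiate with the documented defaults; random_state makes the shuffling reproducible.
balancer = TripleBalance(a=2.155, alpha=0.94, b=0.789, beta=1.0,
                         c=0.835, gamma=2.0, shuffle=True, random_state=42)
# A DoubleBalance instance is kept as a fallback for rounds that lack
# either max-sampled or randomly sampled records.
print(type(balancer.fallback_model).__name__)  # DoubleBalance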
Example #2
    def __init__(self,
                 a=2.155,
                 alpha=0.94,
                 b=0.789,
                 beta=1.0,
                 c=0.835,
                 gamma=2.0,
                 shuffle=True,
                 random_state=None):
        """Initialize the triple balance strategy."""
        super(TripleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.c = c
        self.gamma = gamma
        self.shuffle = shuffle
        self.fallback_model = DoubleBalance(a=a,
                                            alpha=alpha,
                                            b=b,
                                            beta=beta,
                                            random_state=random_state)
        self._random_state = get_random_state(random_state)
Example #3
# Module-level imports this snippet relies on. The paths follow asreview's
# balance_strategies layout and may differ between versions; _get_triple_dist
# is a private helper defined further down in triple.py and is not imported.
import logging

import numpy as np

from asreview.balance_strategies.base import BaseBalance
from asreview.balance_strategies.double import DoubleBalance, fill_training
from asreview.utils import get_random_state


class TripleBalance(BaseBalance):
    """Triple balance strategy.

    Class to get the three-way rebalancing function and arguments.
    It divides the data into three groups: 1's, 0's from random sampling,
    and 0's from max sampling. Thus it only makes sense to use this class in
    combination with the rand_max query strategy.

    Arguments
    ---------
    a: float
        Governs the weight of the 1's. Higher values mean linearly more 1's
        in your training sample.
    alpha: float
        Governs the scaling of the weight of the 1's as a function of the
        ratio of ones to zeros. A positive value means that the lower the
        ratio of zeros to ones, the higher the weight of the ones.
    b: float
        Governs how strongly we want to sample depending on the total
        number of samples. A value of 1 means no dependence on the total
        number of samples, while lower values mean increasingly stronger
        dependence on the number of samples.
    beta: float
        Governs the scaling of the weight of the zeros depending on the
        number of samples. Higher values mean that larger samples penalize
        zeros more strongly.
    c: float
        Value between zero and one that governs the weight of samples done
        with maximal sampling. Higher values mean a higher weight.
    gamma: float
        Governs the scaling of the weight of the max samples as a function
        of the % of papers read. Higher values mean stronger scaling.
    """

    name = "triple"

    def __init__(self,
                 a=2.155,
                 alpha=0.94,
                 b=0.789,
                 beta=1.0,
                 c=0.835,
                 gamma=2.0,
                 shuffle=True,
                 random_state=None):
        """Initialize the triple balance strategy."""
        super(TripleBalance, self).__init__()
        self.a = a
        self.alpha = alpha
        self.b = b
        self.beta = beta
        self.c = c
        self.gamma = gamma
        self.shuffle = shuffle
        self.fallback_model = DoubleBalance(a=a,
                                            alpha=alpha,
                                            b=b,
                                            beta=beta,
                                            random_state=random_state)
        self._random_state = get_random_state(random_state)

    def sample(self, X, y, train_idx, shared):
        """Resample the training data.

        Arguments
        ---------
        X: np.array
            Complete feature matrix.
        y: np.array
            Labels for all papers.
        train_idx: np.array
            Training indices, that is all papers that have been reviewed.
        shared: dict
            Dictionary to share data between balancing models and other models.

        Returns
        -------
        np.array, np.array:
            X_train, y_train: the resampled matrix, labels.
        """
        # Indices queried by max sampling; all other query types count as random.
        max_idx = np.array(shared["query_src"].get("max", []), dtype=int)
        rand_idx = np.array([], dtype=int)
        for qtype in shared["query_src"]:
            if qtype != "max":
                rand_idx = np.append(rand_idx, shared["query_src"][qtype])

        rand_idx = rand_idx.astype(int)
        # Shuffle within each group before resampling.
        if self.shuffle:
            self._random_state.shuffle(rand_idx)
            self._random_state.shuffle(max_idx)

        if len(rand_idx) == 0 or len(max_idx) == 0:
            logging.debug("Warning: trying to use triple balance, but unable"
                          f" to, because we have {len(max_idx)} max samples "
                          f"and {len(rand_idx)} random samples.")
            return self.fallback_model.sample(X, y, train_idx, shared)

        # Split the idx into three groups: 1's, random 0's, max 0's.
        one_idx = train_idx[np.where(y[train_idx] == 1)]
        zero_max_idx = max_idx[np.where(y[max_idx] == 0)]
        zero_rand_idx = rand_idx[np.where(y[rand_idx] == 0)]

        if len(zero_rand_idx) == 0 or len(zero_max_idx) == 0:
            logging.debug("Warning: trying to use triple balance, but unable "
                          f"to, because we have {len(zero_max_idx)} zero max "
                          f"samples and {len(zero_rand_idx)} zero random samples.")
            return self.fallback_model.sample(X, y, train_idx, shared)

        n_one = len(one_idx)
        n_zero_rand = len(zero_rand_idx)
        n_zero_max = len(zero_max_idx)
        n_samples = len(y)
        n_train = len(train_idx)

        # Get the distribution of 1's, and random 0's and max 0's.
        n_one_train, n_zero_rand_train, n_zero_max_train = _get_triple_dist(
            n_one, n_zero_rand, n_zero_max, n_samples, n_train, self.a,
            self.alpha, self.b, self.beta, self.c, self.gamma,
            self._random_state)
        logging.debug(f"(1, 0_rand, 0_max) = ({n_one_train}, "
                      f"{n_zero_rand_train}, {n_zero_max_train})")

        one_train_idx = fill_training(one_idx, n_one_train, self._random_state)
        zero_rand_train_idx = fill_training(zero_rand_idx, n_zero_rand_train,
                                            self._random_state)
        zero_max_train_idx = fill_training(zero_max_idx, n_zero_max_train,
                                           self._random_state)

        all_idx = np.concatenate(
            [one_train_idx, zero_rand_train_idx, zero_max_train_idx])
        self._random_state.shuffle(all_idx)

        return X[all_idx], y[all_idx]

    def full_hyper_space(self):
        from hyperopt import hp
        parameter_space = {
            "bal_a": hp.lognormal("bal_a", 0, 1),
            "bal_alpha": hp.uniform("bal_alpha", 0, 2),
            "bal_b": hp.uniform("bal_b", 0, 1),
            # "bal_zero_beta": hp.uniform("bal_zero_beta", 0, 2),
            "bal_c": hp.uniform("bal_c", 0, 1),
            # "bal_zero_max_gamma": hp.uniform("bal_zero_max_gamma", 0.01, 2)
        }
        return parameter_space, {}
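
As a rough usage sketch for sample(): the data and the hand-built shared dict below are illustrative only; in a real asreview run the review loop fills shared["query_src"] with the indices each query strategy selected.

import numpy as np

X = np.random.rand(100, 5)                 # toy feature matrix
y = np.zeros(100, dtype=int)
y[:10] = 1                                 # a handful of relevant papers
train_idx = np.arange(30)                  # papers reviewed so far
shared = {"query_src": {"max": list(range(20)),          # queried by max sampling
                        "random": list(range(20, 30))}}  # queried by random sampling

balancer = TripleBalance(random_state=42)
X_train, y_train = balancer.sample(X, y, train_idx, shared)
# The resampled set mixes 1's, random 0's and max 0's according to a, alpha,
# b, beta, c and gamma; if either zero group were empty, the DoubleBalance
# fallback would be used instead.
print(X_train.shape, y_train.mean())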