Code example #1
def compute_sde_on_bins(y_pred,
                        mask,
                        bin_indices,
                        target_efficiencies,
                        power=2.,
                        sample_weight=None):
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    # ignoring events from other classes
    y_pred = y_pred[mask]
    bin_indices = bin_indices[mask]
    sample_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices,
                                      sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies,
                                      mask=numpy.ones(len(y_pred), dtype=bool),
                                      y_pred=y_pred,
                                      sample_weight=sample_weight)

    result = 0.
    for cut in cuts:
        bin_efficiencies = compute_bin_efficiencies(
            y_pred,
            bin_indices=bin_indices,
            cut=cut,
            sample_weight=sample_weight)
        result += weighted_deviation(bin_efficiencies,
                                     weights=bin_weights,
                                     power=power)

    return (result / len(cuts))**(1. / power)
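Every example on this page funnels its event weights through check_sample_weight from hep_ml.commonutils. As a rough mental model, here is a minimal sketch inferred from the call sites above and below, not the library's exact implementation (normalize_by_class, used in a later example, is omitted):

import numpy

def check_sample_weight(y_true, sample_weight=None, normalize=False):
    # Sketch only: if no weights are given, every event gets weight 1;
    # otherwise require one weight per event.
    if sample_weight is None:
        sample_weight = numpy.ones(len(y_true), dtype=float)
    else:
        sample_weight = numpy.array(sample_weight, dtype=float)
        assert len(sample_weight) == len(y_true), 'wrong length of sample_weight'
    if normalize:
        # rescale to mean weight 1, as the reweighter examples expect
        sample_weight = sample_weight / numpy.mean(sample_weight)
    return sample_weight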
Code example #2
def compute_theil_on_bins(y_pred, mask, bin_indices, target_efficiencies,
                          sample_weight):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    # ignoring events from other classes
    y_pred = y_pred[mask]
    bin_indices = bin_indices[mask]
    sample_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices,
                                      sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies,
                                      mask=numpy.ones(len(y_pred), dtype=bool),
                                      y_pred=y_pred,
                                      sample_weight=sample_weight)
    result = 0.
    for cut in cuts:
        bin_efficiencies = compute_bin_efficiencies(
            y_pred,
            bin_indices=bin_indices,
            cut=cut,
            sample_weight=sample_weight)
        result += theil(bin_efficiencies, weights=bin_weights)
    return result / len(cuts)
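theil here is the Theil index of the bin efficiencies, a standard inequality measure that is zero exactly when every bin has the same efficiency. A hedged sketch consistent with the call sites theil(efficiencies, weights=bin_weights):

import numpy

def theil(x, weights):
    # Weighted Theil index (sketch): T = sum_i w_i * (x_i/m) * log(x_i/m),
    # with m the weighted mean of x and the w_i normalized to sum to 1.
    weights = weights / numpy.sum(weights)
    mean = numpy.average(x, weights=weights)
    normed = numpy.clip(x / mean, 1e-20, None)  # guard log(0) for empty bins
    return numpy.sum(weights * normed * numpy.log(normed))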
Code example #3
def cvm_2samp(data1, data2, weights1=None, weights2=None, power=2.):
    """Computes Cramer-von Mises similarity on 2 samples,
    CvM = \int |F_2 - F_1|^p dF_1
    This implementation sorts the arrays each time,
    so inside loops it will be slow"""
    weights1 = check_sample_weight(data1, sample_weight=weights1)
    weights2 = check_sample_weight(data2, sample_weight=weights2)
    weights1 /= numpy.sum(weights1)
    weights2 /= numpy.sum(weights2)
    data = numpy.unique(numpy.concatenate([data1, data2]))
    bins = numpy.append(data, data[-1] + 1)
    weights1_new = numpy.histogram(data1, bins=bins, weights=weights1)[0]
    weights2_new = numpy.histogram(data2, bins=bins, weights=weights2)[0]
    F1 = compute_cdf(weights1_new)
    F2 = compute_cdf(weights2_new)
    return numpy.average(numpy.abs(F1 - F2)**power, weights=weights1_new)
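cvm_2samp leans on a compute_cdf helper that turns the per-value weights into CDF values. A plausible minimal version (the half-step midpoint convention is an assumption), followed by a quick usage sketch that assumes cvm_2samp and its helpers are in scope:

import numpy

def compute_cdf(ordered_weights):
    # CDF evaluated at each unique value; subtracting half the local weight
    # places each step at its midpoint (assumed convention)
    return numpy.cumsum(ordered_weights) - 0.5 * ordered_weights

rng = numpy.random.RandomState(42)
same = cvm_2samp(rng.normal(size=1000), rng.normal(size=1000))
shifted = cvm_2samp(rng.normal(size=1000), rng.normal(size=1000) + 1.)
assert same < shifted  # the statistic grows as the samples separate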
Code example #4
def compute_sde_on_groups(y_pred,
                          mask,
                          groups_indices,
                          target_efficiencies,
                          sample_weight=None,
                          power=2.):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(
        groups_indices, sample_weight=sample_weight * mask)

    cuts = compute_cut_for_efficiency(target_efficiencies,
                                      mask=mask,
                                      y_pred=y_pred,
                                      sample_weight=sample_weight)

    sde = 0.
    for cut in cuts:
        group_efficiencies = compute_group_efficiencies_by_indices(
            y_pred,
            groups_indices=groups_indices,
            cut=cut,
            divided_weight=divided_weight)
        # print('FROM SDE function', cut, group_efficiencies)
        sde += weighted_deviation(group_efficiencies,
                                  weights=group_weights,
                                  power=power)
    return (sde / len(cuts))**(1. / power)
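weighted_deviation is the weighted central moment of the group efficiencies around their weighted mean; averaging it over the cuts and taking the power-th root turns the result into the SDE (standard deviation of efficiencies) for power=2. A sketch consistent with how it is called here:

import numpy

def weighted_deviation(a, weights, power=2.):
    # sum_i w_i * |a_i - mean|^power / sum_i w_i, mean being the weighted average
    mean = numpy.average(a, weights=weights)
    return numpy.average(numpy.abs(a - mean) ** power, weights=weights)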
Code example #5
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)

    result = 0.
    global_data, global_weight, global_F = prepare_distribution(y_pred[mask], weights=sample_weight[mask])
    for group, group_weight in zip(groups_indices, group_weights):
        local_distribution = y_pred[group]
        local_weights = sample_weight[group]
        result += group_weight * _cvm_2samp_fast(global_data, local_distribution,
                                                 global_weight, local_weights, global_F)
    return result
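group_based_cvm prepares the global (masked) prediction distribution once, then compares every group against it with _cvm_2samp_fast. A plausible prepare_distribution, consistent with the compute_cdf sketch shown earlier (the exact hep_ml internals may differ):

import numpy

def prepare_distribution(data, weights):
    # collapse to unique values, summing normalized weights per value,
    # and precompute the CDF so group comparisons avoid re-sorting
    weights = weights / numpy.sum(weights)
    unique_data, indices = numpy.unique(data, return_inverse=True)
    unique_weights = numpy.bincount(indices, weights=weights)
    return unique_data, unique_weights, compute_cdf(unique_weights)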
Code example #6
    def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
        X, y = check_arrays(X, y)
        self.n_features = X.shape[1]
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        if loss is None:
            loss = BinomialDevianceLossFunction()
        loss.fit(X, y, sample_weight=sample_weight)

        self.coeffs = numpy.zeros(
            [self.compute_n_features(), 2**self.power_categories],
            dtype='float')
        y_pred = numpy.zeros(len(X), dtype='float')

        for iteration in range(iterations):
            print(iteration, loss(y_pred))

            for feature, feature_values in enumerate(
                    self.enumerate_features(X)):
                # TODO compute once per iteration!
                ngradient = loss.negative_gradient(y_pred)

                nominator = numpy.bincount(feature_values,
                                           weights=ngradient,
                                           minlength=2**self.power_categories)
                nominator -= 2 * self.l2_reg * self.coeffs[
                    feature, :] + self.l1_reg * numpy.sign(
                        self.coeffs[feature, :])

                denominator = numpy.abs(ngradient) * (1. -
                                                      numpy.abs(ngradient))
                denominator = numpy.bincount(
                    feature_values,
                    weights=denominator,
                    minlength=2**self.power_categories)
                denominator += 2 * self.l2_reg

                gradients = nominator / denominator
                right_gradients = gradients
                # coefficients that are already zero stay zero while the gradient is below the L1 threshold
                mask = (self.coeffs[feature, :]
                        == 0) & (numpy.abs(gradients) < self.l1_reg)
                right_gradients[mask] = 0
                # already-nonzero coefficients: zeroed below if the update flips their sign
                old_coeffs = self.coeffs[feature, :]
                new_coeffs = old_coeffs + self.learning_rate * right_gradients
                new_coeffs[new_coeffs * old_coeffs < 0] = 0
                self.coeffs[feature, :] = new_coeffs
                y_diff = numpy.take(new_coeffs - old_coeffs, feature_values)
                y_pred += y_diff

        return self
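The inner loop of this fit aggregates the negative gradient per category with numpy.bincount. A tiny standalone illustration of that aggregation pattern:

import numpy

feature_values = numpy.array([0, 2, 2, 1, 0])          # category of each event
ngradient = numpy.array([0.5, -1.0, 0.25, 2.0, 0.5])   # per-event gradient
# per-category sums; minlength pads categories that never occur
print(numpy.bincount(feature_values, weights=ngradient, minlength=4))
# -> [ 1.    2.   -0.75  0.  ]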
Code example #7
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    groups_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(groups_indices, sample_weight=sample_weight * mask)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask,
                                      y_pred=y_pred, sample_weight=sample_weight)

    result = 0.
    for cut in cuts:
        groups_efficiencies = compute_group_efficiencies_by_indices(
            y_pred, groups_indices=groups_indices, cut=cut, divided_weight=divided_weight)
        result += theil(groups_efficiencies, groups_weights)
    return result / len(cuts)
Code example #8
def compute_sde_on_groups(y_pred, mask, groups_indices, target_efficiencies, sample_weight=None, power=2.):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(groups_indices, sample_weight=sample_weight * mask)

    cuts = compute_cut_for_efficiency(target_efficiencies, mask=mask, y_pred=y_pred, sample_weight=sample_weight)

    sde = 0.
    for cut in cuts:
        group_efficiencies = compute_group_efficiencies_by_indices(
            y_pred, groups_indices=groups_indices, cut=cut, divided_weight=divided_weight)
        # print('FROM SDE function', cut, group_efficiencies)
        sde += weighted_deviation(group_efficiencies, weights=group_weights, power=power)
    return (sde / len(cuts)) ** (1. / power)
Code example #9
def group_based_cvm(y_pred, mask, sample_weight, groups_indices):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    group_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)

    result = 0.
    global_data, global_weight, global_F = prepare_distribution(
        y_pred[mask], weights=sample_weight[mask])
    for group, group_weight in zip(groups_indices, group_weights):
        local_distribution = y_pred[group]
        local_weights = sample_weight[group]
        result += group_weight * _cvm_2samp_fast(
            global_data, local_distribution, global_weight, local_weights,
            global_F)
    return result
Code example #10
File: reweight.py  Project: nickcdryan/hep_ml
 def _normalize_input(self, data, weights):
     """ Normalize input of reweighter
     :param data: array like of shape [n_samples] or [n_samples, n_features]
     :param weights: array-like of shape [n_samples] or None
     :return: tuple with
         data - numpy.array of shape [n_samples, n_features]
         weights - numpy.array of shape [n_samples] with mean = 1.
     """
     weights = check_sample_weight(data, sample_weight=weights, normalize=True)
     data = numpy.array(data)
     if len(data.shape) == 1:
         data = data[:, numpy.newaxis]
     if self.n_features_ is None:
         self.n_features_ = data.shape[1]
     assert self.n_features_ == data.shape[1], \
         'number of features is wrong: {} {}'.format(self.n_features_, data.shape[1])
     return data, weights
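The effect of _normalize_input is easy to see in isolation: 1-D data gains a feature axis and the weights come back with mean 1. The same two transformations in plain numpy:

import numpy

data = numpy.array([0.3, 1.2, 0.7])           # shape [n_samples]
weights = numpy.array([1.0, 3.0, 2.0])

data = data[:, numpy.newaxis]                 # -> shape [n_samples, 1]
weights = weights / numpy.mean(weights)       # -> mean is exactly 1.0
print(data.shape, weights.mean())             # (3, 1) 1.0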
Code example #11
def compute_theil_on_bins(y_pred, mask, bin_indices, target_efficiencies, sample_weight):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)

    # ignoring events from other classes
    y_pred = y_pred[mask]
    bin_indices = bin_indices[mask]
    sample_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=numpy.ones(len(y_pred), dtype=bool),
                                      y_pred=y_pred, sample_weight=sample_weight)
    result = 0.
    for cut in cuts:
        bin_efficiencies = compute_bin_efficiencies(y_pred, bin_indices=bin_indices,
                                                    cut=cut, sample_weight=sample_weight)
        result += theil(bin_efficiencies, weights=bin_weights)
    return result / len(cuts)
Code example #12
def compute_sde_on_bins(y_pred, mask, bin_indices, target_efficiencies, power=2., sample_weight=None):
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    # ignoring events from other classes
    y_pred = y_pred[mask]
    bin_indices = bin_indices[mask]
    sample_weight = sample_weight[mask]

    bin_weights = compute_bin_weights(bin_indices=bin_indices, sample_weight=sample_weight)
    cuts = compute_cut_for_efficiency(target_efficiencies, mask=numpy.ones(len(y_pred), dtype=bool),
                                      y_pred=y_pred, sample_weight=sample_weight)

    result = 0.
    for cut in cuts:
        bin_efficiencies = compute_bin_efficiencies(y_pred, bin_indices=bin_indices,
                                                    cut=cut, sample_weight=sample_weight)
        result += weighted_deviation(bin_efficiencies, weights=bin_weights, power=power)

    return (result / len(cuts)) ** (1. / power)
Code example #13
File: reweight.py  Project: remenska/hep_ml
 def _normalize_input(self, data, weights):
     """ Normalize input of reweighter
     :param data: array like of shape [n_samples] or [n_samples, n_features]
     :param weights: array-like of shape [n_samples] or None
     :return: tuple with
         data - numpy.array of shape [n_samples, n_features]
         weights - numpy.array of shape [n_samples] with mean = 1.
     """
     weights = check_sample_weight(data,
                                   sample_weight=weights,
                                   normalize=True)
     data = numpy.array(data)
     if len(data.shape) == 1:
         data = data[:, numpy.newaxis]
     if self.n_features_ is None:
         self.n_features_ = data.shape[1]
     assert self.n_features_ == data.shape[1], \
         'number of features is wrong: {} {}'.format(self.n_features_, data.shape[1])
     return data, weights
Code example #14
def compute_theil_on_groups(y_pred, mask, groups_indices, target_efficiencies,
                            sample_weight):
    y_pred = column_or_1d(y_pred)
    sample_weight = check_sample_weight(y_pred, sample_weight=sample_weight)
    groups_weights = compute_group_weights_by_indices(
        groups_indices, sample_weight=sample_weight)
    divided_weight = compute_divided_weight_by_indices(
        groups_indices, sample_weight=sample_weight * mask)
    cuts = compute_cut_for_efficiency(target_efficiencies,
                                      mask=mask,
                                      y_pred=y_pred,
                                      sample_weight=sample_weight)

    result = 0.
    for cut in cuts:
        groups_efficiencies = compute_group_efficiencies_by_indices(
            y_pred,
            groups_indices=groups_indices,
            cut=cut,
            divided_weight=divided_weight)
        result += theil(groups_efficiencies, groups_weights)
    return result / len(cuts)
Code example #15
    def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
        X, y = check_arrays(X, y)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        if loss is None:
            loss = BinomialDevianceLossFunction()
        loss.fit(X, y, sample_weight=sample_weight)
        self.n_features = X.shape[1]
        self.coeffs = numpy.zeros([self.n_features, self.max_categories], dtype='float')
        assert numpy.max(X) < self.max_categories
        assert numpy.min(X) >= 0

        for iteration in range(iterations):
            # recomputing predictions from scratch avoids accumulated
            # floating-point error after many incremental updates
            y_pred = self.decision_function(X)
            print(iteration, loss(y_pred))

            for feature in range(self.n_features):
                ngradient = loss.negative_gradient(y_pred)
                nominator = numpy.bincount(X[:, feature], weights=ngradient, minlength=self.max_categories)
                nominator -= self.l2_reg * self.coeffs[feature, :] + self.l1_reg * numpy.sign(self.coeffs[feature, :])

                denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
                denominator = numpy.bincount(X[:, feature], weights=denominator, minlength=self.max_categories)
                denominator += 2 * self.l2_reg + 5

                gradients = nominator / denominator
                right_gradients = gradients
                # coefficients that are already zero stay zero while the gradient is below the L1 threshold
                mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
                right_gradients[mask] = 0
                # already-nonzero coefficients: zeroed below if the update flips their sign
                old_coeffs = self.coeffs[feature, :]
                new_coeffs = old_coeffs + self.learning_rate * right_gradients
                new_coeffs[new_coeffs * old_coeffs < 0] = 0
                y_pred += numpy.take(new_coeffs - old_coeffs, X[:, feature])
                self.coeffs[feature, :] = new_coeffs

        return self
Code example #16
    def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
        X, y = check_arrays(X, y)
        self.n_features = X.shape[1]
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        if loss is None:
            loss = BinomialDevianceLossFunction()
        loss.fit(X, y, sample_weight=sample_weight)

        self.coeffs = numpy.zeros([self.compute_n_features(), 2 ** self.power_categories], dtype='float')
        y_pred = numpy.zeros(len(X), dtype='float')

        for iteration in range(iterations):
            print(iteration, loss(y_pred))

            for feature, feature_values in enumerate(self.enumerate_features(X)):
                # TODO compute once per iteration!
                ngradient = loss.negative_gradient(y_pred)

                nominator = numpy.bincount(feature_values, weights=ngradient, minlength=2 ** self.power_categories)
                nominator -= 2 * self.l2_reg * self.coeffs[feature, :] + self.l1_reg * numpy.sign(self.coeffs[feature, :])

                denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
                denominator = numpy.bincount(feature_values, weights=denominator, minlength=2 ** self.power_categories)
                denominator += 2 * self.l2_reg

                gradients = nominator / denominator
                right_gradients = gradients
                # coefficients that are already zero stay zero while the gradient is below the L1 threshold
                mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
                right_gradients[mask] = 0
                # already-nonzero coefficients: zeroed below if the update flips their sign
                old_coeffs = self.coeffs[feature, :]
                new_coeffs = old_coeffs + self.learning_rate * right_gradients
                new_coeffs[new_coeffs * old_coeffs < 0] = 0
                self.coeffs[feature, :] = new_coeffs
                y_diff = numpy.take(new_coeffs - old_coeffs, feature_values)
                y_pred += y_diff

        return self
Code example #17
    def fit(self, sample_weight, values_init=None):
        X = self.X

        assert isinstance(X, pandas.DataFrame), 'please pass pandas.DataFrame first'
        for column_name, column_range in self.column_ranges.items():
            lower, upper = column_range
            assert numpy.all(X[column_name] >= lower) and numpy.all(X[column_name] <= upper), \
                '{} out of range'.format(column_name)

        pos_weight = check_sample_weight(X, sample_weight=sample_weight, normalize=True)
        self.Function = lambda parameters: self.Tfunction(parameters, pos_weight)
        self.Derivative = lambda parameters: self.Tderivative(parameters, pos_weight)

        lower_boundary = numpy.zeros(len(self.parameters_ranges))
        upper_boundary = numpy.zeros(len(self.parameters_ranges))
        initial_values = numpy.zeros(len(self.parameters_ranges))

        for i, (param_name, param_range) in enumerate(self.parameters_ranges.items()):
            if len(param_range) == 2:
                lower_boundary[i], upper_boundary[i] = param_range
                initial_values[i] = (lower_boundary[i] + upper_boundary[i]) / 2.
            else:
                lower_boundary[i], initial_values[i], upper_boundary[i] = param_range
                assert lower_boundary[i] <= initial_values[i] <= upper_boundary[i], \
                    'For variable {} the passed initial value was outside its range'.format(param_name)

        if values_init is not None:
            assert isinstance(values_init, dict)
            for i, param_name in enumerate(self.parameters_ranges.keys()):
                if param_name in values_init:
                    initial_values[i] = values_init[param_name]

        # pass the parsed (lower, upper) pairs: parameters_ranges may hold
        # 3-tuples that also carry the initial value
        self.optimization_result = minimize(self.Function, initial_values, jac=self.Derivative,
                                            # hessp=self.HessianTimesP,
                                            bounds=list(zip(lower_boundary, upper_boundary)),
                                            # options={"disp": True}
                                            )
        self.parameters = OrderedDict(zip(self.parameters_ranges, self.optimization_result.x))
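The fit above ultimately reduces to a bounded scipy.optimize.minimize call with an analytic gradient. A minimal self-contained example of that pattern (the quadratic objective is chosen purely for illustration):

import numpy
from scipy.optimize import minimize

target = numpy.array([2.0, -1.0])

def function(parameters):
    return numpy.sum((parameters - target) ** 2)

def derivative(parameters):
    return 2. * (parameters - target)

result = minimize(function, numpy.zeros(2), jac=derivative,
                  bounds=[(0., 1.), (-0.5, 0.5)])
print(result.x)  # clipped to the box: approximately [1.0, -0.5]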
Code example #18
File: uboost.py  Project: chrinide/hep_ml
 def _normalize_weight(y, weight):
     # frequently algorithm assigns very big weight to signal events
     # compared to background ones (or visa versa, if want to be uniform in bck)
     return commonutils.check_sample_weight(y, sample_weight=weight, normalize=True, normalize_by_class=True)
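normalize_by_class presumably rescales signal and background weights separately, so each class ends up with mean weight 1 and neither dominates. A hedged numpy illustration of that per-class rescaling:

import numpy

y = numpy.array([0, 0, 1, 1, 1])
weight = numpy.array([1.0, 3.0, 10.0, 10.0, 10.0])
normalized = weight.astype(float)
for label in numpy.unique(y):
    cls = (y == label)
    normalized[cls] /= numpy.mean(weight[cls])  # each class -> mean weight 1
print(normalized)  # [0.5 1.5 1.  1.  1. ]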
Code example #19
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration, self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(cl.predict(X) * rate for rate, cl in zip(self.learning_rates, self.classifiers))


            self.loss_values.append(self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration, self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf, iteration, self.n_estimators, use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(len(sample_weight))
            residual = self.loss.negative_gradient(y, y_pred, sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask], check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag, self.random_state)
                self.loss.update_terminal_regions(tree.tree_, X=iter_X, y=y, residual=residual, pred=y_pred,
                                                  sample_mask=update_mask, sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate, iteration, self.n_estimators, use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
Code example #20
File: uboost.py  Project: chrinide/hep_ml
    def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
        """Build a boosted classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like of shape = [n_samples]
            The target values (integers that correspond to classes).

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            ``1 / n_samples``.

        neighbours_matrix: array-like of shape [n_samples, n_neighbours],
            each row contains indices of signal neighbours
            (neighbours should be computed for background too),
            if None, this matrix is computed.

        Returns
        -------
        self : object
            Returns self.
        """
        if self.smoothing < 0:
            raise ValueError("Smoothing must be non-negative")
        if not isinstance(self.base_estimator, BaseEstimator):
            raise TypeError("estimator must be a subclass of BaseEstimator")
        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero.")
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        # Check that algorithm is supported
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported"
                             % self.algorithm)
        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator, 'predict_proba'):
                raise TypeError(
                    "uBoostBDT with algorithm='SAMME.R' requires "
                    "that the weak learner have a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")

        assert np.in1d(y, [0, 1]).all(), \
            "only two-class classification is implemented, with labels 0 and 1"
        self.signed_uniform_label = 2 * self.uniform_label - 1

        if neighbours_matrix is not None:
            assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
                "Wrong shape of neighbours_matrix"
            self.knn_indices = neighbours_matrix
        else:
            assert self.uniform_variables is not None, \
                "uniform_variables should be set"
            self.knn_indices = compute_knn_indices_of_same_class(
                X.loc[:, self.uniform_variables], y, self.n_neighbors)

        sample_weight = commonutils.check_sample_weight(y, sample_weight=sample_weight, normalize=True)
        assert np.all(sample_weight >= 0.), 'the weights should be non-negative'

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = []
        # score cuts correspond to
        # global efficiency == target_efficiency on each iteration.
        self.score_cuts_ = []

        X_train_variables = self.get_train_vars(X)
        X_train_variables, y, sample_weight = check_xyw(X_train_variables, y, sample_weight)

        # A dictionary to keep all intermediate weights, efficiencies and so on
        if self.keep_debug_info:
            self.debug_dict = defaultdict(list)

        self.random_generator = check_random_state(self.random_state)

        self._boost(X_train_variables, y, sample_weight)

        self.score_cut = self.signed_uniform_label * compute_cut_for_efficiency(
            self.target_efficiency, y == self.uniform_label, self.predict_score(X) * self.signed_uniform_label)
        assert np.allclose(self.score_cut, self.score_cuts_[-1], rtol=1e-10, atol=1e-10), \
            "score cut doesn't appear to coincide with the staged one"
        assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
        return self
Code example #21
 def __call__(self, y, y_pred, sample_weight=None):
     signed_multiplier = self._signed_multiplier(y)
     weight_multiplier = self._weight_multiplier(y)
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return numpy.sum(sample_weight * weight_multiplier * numpy.exp(y_pred * signed_multiplier))
Code example #22
 def negative_gradient(self, y, y_pred, sample_weight=None, **kargs):
     multiplier = self._signed_multiplier(y)
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return sample_weight * y_signed * numpy.exp(y_pred * multiplier)
Code example #23
 def __call__(self, y, y_pred, sample_weight=None):
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return numpy.sum(sample_weight * numpy.log(1 + numpy.exp(- y_signed * y_pred - self.shift)))
Code example #24
 def negative_gradient(self, y, y_pred, sample_weight=None):
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return sample_weight * y_signed * expit(-y_signed * y_pred - self.shift)
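For this loss/gradient pair it is worth verifying that negative_gradient really is -dL/d(y_pred) of the matching __call__ in the previous example. A quick numerical sanity check with unit weights and the shift folded in:

import numpy
from scipy.special import expit

shift = 0.
y = numpy.array([0., 1., 1., 0.])
y_pred = numpy.array([0.3, -0.2, 1.5, 0.])
y_signed = 2. * y - 1

def loss(pred):
    return numpy.sum(numpy.log(1 + numpy.exp(-y_signed * pred - shift)))

analytic = y_signed * expit(-y_signed * y_pred - shift)
eps = 1e-6
numeric = numpy.array([-(loss(y_pred + eps * numpy.eye(4)[i]) -
                         loss(y_pred - eps * numpy.eye(4)[i])) / (2 * eps)
                       for i in range(4)])
assert numpy.allclose(analytic, numeric, atol=1e-6)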
Code example #25
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X,
                            y,
                            dtype=DTYPE,
                            sparse_format="dense",
                            check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(
            y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration,
                                                self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X,
                                           dtype=DTYPE,
                                           sparse_format="dense",
                                           check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(
                        cl.predict(X) * rate for rate, cl in zip(
                            self.learning_rates, self.classifiers))

            self.loss_values.append(
                self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration,
                                      self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(
                len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(
                len(sample_weight))
            residual = self.loss.negative_gradient(y,
                                                   y_pred,
                                                   sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask],
                     check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag,
                                                      self.random_state)
                self.loss.update_terminal_regions(tree.tree_,
                                                  X=iter_X,
                                                  y=y,
                                                  residual=residual,
                                                  pred=y_pred,
                                                  sample_mask=update_mask,
                                                  sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
Code example #26
 def negative_gradient(self, y, y_pred, sample_weight=None, **kargs):
     multiplier = self._signed_multiplier(y)
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return sample_weight * y_signed * numpy.exp(y_pred * multiplier)
Code example #27
 def __call__(self, y, y_pred, sample_weight=None):
     signed_multiplier = self._signed_multiplier(y)
     weight_multiplier = self._weight_multiplier(y)
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return numpy.sum(sample_weight * weight_multiplier *
                      numpy.exp(y_pred * signed_multiplier))
Code example #28
 def negative_gradient(self, y, y_pred, sample_weight=None):
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return sample_weight * y_signed * expit(-y_signed * y_pred -
                                             self.shift)
Code example #29
 def __call__(self, y, y_pred, sample_weight=None):
     y_signed = 2. * y - 1
     sample_weight = check_sample_weight(y, sample_weight=sample_weight)
     return numpy.sum(
         sample_weight *
         numpy.log(1 + numpy.exp(-y_signed * y_pred - self.shift)))