def calc_swap_deltas(self, qid, targets):
    """Compute the metric change caused by swapping each pair of positions.

    Each position contributes ``residual * satisfied_prob / rank`` to the
    metric (only for positions below ``self.k``), where ``residual`` is the
    running product of ``1 - satisfied_prob`` over all earlier positions.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order. Every grade must not exceed
        ``self.highest_score``.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` (``i < j`` and ``i < self.k``) is the change in
        the metric when positions ``i`` and ``j`` are exchanged. All other
        entries are zero.
    """
    count = len(targets)
    deltas = np.zeros((count, count))
    probs = np.zeros(count)
    # cumulative[p] is the metric accumulated over positions 0 .. p-1.
    cumulative = np.zeros(count + 1)
    # residuals[p] is the product of (1 - prob) over positions 0 .. p-1.
    residuals = np.ones(count + 1)

    for pos, grade in enumerate(targets):
        assert grade <= self.highest_score
        prob = self._get_satisfied_prob(grade)
        probs[pos] = prob
        if pos < self.k:
            contribution = residuals[pos] * prob / (1.0 + pos)
        else:
            contribution = 0.0
        cumulative[pos + 1] = cumulative[pos] + contribution
        residuals[pos + 1] = residuals[pos] * (1.0 - prob)

    for hi in range(min(count, self.k)):
        p_hi = probs[hi]
        for lo in range(hi + 1, count):
            p_lo = probs[lo]
            if p_hi == p_lo:
                # Swapping items with the same probability changes nothing.
                continue
            ratio = (1.0 - p_lo) / (1.0 - p_hi)
            # Change at position ``hi`` itself.
            at_hi = (p_lo - p_hi) * residuals[hi] / (hi + 1.0)
            # Positions strictly between: their residual is rescaled.
            between = (cumulative[lo] - cumulative[hi + 1]) * (ratio - 1.0)
            # Change at position ``lo`` (only counted inside the cutoff).
            if lo < self.k:
                at_lo = ((residuals[lo] / (lo + 1.0))
                         * (p_hi * ratio - p_lo))
            else:
                at_lo = 0.0
            deltas[hi, lo] = at_hi + between + at_lo
    return deltas
def calc_swap_deltas(self, qid, targets):
    """Compute the metric change caused by swapping each pair of positions.

    Each position contributes ``residual * satisfied_prob / rank`` to the
    metric (only for positions below ``self.k``), where ``residual`` is the
    running product of ``1 - satisfied_prob`` over all earlier positions.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order. Every grade must not exceed
        ``self.highest_score``.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` (``i < j`` and ``i < self.k``) is the change in
        the metric when positions ``i`` and ``j`` are exchanged. All other
        entries are zero.
    """
    count = len(targets)
    deltas = np.zeros((count, count))
    probs = np.zeros(count)
    # cumulative[p] is the metric accumulated over positions 0 .. p-1.
    cumulative = np.zeros(count + 1)
    # residuals[p] is the product of (1 - prob) over positions 0 .. p-1.
    residuals = np.ones(count + 1)

    for pos, grade in enumerate(targets):
        assert grade <= self.highest_score
        prob = self._get_satisfied_prob(grade)
        probs[pos] = prob
        if pos < self.k:
            contribution = residuals[pos] * prob / (1.0 + pos)
        else:
            contribution = 0.0
        cumulative[pos + 1] = cumulative[pos] + contribution
        residuals[pos + 1] = residuals[pos] * (1.0 - prob)

    for hi in range(min(count, self.k)):
        p_hi = probs[hi]
        for lo in range(hi + 1, count):
            p_lo = probs[lo]
            if p_hi == p_lo:
                # Swapping items with the same probability changes nothing.
                continue
            ratio = (1.0 - p_lo) / (1.0 - p_hi)
            # Change at position ``hi`` itself.
            at_hi = (p_lo - p_hi) * residuals[hi] / (hi + 1.0)
            # Positions strictly between: their residual is rescaled.
            between = (cumulative[lo] - cumulative[hi + 1]) * (ratio - 1.0)
            # Change at position ``lo`` (only counted inside the cutoff).
            if lo < self.k:
                at_lo = ((residuals[lo] / (lo + 1.0))
                         * (p_hi * ratio - p_lo))
            else:
                at_lo = 0.0
            deltas[hi, lo] = at_hi + between + at_lo
    return deltas
def calc_swap_deltas(self, qid, targets):
    """Compute average-precision changes for every pairwise swap.

    An item is relevant when its target is ``>= self.cutoff``. The metric
    is the sum of precision values at relevant positions among the first
    ``self.k`` items, divided by the number of relevant items among those
    positions (0.0 when there are none).

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` (``i < j`` and ``i < self.k``) is the metric after
        swapping positions ``i`` and ``j`` minus the current metric. Pairs
        with the same relevance keep a delta of zero.
    """
    n = len(targets)
    deltas = np.zeros((n, n))
    top = min(n, self.k)

    # Metric of the ranking as given.
    base_hits = 0
    base_sum = 0.0
    for pos in range(top):
        if targets[pos] >= self.cutoff:
            base_hits += 1
            base_sum += base_hits / (pos + 1.0)
    base_metric = (base_sum / base_hits) if base_hits > 0 else 0.0

    hits_upto_i = 0
    for i in range(top):
        if targets[i] >= self.cutoff:
            # Moving a relevant item down from position i.
            hits_upto_i += 1
            running_hits = hits_upto_i
            lost = hits_upto_i / (i + 1.0)
            for j in range(i + 1, n):
                inside = j < self.k
                if targets[j] >= self.cutoff:
                    # Relevant items in between lose one hit above them.
                    if inside:
                        running_hits += 1
                        lost += 1 / (j + 1.0)
                    continue
                gained = (running_hits / (j + 1.0)) if inside else 0.0
                swapped_sum = base_sum + gained - lost
                # Swapping outside the cutoff removes one relevant item.
                swapped_hits = base_hits if inside else (base_hits - 1)
                if swapped_hits > 0:
                    swapped_metric = swapped_sum / swapped_hits
                else:
                    swapped_metric = 0.0
                deltas[i, j] = swapped_metric - base_metric
        else:
            # Moving a relevant item up into position i.
            running_hits = hits_upto_i
            gained = (hits_upto_i + 1) / (i + 1.0)
            for j in range(i + 1, n):
                if not targets[j] >= self.cutoff:
                    continue
                inside = j < self.k
                lost = ((running_hits + 1) / (j + 1.0)) if inside else 0.0
                swapped_sum = base_sum + gained - lost
                # Pulling an item in from outside the cutoff adds one.
                swapped_hits = base_hits if inside else (base_hits + 1)
                if swapped_hits > 0:
                    swapped_metric = swapped_sum / swapped_hits
                else:
                    swapped_metric = 0.0
                deltas[i, j] = swapped_metric - base_metric
                if inside:
                    # Later swaps see this relevant item gain one hit.
                    running_hits += 1
                    gained += 1 / (j + 1.0)
    return deltas
def calc_swap_deltas(self, qid, targets):
    """Compute average-precision changes for every pairwise swap.

    An item is relevant when its target is ``>= self.cutoff``. The metric
    is the sum of precision values at relevant positions among the first
    ``self.k`` items, divided by the number of relevant items among those
    positions (0.0 when there are none).

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` (``i < j`` and ``i < self.k``) is the metric after
        swapping positions ``i`` and ``j`` minus the current metric. Pairs
        with the same relevance keep a delta of zero.
    """
    n = len(targets)
    deltas = np.zeros((n, n))
    top = min(n, self.k)

    # Metric of the ranking as given.
    base_hits = 0
    base_sum = 0.0
    for pos in range(top):
        if targets[pos] >= self.cutoff:
            base_hits += 1
            base_sum += base_hits / (pos + 1.0)
    base_metric = (base_sum / base_hits) if base_hits > 0 else 0.0

    hits_upto_i = 0
    for i in range(top):
        if targets[i] >= self.cutoff:
            # Moving a relevant item down from position i.
            hits_upto_i += 1
            running_hits = hits_upto_i
            lost = hits_upto_i / (i + 1.0)
            for j in range(i + 1, n):
                inside = j < self.k
                if targets[j] >= self.cutoff:
                    # Relevant items in between lose one hit above them.
                    if inside:
                        running_hits += 1
                        lost += 1 / (j + 1.0)
                    continue
                gained = (running_hits / (j + 1.0)) if inside else 0.0
                swapped_sum = base_sum + gained - lost
                # Swapping outside the cutoff removes one relevant item.
                swapped_hits = base_hits if inside else (base_hits - 1)
                if swapped_hits > 0:
                    swapped_metric = swapped_sum / swapped_hits
                else:
                    swapped_metric = 0.0
                deltas[i, j] = swapped_metric - base_metric
        else:
            # Moving a relevant item up into position i.
            running_hits = hits_upto_i
            gained = (hits_upto_i + 1) / (i + 1.0)
            for j in range(i + 1, n):
                if not targets[j] >= self.cutoff:
                    continue
                inside = j < self.k
                lost = ((running_hits + 1) / (j + 1.0)) if inside else 0.0
                swapped_sum = base_sum + gained - lost
                # Pulling an item in from outside the cutoff adds one.
                swapped_hits = base_hits if inside else (base_hits + 1)
                if swapped_hits > 0:
                    swapped_metric = swapped_sum / swapped_hits
                else:
                    swapped_metric = 0.0
                deltas[i, j] = swapped_metric - base_metric
                if inside:
                    # Later swaps see this relevant item gain one hit.
                    running_hits += 1
                    gained += 1 / (j + 1.0)
    return deltas
def calc_swap_deltas(self, qid, targets, coeff=1.0):
    """Compute gain/discount swap deltas for every pair of positions.

    Swapping positions ``i`` and ``j`` changes the metric by
    ``coeff * (gain(i) - gain(j)) * (discount(j) - discount(i))``.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.
    coeff : float, optional (default=1.0)
        Multiplier applied to every delta.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` is filled for ``i < j`` and ``i < self.k``; all
        other entries are zero.
    """
    n_targets = len(targets)
    deltas = np.zeros((n_targets, n_targets))
    n_top = min(n_targets, self.k)
    if n_top <= 0 or n_targets < 2:
        # No pair to fill in; skip the helper calls entirely.
        return deltas

    # Gains and discounts depend only on the item / position, so evaluate
    # each once instead of once per pair inside the O(n^2) loop.
    gains = [self._gain_fn(t) for t in targets]
    discounts = [self._get_discount(pos) for pos in range(n_targets)]

    for i in range(n_top):
        gain_i = gains[i]
        discount_i = discounts[i]
        for j in range(i + 1, n_targets):
            deltas[i, j] = (coeff * (gain_i - gains[j])
                            * (discounts[j] - discount_i))
    return deltas
def calc_swap_deltas(self, qid, targets, coeff=1.0):
    """Compute gain/discount swap deltas for every pair of positions.

    Swapping positions ``i`` and ``j`` changes the metric by
    ``coeff * (gain(i) - gain(j)) * (discount(j) - discount(i))``.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.
    coeff : float, optional (default=1.0)
        Multiplier applied to every delta.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        ``deltas[i, j]`` is filled for ``i < j`` and ``i < self.k``; all
        other entries are zero.
    """
    n_targets = len(targets)
    deltas = np.zeros((n_targets, n_targets))
    n_top = min(n_targets, self.k)
    if n_top <= 0 or n_targets < 2:
        # No pair to fill in; skip the helper calls entirely.
        return deltas

    # Gains and discounts depend only on the item / position, so evaluate
    # each once instead of once per pair inside the O(n^2) loop.
    gains = [self._gain_fn(t) for t in targets]
    discounts = [self._get_discount(pos) for pos in range(n_targets)]

    for i in range(n_top):
        gain_i = gains[i]
        discount_i = discounts[i]
        for j in range(i + 1, n_targets):
            deltas[i, j] = (coeff * (gain_i - gains[j])
                            * (discounts[j] - discount_i))
    return deltas
def calc_random_ev(self, qid, targets):
    """Estimate the expected value of the metric on randomly ordered targets.

    The estimate is the mean of ``self.evaluate`` over 100 successive
    in-place shuffles of a private copy of ``targets`` (using the global
    ``np.random`` state). Not implemented for non-LTR metrics.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`. The caller's object is left untouched.

    Returns
    -------
    float
        Expected value of the metric from random ordering of targets.
    """
    shuffled = np.copy(targets)

    def _score_after_shuffle():
        # Each trial reshuffles the array left by the previous trial.
        np.random.shuffle(shuffled)
        return self.evaluate(qid, shuffled)

    return np.mean([_score_after_shuffle() for _ in range(100)])
def _dump_svmlight(X, y, f, one_based, comment, query_id):
    """Write ``X`` and multi-label ``y`` to ``f`` in svmlight text format.

    Every row becomes ``"<labels> <index>:<value> ..."`` encoded as ASCII,
    where ``<labels>`` is the comma-joined integer labels of ``y[i]`` and
    only non-zero features are written.

    Parameters
    ----------
    X : ndarray or sparse matrix
        Feature matrix; objects with a ``tocsr`` attribute are read through
        their ``indptr`` / ``indices`` / ``data`` arrays.
    y : sequence of iterables
        ``y[i]`` holds the labels of row ``i``.
    f : file-like
        Binary stream to write to.
    one_based : bool or int
        Added to every column index before writing.
    comment, query_id : object
        Accepted for signature compatibility; unused here.
    """
    sparse_input = hasattr(X, "tocsr")
    # Integer matrices are written without a fractional part.
    value_pattern = u("%d:%d") if X.dtype.kind == 'i' else u("%d:%.16g")
    line_pattern = u("%s") + u(" %s\n")

    for i in range(X.shape[0]):
        if sparse_input:
            start, stop = X.indptr[i], X.indptr[i + 1]
            row = zip(X.indices[start:stop], X.data[start:stop])
        else:
            nonzero = X[i] != 0
            row = zip(np.where(nonzero)[0], X[i, nonzero])

        features = " ".join(value_pattern % (col + one_based, value)
                            for col, value in row)
        labels = ",".join(str(int(label)) for label in y[i])
        f.write((line_pattern % (labels, features)).encode('ascii'))
def calc_random_ev(self, qid, targets):
    """Estimate the expected value of the metric on randomly ordered targets.

    The estimate is the mean of ``self.evaluate`` over 100 successive
    in-place shuffles of a private copy of ``targets`` (using the global
    ``np.random`` state). Not implemented for non-LTR metrics.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`. The caller's object is left untouched.

    Returns
    -------
    float
        Expected value of the metric from random ordering of targets.
    """
    shuffled = np.copy(targets)

    def _score_after_shuffle():
        # Each trial reshuffles the array left by the previous trial.
        np.random.shuffle(shuffled)
        return self.evaluate(qid, shuffled)

    return np.mean([_score_after_shuffle() for _ in range(100)])
def evaluate(self, qid, targets):
    """Return the average precision over the first ``self.k`` positions.

    An item is relevant when its target is ``>= self.cutoff``. Precision is
    accumulated at every relevant position and divided by the number of
    relevant items found within the cutoff.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        The average precision, or 0.0 when no inspected item is relevant.
    """
    depth = min(len(targets), self.k)
    hits = 0
    precision_sum = 0.0
    for pos in range(depth):
        if not targets[pos] >= self.cutoff:
            continue
        hits += 1
        precision_sum += hits / (pos + 1.0)
    if hits > 0:
        return precision_sum / hits
    return 0.0
def evaluate(self, qid, targets):
    """Return the average precision over the first ``self.k`` positions.

    An item is relevant when its target is ``>= self.cutoff``. Precision is
    accumulated at every relevant position and divided by the number of
    relevant items found within the cutoff.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        The average precision, or 0.0 when no inspected item is relevant.
    """
    depth = min(len(targets), self.k)
    hits = 0
    precision_sum = 0.0
    for pos in range(depth):
        if not targets[pos] >= self.cutoff:
            continue
        hits += 1
        precision_sum += hits / (pos + 1.0)
    if hits > 0:
        return precision_sum / hits
    return 0.0
def _partial_dependence_recursion(est, grid, target_variables):
    """Accumulate per-tree partial dependence over all boosting stages.

    Parameters
    ----------
    est : object
        Fitted ensemble exposing a 2-D ``estimators_`` array (indexed as
        ``[stage, k]``, each element having a ``tree_`` attribute) and a
        ``learning_rate``.
    grid : array-like
        Grid points; converted to a C-ordered array of ``DTYPE``.
    target_variables : array-like
        Forwarded unchanged to ``_partial_dependence_tree``.

    Returns
    -------
    averaged_predictions : ndarray of shape (n_trees_per_stage, n_points)
        Float64 buffer that ``_partial_dependence_tree`` is handed one row
        of (``averaged_predictions[k]``) per tree.
    """
    # grid needs to be DTYPE
    grid = np.asarray(grid, dtype=DTYPE, order='C')
    ensemble_shape = est.estimators_.shape
    n_trees_per_stage = ensemble_shape[1]
    n_stages = ensemble_shape[0]
    shrinkage = est.learning_rate

    averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]),
                                    dtype=np.float64, order='C')
    for stage in range(n_stages):
        for k in range(n_trees_per_stage):
            _partial_dependence_tree(est.estimators_[stage, k].tree_, grid,
                                     target_variables, shrinkage,
                                     averaged_predictions[k])
    return averaged_predictions
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``X``.

    The grid is a cartesian product between the columns of Z. The ith column
    of Z consists in ``grid_resolution`` equally-spaced points between the
    percentiles of the ith column of X. If ``grid_resolution`` is bigger
    than the number of unique values in the ith column of X, then those
    unique values will be used instead.

    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme values of
        the grid.
    grid_resolution : int
        The number of equally spaced points to be placed on the grid for a
        given column.

    Returns
    -------
    grid : ndarray, shape=(n_points, X.shape[1])
        All data points on the grid. n_points is always
        ``<= grid_resolution ** X.shape[1]``.
    Z : list of ndarray
        The values with which the grid has been created. The ndarrays may
        be of different shape: either (grid_resolution,) or
        (n_unique_values,).

    Raises
    ------
    ValueError
        If ``percentiles`` is not a sequence of two values in [0, 1] with
        the first strictly smaller than the second, if ``grid_resolution``
        is not greater than 1, or if the empirical percentiles of a column
        are too close to each other.
    """
    # Validate explicitly rather than with ``assert``: assertions are
    # stripped under ``python -O`` and would silently skip this check.
    try:
        n_percentiles = len(percentiles)
    except TypeError:
        raise ValueError('percentiles must be a sequence of 2 elements.')
    if n_percentiles != 2:
        raise ValueError('percentiles must be a sequence of 2 elements.')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentiles values must be in [0, 1].')
    if percentiles[0] >= percentiles[1]:
        raise ValueError('percentiles[0] must be strictly less '
                         'than percentiles[1].')
    if grid_resolution <= 1:
        raise ValueError('grid_resolution must be strictly greater than 1.')

    values = []
    # The quantiles of X do not depend on the feature being processed, so
    # compute them at most once, and only if some feature needs them.
    emp_percentiles = None
    for feature in range(X.shape[1]):
        uniques = np.unique(X[:, feature])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution use unique vals
            axis = uniques
        else:
            # create axis based on percentiles and grid resolution
            if emp_percentiles is None:
                emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
            lower = emp_percentiles[0, feature]
            upper = emp_percentiles[1, feature]
            if np.allclose(lower, upper):
                raise ValueError('percentiles are too close to each other, '
                                 'unable to build the grid.')
            axis = np.linspace(lower, upper, num=grid_resolution,
                               endpoint=True)
        values.append(axis)

    return cartesian(values), values
def test_numeric_stability():
    """Check that two-bin discretisation is unaffected by input scale."""
    base = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    for exponent in range(1, 9):
        scaled = base / 10**exponent
        discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal')
        assert_array_equal(expected, discretizer.fit_transform(scaled))
def evaluate(self, qid, targets):
    """Return the fraction of the first ``self.k`` positions that are relevant.

    An item is relevant when its target is ``>= self.cutoff``. The count is
    always divided by ``self.k``, even when fewer than ``self.k`` targets
    are supplied.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        Number of relevant items among the inspected positions divided by
        ``self.k``.
    """
    hits = 0.
    for pos, grade in enumerate(targets):
        if pos >= self.k:
            break
        if grade >= self.cutoff:
            hits += 1
    return hits / self.k
def test_numeric_stability():
    """Check that two-bin discretisation is unaffected by input scale."""
    base = np.array([2., 4., 6., 8., 10.]).reshape(-1, 1)
    expected = np.array([0, 0, 1, 1, 1]).reshape(-1, 1)

    # Test up to discretizing nano units
    for exponent in range(1, 9):
        scaled = base / 10**exponent
        discretizer = KBinsDiscretizer(n_bins=2, encode='ordinal')
        assert_array_equal(expected, discretizer.fit_transform(scaled))
def evaluate(self, qid, targets):
    """Return average precision normalised by all relevant items.

    An item is relevant when its target is ``>= self.cutoff``. Precision is
    accumulated only at relevant positions below ``self.k``, but the
    divisor counts every relevant item in ``targets``, including those
    beyond the cutoff.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        The normalised precision sum, or 0.0 when nothing is relevant.
    """
    hits = 0
    precision_sum = 0.0
    for pos, grade in enumerate(targets):
        if not grade >= self.cutoff:
            continue
        hits += 1
        if pos < self.k:
            precision_sum += hits / (pos + 1.0)
    if hits > 0:
        return precision_sum / hits
    return 0.0
def _calc_lambdas_deltas(self, qid, y, y_pred):
    """Accumulate pairwise lambdas and their derivatives for one query.

    Samples are ordered with ``get_sorted_y_positions``; every pair
    ``(i, j)`` with ``i < j``, differing targets and a non-zero swap delta
    from ``self.metric.calc_swap_deltas`` contributes a logistic-weighted
    term.

    Parameters
    ----------
    qid : object
        Query identifier, forwarded to the metric.
    y : ndarray of shape (n_samples,)
        True targets.
    y_pred : ndarray of shape (n_samples,)
        Current predictions.

    Returns
    -------
    lambdas : ndarray of shape (n_samples,)
    deltas : ndarray of shape (n_samples,)
    """
    ns = y.shape[0]
    positions = get_sorted_y_positions(y, y_pred, check=False)
    actual = y[positions]
    swap_deltas = self.metric.calc_swap_deltas(qid, actual)

    # Only the first max_k ranked items can act as the upper half of a pair.
    max_k = self.metric.max_k()
    if max_k is None or ns < max_k:
        max_k = ns

    lambdas = np.zeros(ns)
    deltas = np.zeros(ns)

    for i in range(max_k):
        for j in range(i + 1, ns):
            if actual[i] == actual[j]:
                continue
            delta_metric = swap_deltas[i, j]
            if delta_metric == 0.0:
                continue

            a, b = positions[i], positions[j]
            # invariant: y_pred[a] >= y_pred[b]
            if actual[i] < actual[j]:
                # Mis-ordered pair: the lower-ranked sample should move up.
                assert delta_metric > 0.0
                promote, demote = b, a
                weight = delta_metric
            else:
                assert delta_metric < 0.0
                promote, demote = a, b
                weight = -delta_metric

            logistic = scipy.special.expit(y_pred[demote] - y_pred[promote])
            step = logistic * weight
            lambdas[promote] += step
            lambdas[demote] -= step

            curvature = (1 - logistic) * step
            deltas[a] += curvature
            deltas[b] += curvature

    return lambdas, deltas
def _calc_lambdas_deltas(self, qid, y, y_pred):
    """Accumulate pairwise lambdas and their derivatives for one query.

    Samples are ordered with ``get_sorted_y_positions``; every pair
    ``(i, j)`` with ``i < j``, differing targets and a non-zero swap delta
    from ``self.metric.calc_swap_deltas`` contributes a logistic-weighted
    term.

    Parameters
    ----------
    qid : object
        Query identifier, forwarded to the metric.
    y : ndarray of shape (n_samples,)
        True targets.
    y_pred : ndarray of shape (n_samples,)
        Current predictions.

    Returns
    -------
    lambdas : ndarray of shape (n_samples,)
    deltas : ndarray of shape (n_samples,)
    """
    ns = y.shape[0]
    positions = get_sorted_y_positions(y, y_pred, check=False)
    actual = y[positions]
    swap_deltas = self.metric.calc_swap_deltas(qid, actual)

    # Only the first max_k ranked items can act as the upper half of a pair.
    max_k = self.metric.max_k()
    if max_k is None or ns < max_k:
        max_k = ns

    lambdas = np.zeros(ns)
    deltas = np.zeros(ns)

    for i in range(max_k):
        for j in range(i + 1, ns):
            if actual[i] == actual[j]:
                continue
            delta_metric = swap_deltas[i, j]
            if delta_metric == 0.0:
                continue

            a, b = positions[i], positions[j]
            # invariant: y_pred[a] >= y_pred[b]
            if actual[i] < actual[j]:
                # Mis-ordered pair: the lower-ranked sample should move up.
                assert delta_metric > 0.0
                promote, demote = b, a
                weight = delta_metric
            else:
                assert delta_metric < 0.0
                promote, demote = a, b
                weight = -delta_metric

            logistic = scipy.special.expit(y_pred[demote] - y_pred[promote])
            step = logistic * weight
            lambdas[promote] += step
            lambdas[demote] -= step

            curvature = (1 - logistic) * step
            deltas[a] += curvature
            deltas[b] += curvature

    return lambdas, deltas
def calc_swap_deltas(self, qid, targets):
    """Compute swap deltas between relevant and irrelevant positions.

    An item is relevant when its target is ``>= self.cutoff``. Swapping an
    irrelevant item at ``i`` with a relevant item at ``j > i`` yields
    ``(j - i) / (n_relevant * n_irrelevant)``; the opposite arrangement
    yields the negated value. Pairs with equal relevance stay at zero.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        Upper-triangular matrix of deltas; all zeros when every item or no
        item is relevant.
    """
    n = len(targets)
    deltas = np.zeros((n, n))
    relevant = np.array(targets) >= self.cutoff
    n_relevant = int(np.count_nonzero(relevant))
    if n_relevant == 0 or n_relevant == n:
        # Only one class present: no swap can change anything.
        return deltas

    scale = n_relevant * float(n - n_relevant)
    for hi in range(n):
        hi_relevant = relevant[hi]
        for lo in range(hi + 1, n):
            if hi_relevant == relevant[lo]:
                continue
            if hi_relevant:
                deltas[hi, lo] = (hi - lo) / scale
            else:
                deltas[hi, lo] = (lo - hi) / scale
    return deltas
def calc_swap_deltas(self, qid, targets):
    """Compute swap deltas between relevant and irrelevant positions.

    An item is relevant when its target is ``>= self.cutoff``. Swapping an
    irrelevant item at ``i`` with a relevant item at ``j > i`` yields
    ``(j - i) / (n_relevant * n_irrelevant)``; the opposite arrangement
    yields the negated value. Pairs with equal relevance stay at zero.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    deltas : ndarray of shape (n_targets, n_targets)
        Upper-triangular matrix of deltas; all zeros when every item or no
        item is relevant.
    """
    n = len(targets)
    deltas = np.zeros((n, n))
    relevant = np.array(targets) >= self.cutoff
    n_relevant = int(np.count_nonzero(relevant))
    if n_relevant == 0 or n_relevant == n:
        # Only one class present: no swap can change anything.
        return deltas

    scale = n_relevant * float(n - n_relevant)
    for hi in range(n):
        hi_relevant = relevant[hi]
        for lo in range(hi + 1, n):
            if hi_relevant == relevant[lo]:
                continue
            if hi_relevant:
                deltas[hi, lo] = (hi - lo) / scale
            else:
                deltas[hi, lo] = (lo - hi) / scale
    return deltas
def transform(self, X):
    '''
    Compute kernels from X to :attr:`features_`.

    Parameters
    ----------
    X : list of arrays or :class:`skl_groups.features.Features`
        The bags to compute "from". Must have same dimension as
        :attr:`features_`.

    Returns
    -------
    K : array of shape ``[len(X), len(features_)]``
        The kernel evaluations from X to :attr:`features_`; each entry is
        the mean of the point-wise kernel values between one bag of X and
        one bag of :attr:`features_`.

    Raises
    ------
    ValueError
        If the dimension of ``X`` differs from that of :attr:`features_`.
    '''
    X = as_features(X, stack=True, bare=True)
    Y = self.features_

    if X.dim != Y.dim:
        raise ValueError(
            "MMK transform got dimension {} but had {} at fit".format(
                X.dim, Y.dim))

    pointwise = pairwise_kernels(X.stacked_features, Y.stacked_features,
                                 metric=self.kernel,
                                 filter_params=True,
                                 **self._get_kernel_params())

    # TODO: is there a way to do this without a Python loop?
    n_x, n_y = len(X), len(Y)
    x_bounds, y_bounds = X._boundaries, Y._boundaries
    K = np.empty((n_x, n_y))
    for i in range(n_x):
        # Rows of the point-wise kernel belonging to the i-th bag of X.
        bag_rows = pointwise[x_bounds[i]:x_bounds[i + 1]]
        for j in range(n_y):
            K[i, j] = bag_rows[:, y_bounds[j]:y_bounds[j + 1]].mean()
    return K
def test_mmk():
    """Check MeanMapKernel against explicit means of pairwise kernels."""
    n_bags = 20
    gamma = 2.38
    bags = [np.random.normal(size=(np.random.randint(10, 100), 10))
            for _ in range(n_bags)]

    # fit_transform on all bags with the default kernel and a custom gamma
    res = MeanMapKernel(gamma=gamma).fit_transform(bags)
    for i in range(n_bags):
        for j in range(n_bags):
            exp = pairwise_kernels(bags[j], bags[i], metric='rbf',
                                   gamma=gamma)
            assert_almost_equal(res[i, j], exp.mean(),
                                err_msg="({} to {})".format(i, j))

    # fit on the first five bags, transform the last two
    first_query = 18
    res = MeanMapKernel(kernel='linear').fit(bags[:5]).transform(bags[-2:])
    for i in range(5):
        for j in range(first_query, n_bags):
            exp = pairwise_kernels(bags[j], bags[i], metric='linear')
            assert_almost_equal(res[j - first_query, i], exp.mean(),
                                err_msg="({} to {})".format(i, j))

    # fails on wrong dimension
    def transform_wrong_dimension():
        return MeanMapKernel().fit(bags).transform([np.random.randn(20, 8)])

    assert_raises(ValueError, transform_wrong_dimension)
def calc_swap_deltas(self, qid, targets):
    """Returns an upper triangular matrix of swap deltas.

    Each delta is obtained by brute force: the two targets are exchanged in
    place, the metric is re-evaluated, and the exchange is undone.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`. Must support item assignment; it is temporarily
        modified and restored before returning.

    Returns
    -------
    deltas = array_like of shape = [n_targets, n_targets]
        Upper triangular matrix, where ``deltas[i, j]`` is the change in
        the metric from swapping ``targets[i]`` with ``targets[j]``. Rows
        at or beyond ``self.max_k()`` (when it is not None) stay zero.
    """
    size = len(targets)
    deltas = np.zeros((size, size))
    baseline = self.evaluate(qid, targets)

    limit = self.max_k()
    if limit is None or size < limit:
        limit = size

    for upper in range(limit):
        for lower in range(upper + 1, size):
            targets[upper], targets[lower] = targets[lower], targets[upper]
            deltas[upper, lower] = self.evaluate(qid, targets) - baseline
            # Undo the swap so the next pair starts from the original order.
            targets[upper], targets[lower] = targets[lower], targets[upper]

    return deltas
def calc_swap_deltas(self, qid, targets):
    """Returns an upper triangular matrix of swap deltas.

    Each delta is obtained by brute force: the two targets are exchanged in
    place, the metric is re-evaluated, and the exchange is undone.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`. Must support item assignment; it is temporarily
        modified and restored before returning.

    Returns
    -------
    deltas = array_like of shape = [n_targets, n_targets]
        Upper triangular matrix, where ``deltas[i, j]`` is the change in
        the metric from swapping ``targets[i]`` with ``targets[j]``. Rows
        at or beyond ``self.max_k()`` (when it is not None) stay zero.
    """
    size = len(targets)
    deltas = np.zeros((size, size))
    baseline = self.evaluate(qid, targets)

    limit = self.max_k()
    if limit is None or size < limit:
        limit = size

    for upper in range(limit):
        for lower in range(upper + 1, size):
            targets[upper], targets[lower] = targets[lower], targets[upper]
            deltas[upper, lower] = self.evaluate(qid, targets) - baseline
            # Undo the swap so the next pair starts from the original order.
            targets[upper], targets[lower] = targets[lower], targets[upper]

    return deltas
def transform(self, X):
    '''
    Compute kernels from X to :attr:`features_`.

    Parameters
    ----------
    X : list of arrays or :class:`skl_groups.features.Features`
        The bags to compute "from". Must have same dimension as
        :attr:`features_`.

    Returns
    -------
    K : array of shape ``[len(X), len(features_)]``
        The kernel evaluations from X to :attr:`features_`; each entry is
        the mean of the point-wise kernel values between one bag of X and
        one bag of :attr:`features_`.

    Raises
    ------
    ValueError
        If the dimension of ``X`` differs from that of :attr:`features_`.
    '''
    X = as_features(X, stack=True, bare=True)
    Y = self.features_

    if X.dim != Y.dim:
        raise ValueError("MMK transform got dimension {} but had {} at fit"
                         .format(X.dim, Y.dim))

    pointwise = pairwise_kernels(X.stacked_features, Y.stacked_features,
                                 metric=self.kernel,
                                 filter_params=True,
                                 **self._get_kernel_params())

    # TODO: is there a way to do this without a Python loop?
    n_x, n_y = len(X), len(Y)
    x_bounds, y_bounds = X._boundaries, Y._boundaries
    K = np.empty((n_x, n_y))
    for i in range(n_x):
        # Rows of the point-wise kernel belonging to the i-th bag of X.
        bag_rows = pointwise[x_bounds[i]:x_bounds[i + 1]]
        for j in range(n_y):
            K[i, j] = bag_rows[:, y_bounds[j]:y_bounds[j + 1]].mean()
    return K
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Generate a grid of points based on the ``percentiles`` of ``X``.

    For each column, an axis is built either from the column's unique
    values (when there are fewer than ``grid_resolution`` of them) or from
    ``grid_resolution`` equally spaced points between the column's
    empirical ``percentiles``. The axes are then combined by ``cartesian``.

    Parameters
    ----------
    X : ndarray
        The data
    percentiles : tuple of floats
        The percentiles which are used to construct the extreme
        values of the grid axes.
    grid_resolution : int
        The number of equally spaced points that are placed
        on the grid.

    Returns
    -------
    grid : ndarray
        The result of ``cartesian(axes)``.
    axes : seq of ndarray
        The axes with which the grid has been created.

    Raises
    ------
    ValueError
        If ``percentiles`` does not have two entries or any entry lies
        outside [0, 1].
    """
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    for value in percentiles:
        if not 0. <= value <= 1.:
            raise ValueError('percentile values must be in [0, 1]')

    emp_percentiles = mquantiles(X, prob=percentiles, axis=0)

    def _axis_for(col):
        uniques = np.unique(X[:, col])
        if uniques.shape[0] < grid_resolution:
            # feature has low resolution use unique vals
            return uniques
        # create axis based on percentiles and grid resolution
        return np.linspace(emp_percentiles[0, col], emp_percentiles[1, col],
                           num=grid_resolution, endpoint=True)

    axes = [_axis_for(col) for col in range(X.shape[1])]
    return cartesian(axes), axes
def evaluate(self, qid, targets):
    """Return the normalised difference of concordant and discordant pairs.

    For every pair of positions ``i < j`` whose targets differ by at least
    ``_EPS``, the pair is concordant when the earlier target is larger and
    discordant otherwise. The difference of the two counts is divided by
    the total number of pairs, ``n * (n - 1) / 2``.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        The pair statistic, or 0.0 when fewer than two targets are given.
    """
    n = len(targets)
    if n < 2:
        return 0.0

    concordant = 0
    discordant = 0
    for i, first in enumerate(targets):
        for j in range(i + 1, n):
            second = targets[j]
            if abs(first - second) < _EPS:
                # Ties contribute to neither count.
                continue
            # Position i always ranks above j here, so the pair agrees
            # with the ranking exactly when the earlier target is larger.
            if first > second:
                concordant += 1
            else:
                discordant += 1

    n_pairs = n * (n - 1) / 2.0
    return (concordant - discordant) / n_pairs
def evaluate(self, qid, targets):
    """Return the normalised difference of concordant and discordant pairs.

    For every pair of positions ``i < j`` whose targets differ by at least
    ``_EPS``, the pair is concordant when the earlier target is larger and
    discordant otherwise. The difference of the two counts is divided by
    the total number of pairs, ``n * (n - 1) / 2``.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades in ranked order.

    Returns
    -------
    float
        The pair statistic, or 0.0 when fewer than two targets are given.
    """
    n = len(targets)
    if n < 2:
        return 0.0

    concordant = 0
    discordant = 0
    for i, first in enumerate(targets):
        for j in range(i + 1, n):
            second = targets[j]
            if abs(first - second) < _EPS:
                # Ties contribute to neither count.
                continue
            # Position i always ranks above j here, so the pair agrees
            # with the ranking exactly when the earlier target is larger.
            if first > second:
                concordant += 1
            else:
                discordant += 1

    n_pairs = n * (n - 1) / 2.0
    return (concordant - discordant) / n_pairs
def _pretty_print_score(self, score):
    """Format ``score`` as fixed-width text with four decimals.

    Parameters
    ----------
    score : ndarray or numpy scalar
        Object exposing ``size``. A single value is rendered in a
        12-character field; otherwise each ``score[i]`` is rendered in an
        8-character field and the fields are concatenated.

    Returns
    -------
    str
    """
    n_values = score.size
    if n_values == 1:
        return '%12.4f' % score
    fields = ['%8.4f' % score[idx] for idx in range(n_values)]
    return ''.join(fields)
def plot_partial_dependence(gbrt, X, features, feature_names=None,
                            label=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), n_jobs=1,
                            verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots for ``features``.

    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour
    plots.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    gbrt : BaseGradientBoosting
        A fitted gradient boosting model. Only ``estimators_``,
        ``n_features`` and (optionally) ``classes_`` are read here; the
        isinstance check is commented out.
    X : array-like, shape=(n_samples, n_features)
        The data on which ``gbrt`` was trained.
    features : seq of tuples or ints
        If seq[i] is an int or a tuple with one int value, a one-way
        PDP is created; if seq[i] is a tuple of two ints, a two-way
        PDP is created. Strings are looked up in ``feature_names``.
    feature_names : seq of str
        Name of each feature; feature_names[i] holds
        the name of the feature with index i. Defaults to the string
        form of each feature index.
    label : object
        The class label for which the PDPs should be computed.
        Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``.
    n_cols : int
        The number of columns in the grid plot (default: 3).
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    grid_resolution : int, default=100
        The number of equally spaced points on the axes.
    n_jobs : int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to 1.
    verbose : int
        Verbose output during PD computations. Defaults to 0.
    ax : Matplotlib axis object, default None
        An axis object whose figure is cleared and reused for the plots.
    line_kw : dict
        Dict with keywords passed to the ``ax.plot`` call.
        For one-way partial dependence plots. Defaults to
        ``{'color': 'green'}``.
    contour_kw : dict
        Dict with keywords passed to the ``ax.contourf`` call.
        For two-way partial dependence plots.
    fig_kw : dict
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.

    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.

    Raises
    ------
    ValueError
        If the model has no fitted stages, ``label`` is missing or unknown
        for a multi-class model, ``X`` has the wrong number of columns, or
        an entry of ``features`` is malformed or out of range.

    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    # matplotlib is imported lazily so the module loads without it
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # NOTE(review): the type check below is disabled in SOURCE.
    # if not isinstance(gbrt, BaseGradientBoosting):
    #     raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
    if gbrt.estimators_.shape[0] == 0:
        raise ValueError('Call %s.fit before partial_dependence' %
                         gbrt.__class__.__name__)

    # set label_idx for multi-class GBRT
    if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2:
        if label is None:
            raise ValueError('label is not given for multi-class PDP')
        label_idx = np.searchsorted(gbrt.classes_, label)
        if gbrt.classes_[label_idx] != label:
            raise ValueError('label %s not in ``gbrt.classes_``' % str(label))
    else:
        # regression and binary classification
        label_idx = 0

    X = check_array(X, dtype=DTYPE, order='C')
    if gbrt.n_features != X.shape[1]:
        raise ValueError('X.shape[1] does not match gbrt.n_features')

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    # convert feature_names to list
    if feature_names is None:
        # if not feature_names use fx indices as name
        feature_names = [str(i) for i in range(gbrt.n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        # map a feature given by name to its index; pass indices through
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return fx

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, ) + six.string_types):
            fxs = (fxs, )
        try:
            fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32)
        except TypeError:
            raise ValueError('features must be either int, str, or tuple '
                             'of int/str')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('target features must be either one or two')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            l = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                l.append(feature_names[i])
            names.append(l)
    except IndexError:
        raise ValueError('features[i] must be in [0, n_features) '
                         'but was %d' % i)

    # compute PD functions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(delayed(
        partial_dependence
    )(gbrt, fxs, X=X, grid_resolution=grid_resolution, percentiles=percentiles)
        for fxs in features)

    # get global min and max values of PD grouped by plot type
    # (keyed by the number of axes: 1 for one-way, 2 for two-way plots)
    pdp_lim = {}
    for pdp, axes in pd_result:
        min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max()
        n_fx = len(axes)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        # reuse the caller's figure, discarding whatever it contained
        fig = ax.get_figure()
        fig.clear()

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pdp, axes) in zip(count(), features, names, pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(axes) == 1:
            ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(axes) == 2
            XX, YY = np.meshgrid(axes[0], axes[1])
            Z = pdp[label_idx].reshape(list(map(np.size, axes))).T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75, **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        # remember the y-limits and restore them after drawing the ticks
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(axes) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(
                ax.transAxes, ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(axes) == 1:
            # share one y-range across all one-way plots
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4,
                        hspace=0.3)
    return fig, axs
def calc_lambdas_deltas(self, qid, targets, preds):
    """Returns the first and second (pseudo-)derivatives.

    Lambdas is the negative gradient of the loss with respect to the
    prediction. Deltas is the derivative of that.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`.
    preds : array_like of shape = [n_targets]
        List of predicted scores corresponding to the targets.

    Returns
    -------
    lambdas = array_like of shape = [n_targets]
    deltas = array_like of shape = [n_targets]
    """
    ns = targets.shape[0]
    positions = get_sorted_y_positions(targets, preds, check=False)
    actual = targets[positions]
    swap_deltas = self.calc_swap_deltas(qid, actual)

    # Only the first max_k ranked items can act as the upper half of a pair.
    max_k = self.max_k()
    if max_k is None or ns < max_k:
        max_k = ns

    lambdas = np.zeros(ns)
    deltas = np.zeros(ns)

    for i in range(max_k):
        for j in range(i + 1, ns):
            if actual[i] == actual[j]:
                continue
            delta_metric = swap_deltas[i, j]
            if delta_metric == 0.0:
                continue

            a, b = positions[i], positions[j]
            # invariant: preds[a] >= preds[b]
            if actual[i] < actual[j]:
                # Mis-ordered pair: the lower-ranked sample should move up.
                assert delta_metric > 0.0
                promote, demote = b, a
                weight = delta_metric
            else:
                assert delta_metric < 0.0
                promote, demote = a, b
                weight = -delta_metric

            logistic = scipy.special.expit(preds[demote] - preds[promote])
            step = logistic * weight
            lambdas[promote] += step
            lambdas[demote] -= step

            hess = (1 - logistic) * step
            deltas[a] += hess
            deltas[b] += hess

    return lambdas, deltas
def _pretty_print_score(self, score):
    """Format ``score`` as fixed-width text with four decimals.

    Parameters
    ----------
    score : ndarray or numpy scalar
        Object exposing ``size``. A single value is rendered in a
        12-character field; otherwise each ``score[i]`` is rendered in an
        8-character field and the fields are concatenated.

    Returns
    -------
    str
    """
    n_values = score.size
    if n_values == 1:
        return '%12.4f' % score
    fields = ['%8.4f' % score[idx] for idx in range(n_values)]
    return ''.join(fields)
def calc_random_ev(self, qid, targets):
    """Return the expected metric value under a uniformly random ordering.

    Computed in closed form as the sum of all gains times the sum of the
    discounts of the first ``min(self.k, n_targets)`` positions, divided by
    the number of targets.

    Parameters
    ----------
    qid : object
        Query identifier; not used by this implementation.
    targets : sequence
        Target grades (order is irrelevant).

    Returns
    -------
    float
        The expected value; 0.0 for an empty ``targets``.
    """
    n_targets = len(targets)
    if n_targets == 0:
        # Nothing to rank: avoid dividing by zero below.
        return 0.0
    total_gains = sum(self._gain_fn(t) for t in targets)
    total_discounts = sum(self._get_discount(i)
                          for i in range(min(self.k, n_targets)))
    return total_gains * total_discounts / n_targets
def _fit_stages(self, X, y, qids, y_pred, random_state,
                begin_at_stage=0, monitor=None):
    """Fit boosting stages ``begin_at_stage`` .. ``self.n_estimators - 1``.

    For every stage a subset of queries (``self.query_subsample``) and a
    subset of samples within each selected query (``self.subsample``) is
    drawn, one stage is fitted with ``self._fit_stage``, and the train /
    out-of-bag scores of every metric in ``self.metrics`` are recorded in
    ``self.train_score_`` and ``self.oob_improvement_``.

    Parameters
    ----------
    X, y, qids : array-like
        Training data, targets and query ids; ``qids`` is grouped with
        ``get_groups``.
    y_pred : ndarray
        Current predictions; replaced by the output of each stage.
    random_state : object
        Must provide ``shuffle``; used for both levels of subsampling.
    begin_at_stage : int, optional (default=0)
        Index of the first stage to fit.
    monitor : callable, optional
        Called as ``monitor(i, self, locals())`` after each stage; fitting
        stops early when it returns exactly ``True``.

    Returns
    -------
    int
        One more than the index of the last stage that was fitted, or
        ``begin_at_stage`` when no stage was fitted.
    """
    # NOTE: local names are part of the monitor contract (``locals()`` is
    # handed to the callback), so they are intentionally left unchanged.
    n_samples = X.shape[0]
    do_subsample = self.subsample < 1.0
    sample_weight = np.ones(n_samples, dtype=np.float64)

    n_queries = check_qids(qids)
    # ``np.object`` / ``np.bool`` were plain aliases of the builtins and
    # have been removed from recent NumPy; use the builtins directly.
    query_groups = np.array([(qid, a, b, np.arange(a, b))
                             for qid, a, b in get_groups(qids)],
                            dtype=object)
    assert n_queries == len(query_groups)
    do_query_oob = self.query_subsample < 1.0
    query_mask = np.ones(n_queries, dtype=bool)
    query_idx = np.arange(n_queries)
    q_inbag = max(1, int(self.query_subsample * n_queries))

    if self.verbose:
        verbose_reporter = _VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage, self.n_metrics,
                              monitor is not None)

    # Keep ``i`` bound even when the loop body never runs, so the final
    # ``return i + 1`` yields ``begin_at_stage`` instead of failing.
    i = begin_at_stage - 1
    for i in range(begin_at_stage, self.n_estimators):
        if do_query_oob:
            # draw the in-bag queries for this stage
            random_state.shuffle(query_idx)
            query_mask = np.zeros(n_queries, dtype=bool)
            query_mask[query_idx[:q_inbag]] = 1

        query_groups_to_use = query_groups[query_mask]
        sample_mask = np.zeros(n_samples, dtype=bool)
        for qid, a, b, sidx in query_groups_to_use:
            sidx_to_use = sidx
            if do_subsample:
                # The query owns samples a .. b-1, i.e. ``b - a`` of them;
                # the fraction must be taken of that count.
                query_samples_inbag = max(
                    1, int(self.subsample * (b - a)))
                random_state.shuffle(sidx)
                sidx_to_use = sidx[:query_samples_inbag]
            sample_mask[sidx_to_use] = 1

        if do_query_oob:
            # out-of-bag score before this stage, per metric
            old_oob_total_score = np.zeros(self.n_metrics)
            for midx, metric in enumerate(self.metrics):
                if metric.is_ltr_metric:
                    for qid, a, b, _ in query_groups[~query_mask]:
                        old_oob_total_score[midx] += metric.evaluate_preds(
                            qid, y[a:b], y_pred[a:b])
                else:
                    old_oob_total_score[midx] = metric.evaluate_preds(
                        None, y[~sample_mask], y_pred[~sample_mask])

        y_pred = self._fit_stage(i, X, y, qids, y_pred, sample_weight,
                                 sample_mask, query_groups_to_use,
                                 random_state)

        for midx, metric in enumerate(self.metrics):
            train_total_score, oob_total_score = 0.0, 0.0
            if metric.is_ltr_metric:
                for qidx, (qid, a, b, _) in enumerate(query_groups):
                    score = metric.evaluate_preds(
                        qid, y[a:b], y_pred[a:b])
                    if query_mask[qidx]:
                        train_total_score += score
                    else:
                        oob_total_score += score
            else:
                train_total_score = metric.evaluate_preds(
                    None, y[sample_mask], y_pred[sample_mask])
                oob_total_score = metric.evaluate_preds(
                    None, y[~sample_mask], y_pred[~sample_mask])

            # LTR metrics are averaged per query; others are used as-is.
            train_normalizer = q_inbag if metric.is_ltr_metric else 1.0
            oob_normalizer = \
                n_queries - q_inbag if metric.is_ltr_metric else 1.0
            self.train_score_[i, midx] = \
                train_total_score / train_normalizer
            if do_query_oob:
                # guard against an empty out-of-bag set
                if q_inbag < n_queries:
                    self.oob_improvement_[i, midx] = \
                        (oob_total_score - old_oob_total_score[midx]) \
                        / oob_normalizer

        early_stop = False
        monitor_output = None
        if monitor is not None:
            monitor_output = monitor(i, self, locals())
            if monitor_output is True:
                early_stop = True

        if self.verbose > 0:
            verbose_reporter.update(i, self, monitor_output)

        if early_stop:
            break

    return i + 1
def _exact_partial_dependence(est, target_variables, grid, X, output=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X`` for every point in the grid. Parameters ---------- est : BaseEstimator A fitted classification or regression model. target_variables : array-like, dtype=int The target features for which the partial dependency should be computed (size should be smaller than 3 for visual renderings). grid : array-like, shape=(n_points, len(target_variables)) The grid of ``target_variables`` values for which the partial dependency should be evaluated (either ``grid`` or ``X`` must be specified). X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. output : int, optional (default=None) The output index to use for multi-output estimators. Returns ------- pdp : array, shape=(n_classes, n_points) The partial dependence function evaluated on the ``grid``. For regression and binary classification ``n_classes==1``. 
""" n_samples = X.shape[0] pdp = [] for row in range(grid.shape[0]): X_eval = X.copy() for i, variable in enumerate(target_variables): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) if est._estimator_type == 'regressor': try: pdp_row = est.predict(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: # Multi-output if not 0 <= output < pdp_row.shape[1]: raise ValueError('Valid output must be specified for ' 'multi-output models.') pdp_row = pdp_row[:, output] pdp.append(np.mean(pdp_row)) elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if isinstance(pdp_row, list): # Multi-output if not 0 <= output < len(pdp_row): raise ValueError('Valid output must be specified for ' 'multi-output models.') pdp_row = pdp_row[output] pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) pdp_row = np.subtract(pdp_row, np.mean(pdp_row, 1)[:, np.newaxis]) pdp.append(np.mean(pdp_row, 0)) else: raise ValueError('est must be a fitted regressor or classifier ' 'model.') pdp = np.array(pdp).transpose() if pdp.shape[0] == 2: # Binary classification pdp = pdp[1, :][np.newaxis] elif len(pdp.shape) == 1: # Regression pdp = pdp[np.newaxis] return pdp
def partial_dependence(model, target_variables, grid=None, X=None,
                       percentiles=(0.05, 0.95), grid_resolution=100):
    """Partial dependence of ``target_variables``.

    Partial dependence plots show the dependence between the joint values
    of the ``target_variables`` and the function represented
    by the ``model``.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    model : BaseBoosting
        A fitted boosting model.
    target_variables : array-like, dtype=int
        The target features for which the partial dependency should be
        computed (size should be smaller than 3 for visual renderings).
    grid : array-like, shape=(n_points, len(target_variables))
        The grid of ``target_variables`` values for which the
        partial dependency should be evaluated (either ``grid`` or ``X``
        must be specified).
    X : array-like, shape=(n_samples, n_features)
        The data on which ``model`` was trained. It is used to generate
        a ``grid`` for the ``target_variables``. The ``grid`` comprises
        ``grid_resolution`` equally spaced points between the two
        ``percentiles``.
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the ``grid``. Only if ``X`` is not None.
    grid_resolution : int, default=100
        The number of equally spaced points on the ``grid``.

    Returns
    -------
    pdp : array, shape=(n_classes, n_points)
        The partial dependence function evaluated on the ``grid``.
        For regression and binary classification ``n_classes==1``.
    axes : seq of ndarray or None
        The axes with which the grid has been created or None if
        the grid has been given.

    Raises
    ------
    ValueError
        If ``model`` is not a ``BaseBoosting`` with tree base learners, if
        not exactly one of ``grid`` and ``X`` is given, if a target variable
        is out of range, or if ``grid`` is not 2d with one column per
        target variable.

    Examples
    --------
    >>> from KTBoost.partial_dependence import partial_dependence
    >>> import matplotlib.pyplot as plt
    >>> import numpy as np
    >>>
    >>> Xtrain=np.random.rand(1000,10)
    >>> ytrain=2*Xtrain[:,0]+2*Xtrain[:,1]+np.random.rand(1000)
    >>> model = KTBoost.BoostingRegressor()
    >>> model.fit(Xtrain,ytrain)
    >>>
    >>> kwargs = dict(X=Xtrain, percentiles=(0, 1))
    >>> partial_dependence(model,[0],**kwargs)
    """
    if not isinstance(model, BaseBoosting):
        raise ValueError('model has to be an instance of BaseBoosting')
    if model.base_learner != "tree":
        raise ValueError("Partial dependencies are only "
                         "defined for trees as base "
                         "learners. Use option 'base_learner=\"tree\"'.")
    check_is_fitted(model, 'estimators_')
    # Exactly one of ``grid`` and ``X`` has to be supplied.
    if (grid is None) == (X is None):
        raise ValueError('Either grid or X must be specified')

    target_variables = np.asarray(target_variables, dtype=np.int32,
                                  order='C').ravel()

    if any(not (0 <= fx < model.n_features_) for fx in target_variables):
        raise ValueError('target_variables must be in [0, %d]'
                         % (model.n_features_ - 1))

    if X is not None:
        X = check_array(X, dtype=DTYPE, order='C')
        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
                                  grid_resolution)
    else:
        # don't return axes if grid is given
        axes = None
        # Convert first so that plain sequences (documented as accepted)
        # expose ``ndim``.
        grid = np.asarray(grid)
        # grid must be 2d
        if grid.ndim == 1:
            grid = grid[:, np.newaxis]
        if grid.ndim != 2:
            raise ValueError('grid must be 2d but is %dd' % grid.ndim)

    grid = np.asarray(grid, dtype=DTYPE, order='C')
    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if grid.shape[1] != target_variables.shape[0]:
        raise ValueError('grid must have one column per target variable, '
                         'got %d columns for %d target variables'
                         % (grid.shape[1], target_variables.shape[0]))

    n_trees_per_stage = model.estimators_.shape[1]
    n_estimators = model.estimators_.shape[0]
    pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64,
                   order='C')
    # Accumulate the contribution of every tree into its output row.
    for stage in range(n_estimators):
        for k in range(n_trees_per_stage):
            tree = model.estimators_[stage, k].tree_
            _partial_dependence_tree(tree, grid, target_variables,
                                     model.learning_rate, pdp[k])

    return pdp, axes
def _fit_stages(self, X, y, qids, y_pred, random_state,
                begin_at_stage=0, monitor=None):
    """Run boosting iterations ``begin_at_stage`` .. ``n_estimators - 1``.

    Every iteration optionally draws a random subset of queries (when
    ``self.query_subsample < 1``) and a random subset of the samples of
    each selected query (when ``self.subsample < 1``), fits one stage
    through ``self._fit_stage`` and stores the per-metric training score
    in ``self.train_score_``.  When query subsampling is active the
    out-of-bag improvement is stored in ``self.oob_improvement_``.

    Parameters
    ----------
    X : array with ``X.shape[0]`` samples
        Training samples.
    y : array
        Targets, indexed in parallel with ``X``.
    qids : array
        Query ids; grouped into ``(qid, a, b)`` spans by ``get_groups``.
    y_pred : array
        Current predictions; replaced by the return value of
        ``self._fit_stage`` at every iteration.
    random_state : object providing ``shuffle``
        Source of randomness for query / sample subsampling.
    begin_at_stage : int, optional (default=0)
        Index of the first stage to fit.
    monitor : callable, optional (default=None)
        Called as ``monitor(i, self, locals())`` after each stage;
        returning exactly ``True`` stops the fitting early.

    Returns
    -------
    n_stages : int
        ``i + 1`` where ``i`` is the index of the last executed stage.
    """
    n_samples = X.shape[0]
    do_subsample = self.subsample < 1.0
    sample_weight = np.ones(n_samples, dtype=np.float64)

    n_queries = check_qids(qids)
    # Builtin ``object`` / ``bool`` are used instead of the ``np.object`` /
    # ``np.bool`` aliases, which are no longer provided by NumPy.
    query_groups = np.array([(qid, a, b, np.arange(a, b))
                             for qid, a, b in get_groups(qids)],
                            dtype=object)
    assert n_queries == len(query_groups)
    do_query_oob = self.query_subsample < 1.0
    query_mask = np.ones(n_queries, dtype=bool)
    query_idx = np.arange(n_queries)
    q_inbag = max(1, int(self.query_subsample * n_queries))

    if self.verbose:
        verbose_reporter = _VerboseReporter(self.verbose)
        verbose_reporter.init(self, begin_at_stage, self.n_metrics, monitor is not None)

    # NOTE: local variable names are part of the observable behaviour here
    # because ``locals()`` is handed to ``monitor`` below.
    for i in range(begin_at_stage, self.n_estimators):
        if do_query_oob:
            # Draw a fresh in-bag set of queries for this stage.
            random_state.shuffle(query_idx)
            query_mask = np.zeros(n_queries, dtype=bool)
            query_mask[query_idx[:q_inbag]] = 1

        query_groups_to_use = query_groups[query_mask]
        sample_mask = np.zeros(n_samples, dtype=bool)

        for qid, a, b, sidx in query_groups_to_use:
            sidx_to_use = sidx
            if do_subsample:
                # The group spans rows [a, b), so it holds ``b - a``
                # samples; the fraction must be taken of that size.
                query_samples_inbag = max(
                    1, int(self.subsample * (b - a)))
                random_state.shuffle(sidx)
                sidx_to_use = sidx[:query_samples_inbag]
            sample_mask[sidx_to_use] = 1

        if do_query_oob:
            # Out-of-bag scores before this stage, for the improvement.
            old_oob_total_score = np.zeros(self.n_metrics)
            for midx, metric in enumerate(self.metrics):
                if metric.is_ltr_metric:
                    for qid, a, b, _ in query_groups[~query_mask]:
                        old_oob_total_score[midx] += metric.evaluate_preds(
                            qid, y[a:b], y_pred[a:b])
                else:
                    old_oob_total_score[midx] = metric.evaluate_preds(
                        None, y[~sample_mask], y_pred[~sample_mask])

        y_pred = self._fit_stage(i, X, y, qids, y_pred, sample_weight,
                                 sample_mask, query_groups_to_use,
                                 random_state)

        for midx, metric in enumerate(self.metrics):
            train_total_score, oob_total_score = 0.0, 0.0
            if metric.is_ltr_metric:
                # Ranking metrics are evaluated query by query.
                for qidx, (qid, a, b, _) in enumerate(query_groups):
                    score = metric.evaluate_preds(
                        qid, y[a:b], y_pred[a:b])
                    if query_mask[qidx]:
                        train_total_score += score
                    else:
                        oob_total_score += score
            else:
                train_total_score = metric.evaluate_preds(
                    None, y[sample_mask], y_pred[sample_mask])
                oob_total_score = metric.evaluate_preds(
                    None, y[~sample_mask], y_pred[~sample_mask])

            # Per-query metrics are averaged over the number of queries.
            train_normalizer = q_inbag if metric.is_ltr_metric else 1.0
            oob_normalizer = (n_queries - q_inbag
                              if metric.is_ltr_metric else 1.0)
            self.train_score_[i, midx] = train_total_score / train_normalizer
            if do_query_oob:
                if q_inbag < n_queries:
                    self.oob_improvement_[i, midx] = \
                        (oob_total_score -
                         old_oob_total_score[midx]) / oob_normalizer

        early_stop = False
        monitor_output = None
        if monitor is not None:
            monitor_output = monitor(i, self, locals())
            if monitor_output is True:
                early_stop = True

        if self.verbose > 0:
            verbose_reporter.update(i, self, monitor_output)

        if early_stop:
            break

    # NOTE(review): if ``begin_at_stage >= self.n_estimators`` the loop body
    # never runs and ``i`` is unbound here.
    return i + 1
def calc_random_ev(self, qid, targets):
    """Return (sum of gains) * (sum of discounts) / ``len(targets)``.

    Gains come from ``self._gain_fn`` applied to every target; discounts
    come from ``self._get_discount`` for positions ``0`` up to (but not
    including) ``min(self.k, len(targets))``.  Presumably this is the
    expected metric value of a uniformly random ordering -- TODO confirm.

    Parameters
    ----------
    qid : object
        Query id; not used by this computation.
    targets : sequence
        Target (relevance) values of one query.

    Returns
    -------
    value : number
        The product of both sums divided by the number of targets.
    """
    gain_sum = sum(map(self._gain_fn, targets))
    n_ranked = min(self.k, len(targets))
    discount_sum = sum(map(self._get_discount, range(n_ranked)))
    return gain_sum * discount_sum / len(targets)
def calc_lambdas_deltas(self, qid, targets, preds):
    """Returns the first and second (pseudo-)derivatives.

    Lambdas is the negative gradient of the loss with respect to the
    prediction. Deltas is the derivative of that.

    Parameters
    ----------
    qid : object
        See `evaluate`.
    targets : array_like of shape = [n_targets]
        See `evaluate`.
    preds : array_like of shape = [n_targets]
        List of predicted scores corresponding to the targets.

    Returns
    -------
    lambdas : array_like of shape = [n_targets]
    deltas : array_like of shape = [n_targets]
    """
    n_docs = targets.shape[0]
    order = get_sorted_y_positions(targets, preds, check=False)
    ranked_targets = targets[order]
    swap_deltas = self.calc_swap_deltas(qid, ranked_targets)

    # Only the first ``cutoff`` ranks act as the upper element of a pair.
    cutoff = self.max_k()
    if cutoff is None or n_docs < cutoff:
        cutoff = n_docs

    lambdas = np.zeros(n_docs)
    deltas = np.zeros(n_docs)

    for upper in range(cutoff):
        for lower in range(upper + 1, n_docs):
            if ranked_targets[upper] == ranked_targets[lower]:
                continue

            delta_metric = swap_deltas[upper, lower]
            if delta_metric == 0.0:
                continue

            # invariant: preds[doc_upper] >= preds[doc_lower]
            doc_upper, doc_lower = order[upper], order[lower]

            if ranked_targets[upper] < ranked_targets[lower]:
                # Lower-ranked document has the larger target: its lambda
                # is increased, the upper one's is decreased.
                assert delta_metric > 0.0
                promote, demote = doc_lower, doc_upper
                magnitude = delta_metric
            else:
                assert delta_metric < 0.0
                promote, demote = doc_upper, doc_lower
                magnitude = -delta_metric

            logistic = scipy.special.expit(preds[demote] - preds[promote])
            step = logistic * magnitude
            lambdas[demote] -= step
            lambdas[promote] += step

            # Both documents of the pair receive the same second-order term.
            curvature = (1 - logistic) * step
            deltas[doc_upper] += curvature
            deltas[doc_lower] += curvature

    return lambdas, deltas
def plot_partial_dependence(est, X, features, feature_names=None,
                            target=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), method='auto',
                            n_jobs=1, verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots.

    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour
    plots.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model. Classifiers must have a
        ``predict_proba()`` method. Multioutput-multiclass estimators aren't
        supported.
    X : array-like, shape=(n_samples, n_features)
        The data to use to build the grid of values on which the dependence
        will be evaluated. This is usually the training data.
    features : list of ints or strings, or tuples of ints or strings
        The target features for which to create the PDPs.
        If features[i] is an int or a string, a one-way PDP is created; if
        features[i] is a tuple, a two-way PDP is created. Each tuple must be
        of size 2.
        if any entry is a string, then it must be in ``feature_names``.
    feature_names : seq of str, shape=(n_features,)
        Name of each feature; feature_names[i] holds the name of the feature
        with index i.
    target : int, optional (default=None)
        - In a multiclass setting, specifies the class for which the PDPs
          should be computed. Note that for binary classification, the
          positive class (index 1) is always used.
        - In a multioutput setting, specifies the task for which the PDPs
          should be computed
        Ignored in binary classification or classical regression settings.
    n_cols : int, optional (default=3)
        The number of columns in the grid plot.
    grid_resolution : int, optional (default=100)
        The number of equally spaced points on the axes of the plots, for each
        target feature.
    percentiles : tuple of float, optional (default=(0.05, 0.95))
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    method : str, optional (default='auto')
        The method to use to calculate the partial dependence predictions:

        - 'recursion' is only supported for objects inheriting from
          `BaseGradientBoosting`, but is more efficient in terms of speed.
        - 'brute' is supported for any estimator, but is more
          computationally intensive.
        - If 'auto', then 'recursion' will be used for
          ``BaseGradientBoosting`` estimators, and 'brute' used for other
          estimators.

        Unlike the 'brute' method, 'recursion' does not account for the
        ``init`` predictor of the boosting process. In practice this still
        produces the same plots, up to a constant offset in the target
        response.
    n_jobs : int, optional (default=1)
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        See :term:`Glossary <n_jobs>` for more details.
    verbose : int, optional (default=0)
        Verbose output during PD computations.
    ax : Matplotlib axis object, optional (default=None)
        An axis object onto which the plots will be drawn.
    line_kw : dict, optional
        Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
        For one-way partial dependence plots.
    contour_kw : dict, optional
        Dict with keywords passed to the ``contourf`` call of the subplot
        axes. For two-way partial dependence plots.
    **fig_kw : dict, optional
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.

    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.

    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # set target_idx for multi-class estimators
    if hasattr(est, 'classes_') and np.size(est.classes_) > 2:
        if target is None:
            raise ValueError('target must be specified for multi-class')
        target_idx = np.searchsorted(est.classes_, target)
        if (not (0 <= target_idx < len(est.classes_)) or
                est.classes_[target_idx] != target):
            raise ValueError('target not in est.classes_, got {}'.format(
                target))
    else:
        # regression and binary classification
        target_idx = 0

    X = check_array(X)
    n_features = X.shape[1]

    # convert feature_names to list
    if feature_names is None:
        # if feature_names is None, use feature indices as name
        feature_names = [str(i) for i in range(n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return int(fx)

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        if isinstance(fxs, (numbers.Integral, six.string_types)):
            fxs = (fxs,)
        try:
            fxs = [convert_feature(fx) for fx in fxs]
        except TypeError:
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('Each entry in features must be either an int, '
                             'a string, or an iterable of size at most 2.')

        tmp_features.append(fxs)

    features = tmp_features

    names = []
    try:
        for fxs in features:
            names_ = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                names_.append(feature_names[i])
            names.append(names_)
    except IndexError:
        raise ValueError('All entries of features must be less than '
                         'len(feature_names) = {0}, got {1}.'
                         .format(len(feature_names), i))

    # compute averaged predictions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(partial_dependence)(est, fxs, X=X, method=method,
                                    grid_resolution=grid_resolution,
                                    percentiles=percentiles)
        for fxs in features)

    # For multioutput regression, we can only check the validity of target
    # now that we have the predictions.
    # Also note: as multiclass-multioutput classifiers are not supported,
    # multiclass and multioutput scenario are mutually exclusive. There is
    # deliberately no ``else`` branch here: resetting target_idx to 0 would
    # discard the class index resolved above for multi-class classifiers.
    pd, _ = pd_result[0]  # checking the first result is enough
    if is_regressor(est) and pd.shape[0] > 1:
        if target is None:
            raise ValueError(
                'target must be specified for multi-output regressors')
        # Strict upper bound: ``target`` indexes the first axis of ``pd``.
        if not 0 <= target < pd.shape[0]:
            raise ValueError(
                'target must be in [0, n_tasks], got {}.'.format(
                    target))
        target_idx = target

    # get global min and max values of PD grouped by plot type
    pdp_lim = {}
    for pd, values in pd_result:
        min_pd, max_pd = pd[target_idx].min(), pd[target_idx].max()
        n_fx = len(values)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        fig = ax.get_figure()
        fig.clear()

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, (fx, name, (pd, values)) in enumerate(
            zip(features, names, pd_result)):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(values) == 1:
            ax.plot(values[0], pd[target_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(values) == 2
            XX, YY = np.meshgrid(values[0], values[1])
            Z = pd[target_idx].reshape(list(map(np.size, values))).T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75, **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(values) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(ax.transAxes,
                                                         ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        if len(values) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4,
                        hspace=0.3)
    return fig, axs
def fit(self, X, y, sample_weight=None):
    """Build a boosted classifier/regressor from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrix can be CSC, CSR, COO,
        DOK, or LIL. COO, DOK, and LIL are converted to CSR. The dtype is
        forced to DTYPE from tree._tree if the base classifier of this
        ensemble weighted boosting classifier is a tree or forest.

    y : array-like of shape = [n_samples]
        The target values (class labels in classification, real numbers in
        regression).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights. If None, the sample weights are initialized to
        1 / n_samples.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``learning_rate`` is not positive or if the given sample
        weights do not sum to a positive value.
    """
    # Check parameters
    if self.learning_rate <= 0:
        raise ValueError("learning_rate must be greater than zero")

    if (self.base_estimator is None or
            isinstance(self.base_estimator, (BaseDecisionTree,
                                             BaseForest))):
        dtype = DTYPE
        accept_sparse = 'csc'
    else:
        dtype = None
        accept_sparse = ['csr', 'csc']

    X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=dtype,
                     y_numeric=is_regressor(self))

    if sample_weight is None:
        # Initialize weights to 1 / n_samples
        sample_weight = np.full(X.shape[0], 1. / X.shape[0],
                                dtype=np.float64)
    else:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weight_sum = sample_weight.sum(dtype=np.float64)

        # Check that the sample weights sum is positive. This has to
        # happen before normalizing: afterwards the sum is always 1 (or
        # NaN for a zero sum), so the check could never trigger.
        if weight_sum <= 0:
            raise ValueError(
                "Attempting to fit with a non-positive "
                "weighted number of samples.")

        # Normalize existing weights
        sample_weight = sample_weight / weight_sum

    # Check parameters
    self._validate_estimator()

    # Clear any previous fit results
    self.estimators_ = []
    self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)
    self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)

    random_state = check_random_state(self.random_state)

    for iboost in range(self.n_estimators):
        # Boosting step
        sample_weight, estimator_weight, estimator_error = self._boost(
            iboost,
            X, y,
            sample_weight,
            random_state)

        # Early termination
        if sample_weight is None:
            break

        self.estimator_weights_[iboost] = estimator_weight
        self.estimator_errors_[iboost] = estimator_error

        # Stop if error is zero
        if estimator_error == 0:
            break

        sample_weight_sum = np.sum(sample_weight)

        # Stop if the sum of sample weights has become non-positive
        if sample_weight_sum <= 0:
            break

        if iboost < self.n_estimators - 1:
            # Normalize
            sample_weight /= sample_weight_sum

    return self
def partial_dependence(est, target_variables, grid=None, X=None, output=None,
                       percentiles=(0.05, 0.95), grid_resolution=100,
                       method=None):
    """Partial dependence of ``target_variables``.

    Partial dependence plots show the dependence between the joint values
    of the ``target_variables`` and the function represented
    by the ``est``.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    est : BaseEstimator
        A fitted classification or regression model.
    target_variables : array-like, dtype=int
        The target features for which the partial dependency should be
        computed (size should be smaller than 3 for visual renderings).
    grid : array-like, shape=(n_points, len(target_variables))
        The grid of ``target_variables`` values for which the
        partial dependency should be evaluated (either ``grid`` or ``X``
        must be specified).
    X : array-like, shape=(n_samples, n_features)
        The data on which ``est`` was trained. It is used to generate
        a ``grid`` for the ``target_variables``. The ``grid`` comprises
        ``grid_resolution`` equally spaced points between the two
        ``percentiles``.
    output : int, optional (default=None)
        The output index to use for multi-output estimators.
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the ``grid``. Only if ``X`` is not None.
    grid_resolution : int, default=100
        The number of equally spaced points on the ``grid``.
    method : {'recursion', 'exact', 'estimated', None}, optional (default=None)
        The method to use to calculate the partial dependence function:

        - If 'recursion', the underlying trees of ``est`` will be recursed to
          calculate the function. Only supported for BaseGradientBoosting and
          ForestRegressor.
        - If 'exact', the function will be calculated by calling the
          ``predict_proba`` method of ``est`` for classification or
          ``predict`` for regression on ``X`` for every point in the grid.
          To speed up this method, you can use a subset of ``X`` or a more
          coarse grid.
        - If 'estimated', the function will be calculated by calling the
          ``predict_proba`` method of ``est`` for classification or
          ``predict`` for regression on the mean of ``X``.
        - If None, then 'recursion' will be used if ``est`` is
          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
          estimators.

    Returns
    -------
    pdp : array, shape=(n_classes, n_points)
        The partial dependence function evaluated on the ``grid``.
        For regression and binary classification ``n_classes==1``.
    axes : seq of ndarray or None
        The axes with which the grid has been created or None if
        the grid has been given.

    Raises
    ------
    ValueError
        If ``method`` is unknown or not supported by ``est``, if ``est`` is
        not a classifier or regressor, if not exactly one of ``grid`` and
        ``X`` is given, if ``X`` is missing for a non-recursion method, if a
        target variable is out of range, or if ``grid`` is not 2d with one
        column per target variable.

    Examples
    --------
    >>> samples = [[0, 0, 2], [1, 0, 0]]
    >>> labels = [0, 1]
    >>> from sklearn.ensemble import GradientBoostingClassifier
    >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
    >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
    >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
    (array([[-4.52...,  4.52...]]), [array([ 0.,  1.])])
    """
    supports_recursion = isinstance(est, (BaseGradientBoosting,
                                          ForestRegressor))
    if method is None:
        method = 'recursion' if supports_recursion else 'exact'
    elif method not in ('recursion', 'exact', 'estimated'):
        # Report an unknown method up front instead of after unrelated
        # checks (e.g. the "X is required" error) have been evaluated.
        raise ValueError('method "%s" is invalid. Use "recursion", "exact", '
                         '"estimated", or None.' % method)

    if not supports_recursion and method == 'recursion':
        raise ValueError('est has to be an instance of BaseGradientBoosting or'
                         ' ForestRegressor for the "recursion" method. Try '
                         'using method="exact" or "estimated".')
    if (not hasattr(est, '_estimator_type') or
            est._estimator_type not in ('classifier', 'regressor')):
        raise ValueError('est must be a fitted regressor or classifier model.')

    if method == 'recursion':
        if len(est.estimators_) == 0:
            raise ValueError('Call %s.fit before partial_dependence' %
                             est.__class__.__name__)
        if isinstance(est, BaseGradientBoosting):
            n_features = est.n_features
        else:
            n_features = est.n_features_
    elif X is None:
        raise ValueError('X is required for method="exact" or "estimated".')
    else:
        # ``np.shape`` also handles plain sequences, which ``X`` may still
        # be at this point (it is validated by ``check_array`` below).
        n_features = np.shape(X)[1]

    if (grid is None and X is None) or (grid is not None and X is not None):
        raise ValueError('Either grid or X must be specified')

    target_variables = np.asarray(target_variables, dtype=np.int32,
                                  order='C').ravel()

    if any(not (0 <= fx < n_features) for fx in target_variables):
        raise ValueError('target_variables must be in [0, %d]'
                         % (n_features - 1))

    if X is not None:
        X = check_array(X, dtype=DTYPE, order='C')
        grid, axes = _grid_from_X(X[:, target_variables], percentiles,
                                  grid_resolution)
    else:
        # don't return axes if grid is given
        axes = None
        # Convert first so that plain sequences (documented as accepted)
        # expose ``ndim``.
        grid = np.asarray(grid)
        # grid must be 2d
        if grid.ndim == 1:
            grid = grid[:, np.newaxis]
        if grid.ndim != 2:
            raise ValueError('grid must be 2d but is %dd' % grid.ndim)

    grid = np.asarray(grid, dtype=DTYPE, order='C')
    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if grid.shape[1] != target_variables.shape[0]:
        raise ValueError('grid must have one column per target variable, '
                         'got %d columns for %d target variables'
                         % (grid.shape[1], target_variables.shape[0]))

    if method == 'recursion':
        if isinstance(est, BaseGradientBoosting):
            n_trees_per_stage = est.estimators_.shape[1]
            n_estimators = est.estimators_.shape[0]
            learning_rate = est.learning_rate
        else:
            n_trees_per_stage = 1
            n_estimators = len(est.estimators_)
            learning_rate = 1.
        pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64,
                       order='C')
        for stage in range(n_estimators):
            for k in range(n_trees_per_stage):
                if isinstance(est, BaseGradientBoosting):
                    tree = est.estimators_[stage, k].tree_
                else:
                    tree = est.estimators_[stage].tree_
                _partial_dependence_tree(tree, grid, target_variables,
                                         learning_rate, pdp[k])
        if isinstance(est, ForestRegressor):
            # Forest trees are summed above; divide by their number.
            pdp /= n_estimators
    elif method == 'exact':
        pdp = _exact_partial_dependence(est, target_variables, grid, X,
                                        output)
    else:
        # method == 'estimated' (validated at the top of the function)
        pdp = _estimated_partial_dependence(est, target_variables, grid, X,
                                            output)

    return pdp, axes