def fit(self, original, target, original_weight=None, target_weight=None):
    """
    Prepare the reweighting rule by computing histograms.

    :param original: values from the original distribution, array-like of shape [n_samples, n_features]
    :param target: values from the target distribution, array-like of shape [n_samples, n_features]
    :param original_weight: weights for samples of the original distribution
    :param target_weight: weights for samples of the target distribution
    :return: self
    """
    # reset so that _normalize_input infers n_features_ from the data
    self.n_features_ = None
    original, original_weight = self._normalize_input(original, original_weight)
    target, target_weight = self._normalize_input(target, target_weight)

    # interior quantiles of the target define the bin edges along each feature
    target_perc = numpy.linspace(0, 1, self.n_percentiles + 1)[1:-1]
    self.edges = []
    for axis in range(self.n_features_):
        self.edges.append(weighted_quantile(target[:, axis], quantiles=target_perc, sample_weight=target_weight))

    # histogram both samples on the same grid, smear and clip the bin contents
    bins_weights = []
    for data, weights in [(original, original_weight), (target, target_weight)]:
        bin_indices = self.compute_bin_indices(data)
        bin_w = bincount_nd(bin_indices, weights=weights, shape=[self.n_percentiles] * self.n_features_)
        smeared_weights = gaussian_filter(bin_w, sigma=self.n_neighs, truncate=2.5)
        bins_weights.append(smeared_weights.clip(self.min_in_the_bin))
    bin_orig_weights, bin_targ_weights = bins_weights

    # per-bin multiplier applied when reweighting new samples
    self.transition = bin_targ_weights / bin_orig_weights
    return self
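# Illustration only (not part of the original module): a 1-D, unweighted,
# unsmoothed sketch of what `fit` computes above. Bin edges are interior
# quantiles of the target, and the per-bin multiplier is the ratio of target
# to original counts; the real method additionally applies Gaussian smearing
# and clips near-empty bins. `numpy` is assumed imported at module level,
# as in the code above.
def _binned_ratio_sketch(original, target, n_bins=20):
    # interior quantile edges of the target (analogue of self.edges)
    edges = numpy.percentile(target, numpy.linspace(0, 100, n_bins + 1)[1:-1])
    # bin index of every sample, 0 .. n_bins - 1 (analogue of compute_bin_indices)
    orig_counts = numpy.bincount(numpy.searchsorted(edges, original), minlength=n_bins)
    targ_counts = numpy.bincount(numpy.searchsorted(edges, target), minlength=n_bins)
    # per-bin target/original ratio (analogue of self.transition); clip to avoid division by zero
    return targ_counts / numpy.clip(orig_counts, 1, None)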
def check_weighted_percentile(size=100, q_size=20):
    random = RandomState()
    array = random.permutation(size)
    quantiles = random.uniform(size=q_size)
    q_permutation = random.permutation(q_size)
    result1 = weighted_quantile(array, quantiles)[q_permutation]
    result2 = weighted_quantile(array, quantiles[q_permutation])
    result3 = weighted_quantile(array[random.permutation(size)], quantiles[q_permutation])
    assert numpy.all(result1 == result2) and numpy.all(result1 == result3), 'breaks on permutations'

    # checks that order is kept
    quantiles = numpy.linspace(0, 1, size * 3)
    x = weighted_quantile(array, quantiles, sample_weight=random.exponential(size=size))
    assert numpy.all(x == numpy.sort(x)), "doesn't preserve order"

    array = numpy.array([0, 1, 2, 5])  # comparing with simple percentiles
    for x in random.uniform(size=10):
        assert numpy.abs(numpy.percentile(array, x * 100) - weighted_quantile(array, x, old_style=True)) < 1e-7, \
            "doesn't coincide with numpy.percentile"
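# Reference sketch (an assumption; the actual `weighted_quantile` is defined
# elsewhere in the package): a standard weighted-quantile scheme with the
# properties the check above relies on, i.e. invariance under permutations of
# data and quantiles, and a monotonic result for sorted quantiles.
def _weighted_quantile_sketch(values, quantiles, sample_weight=None):
    values = numpy.asarray(values, dtype=float)
    if sample_weight is None:
        sample_weight = numpy.ones_like(values)
    order = numpy.argsort(values)
    values = values[order]
    sample_weight = numpy.asarray(sample_weight, dtype=float)[order]
    # midpoint cumulative weights, normalized to [0, 1]
    cdf = numpy.cumsum(sample_weight) - 0.5 * sample_weight
    cdf /= numpy.sum(sample_weight)
    return numpy.interp(quantiles, cdf, values)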