def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label:
    # take all of the samples in a cluster, sum their features and
    # divide by the cluster size
    for sample_idx in range(n_samples):
        centers[labels[sample_idx]] += X[sample_idx]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
def check_min_weight_fraction_leaf(name): X, y = hastie_X, hastie_y # Test if leaves contain at least min_weight_fraction_leaf of the # training set ForestEstimator = FOREST_ESTIMATORS[name] rng = np.random.RandomState(0) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for frac in np.linspace(0, 0.5, 6): est = ForestEstimator(min_weight_fraction_leaf=frac, n_estimators=1, random_state=0) if "RandomForest" in name: est.bootstrap = False est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) node_weights = bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def fit(self, X):
    """Train the BM25 model on the given corpus of documents.

    Returns nothing; the fitted idf weights, document lengths and average
    document length are stored on the estimator.
    """
    X = self.tf_vectorizer.fit_transform(X).toarray()
    if not sp.issparse(X):
        X = sp.csc_matrix(X)
    n_samples, n_features = X.shape

    if sp.isspmatrix_csr(X):
        df = bincount(X.indices, minlength=X.shape[1])
    else:
        df = np.diff(sp.csc_matrix(X, copy=False).indptr)

    # compute idf weight
    # idf = np.log((float(n_samples) - df + 0.5) / (df + 0.5))
    idf = np.log(float(n_samples) / df) + 1.0
    self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features,
                                format='csr')

    # compute the length of each document and the average length of the corpus
    doc_len = np.sum(X, axis=1)
    self._doc_len = np.reshape(doc_len, (n_samples, 1))
    self._avgdl = np.sum(X) / n_samples
def _make_test_folds(self, X, y=None, groups=None):
    if self.shuffle:
        rng = check_random_state(self.random_state)
    else:
        rng = self.random_state

    y = np.asarray(y)
    n_samples = len(X)
    y = ','.join(y).split(',')
    unique_y, y_inversed = np.unique(y, return_inverse=True)
    y_counts = bincount(y_inversed)
    min_groups = np.min(y_counts)
    if np.all(self.n_splits > y_counts):
        raise ValueError("All the n_groups for individual classes"
                         " are less than n_splits=%d." % (self.n_splits))
    if self.n_splits > min_groups:
        warnings.warn(("The least populated class in y has only %d"
                       " members, which is too few. The minimum"
                       " number of groups for any class cannot"
                       " be less than n_splits=%d."
                       % (min_groups, self.n_splits)), Warning)

    # pre-assign each sample to a test fold index using individual KFold
    # splitting strategies for each class so as to respect the balance of
    # classes
    # NOTE: Passing the data corresponding to ith class say X[y==class_i]
    # will break when the data is not 100% stratifiable for all classes.
    # So we pass np.zeros(max(c, n_splits)) as data to the KFold
    test_folds = iterative_stratification(X, set(y), self.n_splits, rng)

    return test_folds
def check_min_weight_fraction_leaf(name, X, y): # Test if leaves contain at least min_weight_fraction_leaf of the # training set ForestEstimator = FOREST_ESTIMATORS[name] rng = np.random.RandomState(0) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes in (None, 1000): for frac in np.linspace(0, 0.5, 6): est = ForestEstimator(min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0) if isinstance(est, (RandomForestClassifier, RandomForestRegressor)): est.bootstrap = False est.fit(X, y, sample_weight=weights) out = est.estimators_[0].tree_.apply(X) node_weights = bincount(out, weights=weights) # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert_greater_equal( np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf, "Failed with {0} " "min_weight_fraction_leaf={1}".format( name, est.min_weight_fraction_leaf))
def transform(self, X, y=None):
    n = X.shape[0]

    # check if column is in X
    if self.var not in X.columns:
        raise ValueError("X does not contain variable {}".format(self.var))

    # check if column is categorical, if not categorize
    classes, y_indices = np.unique(y, return_inverse=True)
    class_counts = bincount(y_indices)
    if np.min(class_counts) < 2:
        X['%s__cat' % self.var] = pd.cut(X[self.var], self.nbins).cat.codes
        var = '%s__cat' % self.var
    else:
        var = self.var

    # compute budget per bin
    if isinstance(self.sample_size, int):
        budget_per_bin = int(self.sample_size / self.nbins)
    else:
        budget_per_bin = int(self.sample_size / self.nbins * X.shape[0])

    obs_by_cat = X.groupby(var).count().loc[:, self.var]
    keep_indexes = []
    for i in obs_by_cat.index:
        if obs_by_cat.loc[i] <= budget_per_bin:
            keep_indexes += list(np.where(X[var] == i)[0])
        else:
            keep_indexes += random.sample(list(np.where(X[var] == i)[0]),
                                          budget_per_bin)

    return X.iloc[keep_indexes]
def check_min_samples_leaf(name): X, y = hastie_X, hastie_y # Test if leaves contain more than leaf_count training examples ForestEstimator = FOREST_ESTIMATORS[name] # test boundary value assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y) assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y) est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name)) est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), len(X) * 0.25 - 1, "Failed with {0}".format(name))
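# Hedged sketch of the leaf-count check used in the tree/forest tests above:
# apply() maps each training row to the id of the leaf it falls in, and a
# bincount over those ids gives per-node sample counts (ids of inner nodes
# simply never appear, so their count stays 0). The dataset and estimator
# below are illustrative stand-ins, not the hastie_X/hastie_y fixtures.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
est_demo = DecisionTreeClassifier(min_samples_leaf=5, random_state=0)
est_demo.fit(X_demo, y_demo)
leaf_ids = est_demo.apply(X_demo)            # leaf index for every sample
node_counts = np.bincount(leaf_ids)
leaf_counts = node_counts[node_counts != 0]  # drop inner nodes (zero counts)
assert leaf_counts.min() >= 5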
def transform(self, X, y=None):
    # n = X.shape[0]

    # check if column is in X
    if self.var not in X.columns:
        raise ValueError("X does not contain variable {}".format(self.var))

    # check if column is categorical, if not categorize
    classes, y_indices = np.unique(y, return_inverse=True)
    class_counts = bincount(y_indices)
    if np.min(class_counts) < 2:
        X['%s__cat' % self.var] = pd.cut(X[self.var], self.nbins).cat.codes
        var = '%s__cat' % self.var
    else:
        var = self.var

    split = StratifiedShuffleSplit(n_splits=1, train_size=self.sample_size,
                                   random_state=self.random_state)
    for train_index, test_index in split.split(X, X[var]):
        XX = X.iloc[train_index, :]
        # XX = X.loc[test_index]

    if '%s__cat' % self.var in X.columns:
        XX.drop('%s__cat' % self.var, axis=1, inplace=True)

    return XX
def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose=0, class_weight=None): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) if forest.bootstrap: n_samples = X.shape[0] if sample_weight is None: curr_sample_weight = np.ones((n_samples,), dtype=np.float64) else: curr_sample_weight = sample_weight.copy() indices = _generate_sample_indices(tree.random_state, n_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts if class_weight == 'subsample': with warnings.catch_warnings(): warnings.simplefilter('ignore', DeprecationWarning) curr_sample_weight *= compute_sample_weight('auto', y, indices) elif class_weight == 'balanced_subsample': curr_sample_weight *= compute_sample_weight('balanced', y, indices) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: tree.fit(X, y, sample_weight=sample_weight, check_input=False) return tree
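# Hedged sketch of the bootstrap-as-weights trick used above: drawing indices
# with replacement and multiplying the per-sample weights by their bincount
# is, for weight-aware estimators, equivalent to fitting on the resampled
# rows themselves (rows never drawn get weight 0 and become "out of bag").
# The helper name and defaults here are illustrative, not from the module.
import numpy as np

def bootstrap_weights(n_samples, random_state=0, base_weight=None):
    rng = np.random.RandomState(random_state)
    indices = rng.randint(0, n_samples, n_samples)      # sample with replacement
    counts = np.bincount(indices, minlength=n_samples)  # times each row was drawn
    w = np.ones(n_samples) if base_weight is None else base_weight.copy()
    return w * counts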
def _parallel_build_trees(n_trees, forest, X, y, sample_weight, seeds, verbose): """Private function used to build a batch of trees within a job.""" trees = [] for i in range(n_trees): random_state = check_random_state(seeds[i]) if verbose > 1: print("building tree %d of %d" % (i + 1, n_trees)) seed = random_state.randint(MAX_INT) tree = forest._make_estimator(append=False) tree.set_params(random_state=check_random_state(seed)) if forest.bootstrap: n_samples = X.shape[0] if sample_weight is None: curr_sample_weight = np.ones((n_samples, ), dtype=np.float64) else: curr_sample_weight = sample_weight.copy() indices = random_state.randint(0, n_samples, n_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) tree.indices_ = sample_counts > 0. else: tree.fit(X, y, sample_weight=sample_weight, check_input=False) trees.append(tree) return trees
def _generate_unsampled_indices(random_state, n_samples): '''Samples out of bag''' sample_indices = _generate_sample_indices(random_state, n_samples) sample_counts = bincount(sample_indices, minlength=n_samples) unsampled_mask = sample_counts == 0 indices_range = np.arange(n_samples) unsampled_indices = indices_range[unsampled_mask] return unsampled_indices
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): """Private function used to build a batch of trees within a job""" from sklearn.utils import check_random_state from sklearn.utils.fixes import bincount import random MAX_INT = numpy.iinfo(numpy.int32).max random_state = check_random_state(seed) trees = [] for i in xrange(n_trees): if verbose > 1: print("building tree %d of %d" % (i + 1, n_trees)) seed = random_state.randint(MAX_INT) tree = forest._make_estimator(append=False) tree.set_params(compute_importances=forest.compute_importances) tree.set_params(random_state=check_random_state(seed)) if forest.bootstrap: n_samples = X.shape[0] if sample_weight is None: curr_sample_weight = numpy.ones((n_samples, ), dtype=numpy.float64) else: curr_sample_weight = sample_weight.copy() ty = list(enumerate(y)) indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] indices2 = random_state.randint(0, len(indices), len(indices)) indices = [indices[j] for j in indices2] sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts curr_sample_mask = sample_mask.copy() curr_sample_mask[sample_counts == 0] = False tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) tree.indices = curr_sample_mask else: tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) trees.append(tree) return trees
def _generate_unsampled_indices(random_state, n_samples): """Private function used to forest._set_oob_score function.""" sample_indices = _generate_sample_indices(random_state, n_samples) sample_counts = bincount(sample_indices, minlength=n_samples) unsampled_mask = sample_counts == 0 indices_range = np.arange(n_samples) unsampled_indices = indices_range[unsampled_mask] return unsampled_indices
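# Hedged, self-contained sketch of the out-of-bag logic above, with
# np.random.RandomState standing in for the private _generate_sample_indices
# helper (assumed, as in sklearn's forest module, to draw n_samples indices
# with replacement from the same seeded generator).
import numpy as np

def oob_indices_demo(seed=0, n_samples=10):
    rng = np.random.RandomState(seed)
    sampled = rng.randint(0, n_samples, n_samples)       # bootstrap draw
    counts = np.bincount(sampled, minlength=n_samples)   # per-sample draw counts
    return np.arange(n_samples)[counts == 0]             # never drawn = out of bag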
def entropy(samples): n_samples = len(samples) entropy = 0. for count in bincount(samples): p = 1. * count / n_samples if p > 0: entropy -= p * np.log2(p) return entropy
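# Hedged usage sketch for the entropy helper above, assuming `bincount` is the
# numpy np.bincount that sklearn.utils.fixes aliased. A 2:4 label split gives
# roughly 0.918 bits; a perfectly balanced split would give 1.0.
import numpy as np

labels_demo = np.array([0, 0, 1, 1, 1, 1])
n_demo = len(labels_demo)
h = 0.
for count in np.bincount(labels_demo):
    p = count / float(n_demo)
    if p > 0:
        h -= p * np.log2(p)
# h -> ~0.918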
def compute_class_weight(labelPath): with open(labelPath, 'r') as text_file: content = text_file.readlines() content = np.asarray(content) y = np.asarray( [int(sample.split(' ')[2].strip('\n')) for sample in content]) classes = np.asarray(list(set(y))) le = LabelEncoder() y_ind = le.fit_transform(y) recip_freq = len(y) / (len(le.classes_) * bincount(y_ind).astype(np.float64)) weight = recip_freq[le.transform(classes)] return weight
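# Hedged sketch of the class-weight arithmetic above: each class gets weight
# n_samples / (n_classes * count(class)), i.e. sklearn's 'balanced' scheme,
# shown here on an in-memory label vector instead of a label file on disk.
import numpy as np
from sklearn.preprocessing import LabelEncoder

y_demo = np.array([0, 0, 0, 1, 1, 2])
le_demo = LabelEncoder()
y_ind_demo = le_demo.fit_transform(y_demo)
recip_freq_demo = (len(y_demo) /
                   (len(le_demo.classes_) *
                    np.bincount(y_ind_demo).astype(np.float64)))
# -> array([0.667, 1.0, 2.0]): rarer classes receive proportionally larger weights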
def test_sample_weight(): """Check sample weighting.""" # Test that zero-weighted samples are not taken into account X = np.arange(100)[:, np.newaxis] y = np.ones(100) y[:50] = 0.0 sample_weight = np.ones(100) sample_weight[y == 0] = 0.0 clf = DecisionTreeClassifier(random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert_array_equal(clf.predict(X), np.ones(100)) # Test that low weighted samples are not taken into account at low depth X = np.arange(200)[:, np.newaxis] y = np.zeros(200) y[50:100] = 1 y[100:200] = 2 X[100:200, 0] = 200 sample_weight = np.ones(200) sample_weight[y == 2] = .51 # Samples of class '2' are still weightier clf = DecisionTreeClassifier(max_depth=1, random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert_equal(clf.tree_.threshold[0], 149.5) sample_weight[y == 2] = .50 # Samples of class '2' are no longer weightier clf = DecisionTreeClassifier(max_depth=1, random_state=0) clf.fit(X, y, sample_weight=sample_weight) assert_equal(clf.tree_.threshold[0], 49.5) # Threshold should have moved # Test that sample weighting is the same as having duplicates X = iris.data y = iris.target duplicates = rng.randint(0, X.shape[0], 200) clf = DecisionTreeClassifier(random_state=1) clf.fit(X[duplicates], y[duplicates]) sample_weight = bincount(duplicates, minlength=X.shape[0]) clf2 = DecisionTreeClassifier(random_state=1) clf2.fit(X, y, sample_weight=sample_weight) internal = clf.tree_.children_left != tree._tree.TREE_LEAF assert_array_almost_equal(clf.tree_.threshold[internal], clf2.tree_.threshold[internal])
def check_min_samples_leaf(name, X, y): # Test if leaves contain more than leaf_count training examples ForestEstimator = FOREST_ESTIMATORS[name] # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for max_leaf_nodes in (None, 1000): est = ForestEstimator(min_samples_leaf=5, max_leaf_nodes=max_leaf_nodes, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name))
def _recompute_centers( X, labels, n_clusters): """ Computation of cluster centers / means. Parameters ---------- X: array-like, shape (n_samples, n_features) labels: array of integers, shape (n_samples) Current label assignment n_clusters: int Number of desired clusters Returns ------- centers: array, shape (n_clusters, n_features) The resulting centers """ n_samples = X.shape[0] n_features = X.shape[1] # Initialize centers to all zero centers = np.zeros((n_clusters, n_features)) n_samples_in_cluster = bincount(labels, minlength=n_clusters) # Compute a center for each label # For each label, average over samples and features #TODO: IMPLEMENT # Take all of the samples in a cluster and add their features # For each sample # What label is it? Let's say its label x # Add feature i to label X's feature value i for sample_idx in xrange(n_samples): label = labels[sample_idx] centers[label] += X[sample_idx] #for j in xrange(n_features): # centers[label[j]] +=X[sample_idx[j]] # Normalize by the size of the cluster centers /= n_samples_in_cluster[:, np.newaxis] return centers
def _recompute_centers(X, labels, n_clusters): """ Computation of cluster centers / means. Parameters ---------- X: array-like, shape (n_samples, n_features) labels: array of integers, shape (n_samples) Current label assignment n_clusters: int Number of desired clusters Returns ------- centers: array, shape (n_clusters, n_features) The resulting centers """ n_samples = X.shape[0] n_features = X.shape[1] # Initialize centers to all zero centers = np.zeros((n_clusters, n_features)) n_samples_in_cluster = bincount(labels, minlength=n_clusters) # Compute a center for each label # For each label, average over samples and features #TODO: IMPLEMENT # For each sample for sample_idx in xrange(n_samples): # What label is it? Let's say its label is 'label' label = labels[sample_idx] # Add feature i to label X's feature value i centers[label] += X[sample_idx] # for j in xrange(n_features): # centers[label][j] += X[sample_idx][j] # Normalize by the size of the cluster centers /= n_samples_in_cluster[:, np.newaxis] return centers
def fit(self, X, y, tol=None): """Fit the model according to the given training data and parameters. Parameters ---------- X : array-like, shape = [n_samples, n_features] Training vector, where n_samples is the number of samples and n_features is the number of features. y : array, shape = [n_samples] Target values (integers) """ # X, y = check_X_y(X, y) if type_of_target(y) not in ['binary', 'multiclass']: raise ValueError("Unknown label type: %r" % type_of_target(y)) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape n_classes = len(self.classes_) if n_classes < 2: raise ValueError('y has less than 2 classes') self.startprob_ = (bincount(y)+1.0) / (len(y)+n_classes) transmat = np.zeros((n_classes,n_classes)) for i in xrange(len(y)-1): transmat[y[i],y[i+1]] = transmat[y[i],y[i+1]] + 1 transmat = (transmat.transpose() / np.sum(transmat,1)).transpose() self.transmat_ = transmat pseudo_rows = np.tile(self.pseudo_rssi_list,(X.shape[1],1)).transpose() means = [] covars = [] miss_probs = [] for cl in xrange(n_classes): X_cl = np.concatenate((X[y == cl, :],pseudo_rows),0) miss_probs_cl = np.mean(np.isnan(X_cl),0) mean_cl = np.nanmean(X_cl,0) covar_cl = np.diag(np.nanvar(X_cl,0,ddof=1)) miss_probs.append(miss_probs_cl) means.append(mean_cl) covars.append(covar_cl) self.miss_probs_ = np.asarray(miss_probs) self.means_ = np.asarray(means) self.covars_ = np.asarray(covars) return self
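# Hedged sketch of the transition-count step above, on a toy label sequence:
# count consecutive class pairs, then row-normalize so transmat[i, j]
# estimates P(next == j | current == i). (The startprob_ line above is the
# same idea with Laplace (+1) smoothing over the class counts.)
import numpy as np

y_seq = np.array([0, 0, 1, 1, 0, 1])
n_classes_demo = 2
transmat_demo = np.zeros((n_classes_demo, n_classes_demo))
for a, b in zip(y_seq[:-1], y_seq[1:]):
    transmat_demo[a, b] += 1
transmat_demo /= transmat_demo.sum(axis=1, keepdims=True)
# -> [[1/3, 2/3], [1/2, 1/2]]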
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose): """Private function used to build a batch of trees within a job""" from sklearn.utils import check_random_state from sklearn.utils.fixes import bincount import random MAX_INT = numpy.iinfo(numpy.int32).max random_state = check_random_state(seed) trees = [] for i in xrange(n_trees): if verbose > 1: print("building tree %d of %d" % (i+1, n_trees)) seed = random_state.randint(MAX_INT) tree = forest._make_estimator(append = False) tree.set_params(compute_importances=forest.compute_importances) tree.set_params(random_state = check_random_state(seed)) if forest.bootstrap: n_samples = X.shape[0] if sample_weight is None: curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64) else: curr_sample_weight = sample_weight.copy() ty = list(enumerate(y)) indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0] indices2 = random_state.randint(0, len(indices), len(indices)) indices = [indices[j] for j in indices2] sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts curr_sample_mask = sample_mask.copy() curr_sample_mask[sample_counts==0] = False tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False) tree.indices = curr_sample_mask else: tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False) trees.append(tree) return trees
def _iter_indices(self): rng = np.random.RandomState(self.random_state) cls_count = bincount(self.y_indices) for n in range(self.n_iter): train = [] test = [] for i, cls in enumerate(self.classes): sample_size = int(cls_count[i]*(1-self.test_size)) randint = rng.randint(cls_count[i], size=sample_size) aidx = np.where((self.y == cls))[0] iidx = aidx[randint] oidx = aidx[list(set(range(cls_count[i])).difference(set(randint)))] train.extend(iidx) test.extend(oidx) train = rng.permutation(train) test = rng.permutation(test) yield train, test
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label:
    # 1. For each sample
    # 2. Look up its label, say 'label'
    # 3. Add the sample's feature vector to centers[label]
    for sample_idx in range(n_samples):
        label = labels[sample_idx]
        centers[label] += X[sample_idx]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
def _recompute_centers( X, labels, n_clusters): """ Computation of cluster centers / means. Parameters ---------- X: array-like, shape (n_samples, n_features) labels: array of integers, shape (n_samples) Current label assignment n_clusters: int Number of desired clusters Returns ------- centers: array, shape (n_clusters, n_features) The resulting centers """ n_samples = X.shape[0] n_features = X.shape[1] # Initialize centers to all zero centers = np.zeros((n_clusters, n_features)) n_samples_in_cluster = bincount(labels, minlength=n_clusters) # Compute a center for each label # For each label, average over samples and features #TODO: IMPLEMENT for i in range(n_samples): for j in range(n_features): centers[labels[i], j] += X[i, j] # Normalize by the size of the cluster centers /= n_samples_in_cluster[:, np.newaxis] return centers
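# Alternative sketch, not part of the original exercise code above: the same
# center update can be written without explicit Python loops. np.add.at
# accumulates each row of X into the center of its assigned cluster; the
# max(counts, 1) guard for empty clusters is an assumption added here.
import numpy as np

def _recompute_centers_vectorized(X, labels, n_clusters):
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    centers = np.zeros((n_clusters, X.shape[1]))
    np.add.at(centers, labels, X)                          # sum samples per cluster
    counts = np.bincount(labels, minlength=n_clusters)     # cluster sizes
    return centers / np.maximum(counts, 1)[:, np.newaxis]  # average (guard empties)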
def _prefit(self, X, y): '''Doc String''' X, y = check_X_y(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape n_classes = len(self.classes_) if n_classes < 2: raise ValueError('y has less than 2 classes') if self.priors is None: self.priors_ = bincount(y) / float(n_samples) else: self.priors_ = self.priors if (self.priors_ < 0).any(): raise ValueError("priors must be non-negative") if self.priors_.sum() != 1: warnings.warn("The priors do not sum to 1. Renormalizing", UserWarning) self.priors_ = self.priors_ / self.priors_.sum() return X, y
def _iter_indices(self): rng = np.random.RandomState(self.random_state) cls_count = bincount(self.y_indices) for n in range(self.n_iter): train = [] test = [] for i, cls in enumerate(self.classes): sample_size = int(cls_count[i] * (1 - self.test_size)) randint = rng.randint(cls_count[i], size=sample_size) aidx = np.where((self.y == cls))[0] iidx = aidx[randint] oidx = aidx[list( set(range(cls_count[i])).difference(set(randint)))] train.extend(iidx) test.extend(oidx) train = rng.permutation(train) test = rng.permutation(test) yield train, test
def sensitivity_specificity_support(y_true,
                                    y_pred,
                                    labels=None,
                                    pos_label=1,
                                    average=None,
                                    warn_for=('sensitivity', 'specificity'),
                                    sample_weight=None):
    """Compute sensitivity, specificity, and support for each class

    The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
    of true positives and ``fn`` the number of false negatives. The
    sensitivity quantifies the ability to avoid false negatives [1]_.

    The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number
    of true negatives and ``fp`` the number of false positives. The
    specificity quantifies the ability to avoid false positives [1]_.

    The support is the number of occurrences of each class in ``y_true``.

    If ``pos_label is None`` and in binary classification, this function
    returns the average sensitivity and specificity if ``average`` is one of
    ``'weighted'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored; setting
        ``labels=[pos_label]`` and ``average != 'binary'`` will report scores
        for that label only.

    average : str or None, optional (default=None)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    warn_for : tuple or set, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    Returns
    -------
    sensitivity : float (if ``average`` is not None) or ndarray, \
        shape (n_unique_labels, )

    specificity : float (if ``average`` is not None) or ndarray, \
        shape (n_unique_labels, )

    support : int (if ``average`` is not None) or ndarray, \
        shape (n_unique_labels, )
        The number of occurrences of each label in ``y_true``.
Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_specificity_support >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> sensitivity_specificity_support(y_true, y_pred, average='macro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='micro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') (0.33333333333333331, 0.66666666666666663, None) References ---------- .. [1] `Wikipedia entry for the Sensitivity and specificity <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_ """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: if len(present_labels) < 2: # Only negative labels return (0., 0., 0) else: raise ValueError("pos_label=%r is not a valid label: %r" % (pos_label, present_labels)) labels = [pos_label] else: raise ValueError("Target is %s but average='binary'. Please " "choose another average setting." % y_type) elif pos_label not in (None, 1): warnings.warn("Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [labels, np.setdiff1d( present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith('multilabel'): raise ValueError('imblearn does not support multilabel') elif average == 'samples': raise ValueError("Sample-based precision, recall, fscore is " "not meaningful outside multilabel " "classification. See the accuracy_score instead.") else: le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = bincount( y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): true_sum = bincount( y_true, weights=sample_weight, minlength=len(labels)) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) tn_sum = np.array([tn_sum.sum()]) # Finally, we have all our sufficient statistics. Divide! # with np.errstate(divide='ignore', invalid='ignore'): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. 
specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, 'specificity', 'predicted', average, warn_for) sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true', average, warn_for) # Average the results if average == 'weighted': weights = true_sum if weights.sum() == 0: return 0, 0, None elif average == 'samples': weights = sample_weight else: weights = None if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) sensitivity = np.average(sensitivity, weights=weights) true_sum = None # return no support return sensitivity, specificity, true_sum
def _document_frequency(X): """Count the number of non-zero values for each feature in sparse X.""" if sp.isspmatrix_csr(X): return bincount(X.indices, minlength=X.shape[1]) else: return np.diff(sp.csc_matrix(X, copy=False).indptr)
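# Hedged usage sketch for _document_frequency above, assuming `sp` is
# scipy.sparse and `bincount` is numpy's np.bincount (the sklearn.utils.fixes
# alias). The CSR and CSC paths give the same per-feature document counts.
import numpy as np
import scipy.sparse as sp

counts_demo = np.array([[1, 0, 2],
                        [0, 0, 1],
                        [3, 1, 0]])
X_csr = sp.csr_matrix(counts_demo)
df_csr = np.bincount(X_csr.indices, minlength=X_csr.shape[1])
df_csc = np.diff(sp.csc_matrix(counts_demo).indptr)
# both -> array([2, 1, 2]): feature 0 occurs in 2 docs, feature 1 in 1, feature 2 in 2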
def _iter_indices(self, frame, y): """Iterate the indices with stratification. Parameters ---------- frame : H2OFrame The frame to split y : string The column to stratify. Returns ------- train : np.ndarray, shape=(n_samples,) The train indices test : np.ndarray, shape=(n_samples,) The test indices """ n_samples = frame.shape[0] n_train, n_test = _validate_shuffle_split(n_samples, self.test_size, self.train_size) # need to validate y... y = _val_y(y) target = np.asarray(frame[y].as_data_frame(use_pandas=True)[y].tolist()) classes, y_indices = np.unique(target, return_inverse=True) n_classes = classes.shape[0] class_counts = bincount(y_indices) if np.min(class_counts) < 2: raise ValueError('The least populated class in y has only 1 ' 'member, which is too few. The minimum number of labels ' 'for any class cannot be less than 2.') if n_train < n_classes: raise ValueError('The train_size=%d should be greater than or ' 'equal to the number of classes=%d' % (n_train, n_classes)) if n_test < n_classes: raise ValueError('The test_size=%d should be greater than or ' 'equal to the number of classes=%d' % (n_test, n_classes)) rng = check_random_state(self.random_state) p_i = class_counts / float(n_samples) n_i = np.round(n_train * p_i).astype(int) t_i = np.minimum(class_counts - n_i, np.round(n_test * p_i).astype(int)) for _ in range(self.n_splits): train = [] test = [] for i, class_i in enumerate(classes): permutation = rng.permutation(class_counts[i]) perm_indices_class_i = np.where((target == class_i))[0][permutation] train.extend(perm_indices_class_i[:n_i[i]]) test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]]) # Might end up here with less samples in train and test than we asked # for, due to rounding errors. if len(train) + len(test) < n_train + n_test: missing_indices = np.where(bincount(train + test, minlength=len(target)) == 0)[0] missing_indices = rng.permutation(missing_indices) n_missing_train = n_train - len(train) n_missing_test = n_test - len(test) if n_missing_train > 0: train.extend(missing_indices[:n_missing_train]) if n_missing_test > 0: test.extend(missing_indices[-n_missing_test:]) train = rng.permutation(train) test = rng.permutation(test) yield train, test
def test_sample_weight(): """Check sample weighting.""" # Test that zero-weighted samples are not taken into account X = np.arange(100)[:, np.newaxis] y = np.ones(100) y[:50] = 0.0 sample_weight = np.ones(100) sample_weight[y == 0] = 0.0 clf = tree.DecisionTreeClassifier() clf.fit(X, y, sample_weight=sample_weight) assert_array_equal(clf.predict(X), np.ones(100)) # Test that low weighted samples are not taken into account at low depth X = np.arange(200)[:, np.newaxis] y = np.zeros(200) y[50:100] = 1 y[100:200] = 2 X[100:200, 0] = 200 sample_weight = np.ones(200) sample_weight[y == 2] = .51 # Samples of class '2' are still weightier clf = tree.DecisionTreeClassifier(max_depth=1) clf.fit(X, y, sample_weight=sample_weight) assert_equal(clf.tree_.threshold[0], 149.5) sample_weight[y == 2] = .50 # Samples of class '2' are no longer weightier clf = tree.DecisionTreeClassifier(max_depth=1) clf.fit(X, y, sample_weight=sample_weight) assert_equal(clf.tree_.threshold[0], 49.5) # Threshold should have moved # Test that sample weighting is the same as having duplicates X = iris.data y = iris.target duplicates = rng.randint(0, X.shape[0], 1000) clf = tree.DecisionTreeClassifier(random_state=1) clf.fit(X[duplicates], y[duplicates]) from sklearn.utils.fixes import bincount sample_weight = bincount(duplicates, minlength=X.shape[0]) clf2 = tree.DecisionTreeClassifier(random_state=1) clf2.fit(X, y, sample_weight=sample_weight) internal = clf.tree_.children_left != tree._tree.TREE_LEAF assert_array_equal(clf.tree_.threshold[internal], clf2.tree_.threshold[internal]) # Test negative weights X = iris.data y = iris.target sample_weight = -np.ones(X.shape[0]) clf = tree.DecisionTreeClassifier(random_state=1) assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight) sample_weight = np.ones(X.shape[0]) sample_weight[0] = -1 clf = tree.DecisionTreeClassifier(random_state=1) clf.fit(X, y, sample_weight=sample_weight) # Check that predict_proba returns valid probabilities in the presence of # samples with negative weight X = iris.data y = iris.target sample_weight = rng.normal(.5, 1.0, X.shape[0]) clf = tree.DecisionTreeClassifier(random_state=1) clf.fit(X, y, sample_weight=sample_weight) proba = clf.predict_proba(X) assert (proba >= 0).all() and (proba <= 1).all()
def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, average='multiclass', sample_weight=None, correction=0.0): """Compute the geometric mean The geometric mean (G-mean) is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. For binary classification G-mean is the squared root of the product of the sensitivity and specificity. For multi-class problems it is a higher root of the product of sensitivity for each class. For compatibility with other imbalance performance measures, G-mean can be calculated for each class separately on a one-vs-rest basis when ``average != 'multiclass'``. The best value is 1 and the worst value is 0. Traditionally if at least one class is unrecognized by the classifier, G-mean resolves to zero. To alleviate this property, for highly multi-class the sensitivity of unrecognized classes can be "corrected" to be a user specified value (instead of zero). This option works only if ``average == 'multiclass'``. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=``'multiclass'``) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). sample_weight : ndarray, shape (n_samples, ) Sample weights. correction: float, optional (default=0.0) Substitutes sensitivity of unrecognized classes from zero to a given value. 
Returns ------- geometric_mean : float Examples -------- >>> from imblearn.metrics import geometric_mean_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> geometric_mean_score(y_true, y_pred) 0.0 >>> geometric_mean_score(y_true, y_pred, correction=0.001) 0.010000000000000004 >>> geometric_mean_score(y_true, y_pred, average='macro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='micro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='weighted') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average=None) array([ 0.8660254, 0. , 0. ]) References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of imbalanced training sets: one-sided selection" ICML (1997) .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies for learning in class imbalance problems", Pattern Recognition, 36(3), (2003), pp 849-851. """ if average is None or average != 'multiclass': sen, spe, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('specificity', 'specificity'), sample_weight=sample_weight) LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) return np.sqrt(sen * spe) else: present_labels = unique_labels(y_true, y_pred) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = bincount(y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, "recall") recall[recall == 0] = correction return sp.stats.mstats.gmean(recall)
def lda(X, y, tol=0.00001, n_components=None): """SVD solver. Parameters ---------- X : array-like, shape (n_samples, n_features) Training data. y : array-like, shape (n_samples,) or (n_samples, n_targets) Target values. """ X, y = check_X_y(X, y) _, y_t = np.unique(y, return_inverse=True) priors_ = bincount(y_t) / float(len(y)) classes_ = np.unique(y) n_samples, n_features = X.shape n_classes = len(classes_) print_dynamic('Calculate Class Mean') means_ = _class_means(X, y) Xc = [] for idx, group in enumerate(classes_): Xg = X[y == group, :] Xc.append(Xg - means_[idx]) xbar_ = np.dot(priors_, means_) Xc = np.concatenate(Xc, axis=0) print_dynamic('# 1) within (univariate) scaling by with classes std-dev') std = Xc.std(axis=0) # avoid division by zero in normalization std[std == 0] = 1. fac = 1. / (n_samples - n_classes) print_dynamic('# 2) Within variance scaling') X = np.sqrt(fac) * (Xc / std) # SVD of centered (within)scaled data U, S, V = linalg.svd(X, full_matrices=False) rank = np.sum(S > tol) if rank < n_features: warnings.warn("Variables are collinear.") # Scaling of within covariance is: V' 1/S scalings = (V[:rank] / std).T / S[:rank] print_dynamic('# 3) Between variance scaling') # Scale weighted centers X = np.dot(((np.sqrt((n_samples * priors_) * fac)) * (means_ - xbar_).T).T, scalings) # Centers are living in a space with n_classes-1 dim (maximum) # Use SVD to find projection in the space spanned by the # (n_classes) centers _, S, V = linalg.svd(X, full_matrices=0) rank = np.sum(S > tol * S[0]) scalings_ = np.dot(scalings, V.T[:, :rank]) coef = np.dot(means_ - xbar_, scalings_) intercept_ = (-0.5 * np.sum(coef**2, axis=1) + np.log(priors_)) coef_ = np.dot(coef, scalings_.T) intercept_ -= np.dot(xbar_, coef_.T) return intercept_, coef_, classes_
def _parallel_build_balanced_estimators(n_estimators, ensemble, X, y, seeds, verbose): """Private function used to build a batch of estimators within a job.""" # Retrieve settings n_samples, n_features = X.shape max_samples = ensemble.max_samples max_features = ensemble.max_features if (not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0)): max_samples = int(max_samples * n_samples) if (not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0)): max_features = int(max_features * n_features) bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features # Build estimators estimators = [] estimators_samples = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("building estimator %d of %d" % (i + 1, n_estimators)) random_state = check_random_state(seeds[i]) seed = random_state.randint(MAX_INT) estimator = ensemble._make_estimator(append=False) try: # Not all estimator accept a random_state estimator.set_params(random_state=seed) except ValueError: pass # Draw features if bootstrap_features: features = random_state.randint(0, n_features, max_features) else: features = sample_without_replacement(n_features, max_features, random_state=random_state) if bootstrap: indices = random_state.randint(0, n_samples, max_samples) else: indices = sample_without_replacement(n_samples, max_samples, random_state=random_state) sample_counts = bincount(indices, minlength=n_samples) # TEF: Main change in this next call to _downsample (Xbal, ybal) = _downsample((X[indices])[:, features], y[indices]) estimator.fit(Xbal, ybal) samples = sample_counts > 0. estimators.append(estimator) estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def grow_forest(forest, X, y, seeds, labels=None):
    """Grow a forest of random trees"""
    # Convert data
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    # Make a list container for grown trees
    n_trees = forest.n_estimators
    trees = []
    # For each tree in the forest
    for i in range(n_trees):
        # Make a np.random.RandomState instance from the tree's planting seed
        random_state = check_random_state(seeds[i])
        # Generate a random seed for a branching seed
        seed = random_state.randint(MAX_INT)
        # Make a decision tree object
        tree = forest._make_estimator(append=False)
        # Init the tree's RandomState instance with the generated seed;
        # this will randomize what features the tree will use
        tree.set_params(random_state=check_random_state(seed))
        # If we are bootstrapping
        if forest.bootstrap:
            # If we are given labels
            if labels is not None:
                # Then we need to bootstrap via labels.
                # We can do this by using StratifiedShuffleSplit
                # to gain a random sample from each label
                sss = cross_validation.StratifiedShuffleSplit(
                    labels,
                    n_iter=1,
                    test_size=np.unique(labels).size,
                    random_state=check_random_state(seed))
                # Then we'll bootstrap our X and y for the label samples chosen
                for train, test in sss:
                    X_lbs = X[test]
                    y_lbs = y[test]
                    break
                # Then get the number of samples
                n_samples = X_lbs.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to make a random binning histogram
                # that will sum up to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X_lbs, y_lbs,
                         sample_weight=curr_sample_weight, check_input=False)
                # Then restrict the tree's indices to the samples that had
                # non-zero weights
                tree.indices_ = sample_counts > 0.
            else:
                # Then get the number of samples
                n_samples = X.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to make a random binning histogram
                # that will sum up to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X, y,
                         sample_weight=curr_sample_weight, check_input=False)
                # Then restrict the tree's indices to the samples that had
                # non-zero weights
                tree.indices_ = sample_counts > 0.
        # If we aren't bootstrapping
        else:
            # Just fit the data with no random weights
            tree.fit(X, y, check_input=False)
        # Add the grown tree to the container
        trees.append(tree)
    # Return all of the trained trees
    return trees
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q,
                                       sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job.
    Now it supports queries and query-wise sampling.
    It also breaks the PEP8 line length constraint now."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features
    uQueries = np.unique(Q)
    sample_whole_queries = False
    if hasattr(ensemble, "sample_whole_queries"):
        sample_whole_queries = ensemble.sample_whole_queries

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0):
        if sample_whole_queries:
            max_samples = int(max_samples * len(uQueries))
        else:
            max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])
                else:
                    indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                if sample_whole_queries:
                    notQindices = uQueries[random_state.randint(0, len(uQueries), len(uQueries) - max_samples)]
                    notQindices.sort()
                    not_indices = reduce(np.append, [np.where(Q == i) for i in notQindices])
                else:
                    not_indices = sample_without_replacement(
                        n_samples,
                        n_samples - max_samples,
                        random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, Q=Q, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])
                else:
                    indices = random_state.randint(0, n_samples, max_samples)
            else:
                if sample_whole_queries:
                    Qindices = uQueries[
                        sample_without_replacement(len(uQueries), max_samples,
                                                   random_state=random_state)
                    ]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])
                else:
                    indices = sample_without_replacement(n_samples, max_samples,
                                                         random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices])
            samples = sample_counts > 0.0

        estimators.append(estimator)
estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, verbose): """Private function used to build a batch of estimators within a job.""" # Retrieve settings n_samples, n_features = X.shape max_samples = ensemble.max_samples max_features = ensemble.max_features if (not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0)): max_samples = int(max_samples * n_samples) if (not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0)): max_features = int(max_features * n_features) bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features support_sample_weight = ("sample_weight" in getargspec(ensemble.base_estimator_.fit)[0]) # Build estimators estimators = [] estimators_samples = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("building estimator %d of %d" % (i + 1, n_estimators)) random_state = check_random_state(seeds[i]) seed = check_random_state(random_state.randint(MAX_INT)) estimator = ensemble._make_estimator(append=False) try: # Not all estimator accept a random_state estimator.set_params(random_state=seed) except ValueError: pass # Draw features if bootstrap_features: features = random_state.randint(0, n_features, max_features) else: features = sample_without_replacement(n_features, max_features, random_state=random_state) # Draw samples, using sample weights, and then fit if support_sample_weight: if sample_weight is None: curr_sample_weight = np.ones((n_samples,)) else: curr_sample_weight = sample_weight.copy() if bootstrap: indices = random_state.randint(0, n_samples, max_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: not_indices = sample_without_replacement( n_samples, n_samples - max_samples, random_state=random_state) curr_sample_weight[not_indices] = 0 estimator.fit(X[:, features], y, sample_weight=curr_sample_weight) samples = curr_sample_weight > 0. # Draw samples, using a mask, and then fit else: if bootstrap: indices = random_state.randint(0, n_samples, max_samples) else: indices = sample_without_replacement(n_samples, max_samples, random_state=random_state) sample_counts = bincount(indices, minlength=n_samples) estimator.fit((X[indices])[:, features], y[indices]) samples = sample_counts > 0. estimators.append(estimator) estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def _make_test_folds(self, frame, y): if self.shuffle: rng = check_random_state(self.random_state) else: rng = self.random_state # validate that it's a string y = _val_y(y) # gets a string back or None if y is None: raise ValueError('H2OStratifiedKFold requires a target name (got None)') target = frame[y].as_data_frame(use_pandas=True)[y].values n_samples = target.shape[0] unique_y, y_inversed = np.unique(target, return_inverse=True) y_counts = bincount(y_inversed) min_labels = np.min(y_counts) if np.all(self.n_folds > y_counts): raise ValueError(('All the n_labels for individual classes' ' are less than %d folds.' % self.n_folds), Warning) if self.n_folds > min_labels: warnings.warn(('The least populated class in y has only %d' ' members, which is too few. The minimum' ' number of labels for any class cannot' ' be less than n_folds=%d.' % (min_labels, self.n_folds)), Warning) # NOTE FROM SKLEARN: # pre-assign each sample to a test fold index using individual KFold # splitting strategies for each class so as to respect the balance of # classes # NOTE: Passing the data corresponding to ith class say X[y==class_i] # will break when the data is not 100% stratifiable for all classes. # So we pass np.zeroes(max(c, n_folds)) as data to the KFold. # Remember, however that we might be using the old-fold KFold which doesn't # have a split method... if SK18: per_cls_cvs = [ KFold(self.n_folds, # using sklearn's KFold here shuffle=self.shuffle, random_state=rng).split(np.zeros(max(count, self.n_folds))) for count in y_counts ] else: per_cls_cvs = [ KFold(max(count, self.n_folds), # using sklearn's KFold here self.n_folds, shuffle=self.shuffle, random_state=rng) for count in y_counts ] test_folds = np.zeros(n_samples, dtype=np.int) for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)): for cls, (_, test_split) in zip(unique_y, per_cls_splits): cls_test_folds = test_folds[target == cls] # the test split can be too big because we used # KFold(...).split(X[:max(c, n_folds)]) when data is not 100% # stratifiable for all the classes # (we use a warning instead of raising an exception) # If this is the case, let's trim it: test_split = test_split[test_split < len(cls_test_folds)] cls_test_folds[test_split] = test_fold_indices test_folds[target == cls] = cls_test_folds return test_folds
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, verbose): """Private function used to build a batch of estimators within a job.""" # Retrieve settings n_samples, n_features = X.shape max_samples = ensemble.max_samples max_features = ensemble.max_features if (not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0)): max_samples = int(max_samples * n_samples) if (not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0)): max_features = int(max_features * n_features) bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features support_sample_weight = ("sample_weight" in getargspec(ensemble.base_estimator_.fit)[0]) # Build estimators estimators = [] estimators_samples = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("building estimator %d of %d" % (i + 1, n_estimators)) random_state = check_random_state(seeds[i]) seed = check_random_state(random_state.randint(MAX_INT)) estimator = ensemble._make_estimator(append=False) try: # Not all estimator accept a random_state estimator.set_params(random_state=seed) except ValueError: pass # Draw features if bootstrap_features: features = random_state.randint(0, n_features, max_features) else: features = sample_without_replacement(n_features, max_features, random_state=random_state) # Draw samples, using sample weights, and then fit if support_sample_weight: if sample_weight is None: curr_sample_weight = np.ones((n_samples, )) else: curr_sample_weight = sample_weight.copy() if bootstrap: indices = random_state.randint(0, n_samples, max_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: not_indices = sample_without_replacement( n_samples, n_samples - max_samples, random_state=random_state) curr_sample_weight[not_indices] = 0 estimator.fit(X[:, features], y, sample_weight=curr_sample_weight) samples = curr_sample_weight > 0. # Draw samples, using a mask, and then fit else: if bootstrap: indices = random_state.randint(0, n_samples, max_samples) else: indices = sample_without_replacement(n_samples, max_samples, random_state=random_state) sample_counts = bincount(indices, minlength=n_samples) estimator.fit((X[indices])[:, features], y[indices]) samples = sample_counts > 0. estimators.append(estimator) estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def _parallel_build_estimators(n_estimators, ensemble, all_X, all_y, sample_weight, seeds, verbose): """Private function used to build a batch of estimators within a job.""" positives = np.where(all_y == 1)[0] unlabeled = np.where(all_y == 0)[0] X_positives = all_X[positives] X_unlabeled = all_X[unlabeled] y_positives = all_y[positives] y_unlabeled = all_y[unlabeled] # Retrieve settings n_samples, n_features = X_unlabeled.shape max_samples = ensemble.max_samples max_features = ensemble.max_features if (not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0)): max_samples = int(max_samples * n_samples) if (not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0)): max_features = int(max_features * n_features) bootstrap = ensemble.bootstrap bootstrap_features = ensemble.bootstrap_features #can't currently support sample weights if sample_weight is not None: raise ValueError("Can't currently support sample weight with PUBagging") support_sample_weight = False #support_sample_weight = has_fit_parameter(ensemble.base_estimator_, # "sample_weight") #if not support_sample_weight and sample_weight is not None: # raise ValueError("The base estimator doesn't support sample weight") # Build estimators estimators = [] estimators_samples = [] estimators_features = [] for i in range(n_estimators): if verbose > 1: print("building estimator %d of %d" % (i + 1, n_estimators)) random_state = check_random_state(seeds[i]) seed = check_random_state(random_state.randint(MAX_INT)) estimator = ensemble._make_estimator(append=False) try: # Not all estimator accept a random_state estimator.set_params(random_state=seed) except ValueError: pass # Draw features if bootstrap_features: features = random_state.randint(0, n_features, max_features) else: features = sample_without_replacement(n_features, max_features, random_state=random_state) # Draw samples, using sample weights, and then fit if support_sample_weight: if sample_weight is None: curr_sample_weight = np.ones((n_samples,)) else: curr_sample_weight = sample_weight.copy() if bootstrap: indices = random_state.randint(0, n_samples, max_samples) sample_counts = bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts else: not_indices = sample_without_replacement( n_samples, n_samples - max_samples, random_state=random_state) curr_sample_weight[not_indices] = 0 estimator.fit(all_X[:, features], all_y, sample_weight=curr_sample_weight) samples = curr_sample_weight > 0. # Draw samples, using a mask, and then fit else: if bootstrap: indices = random_state.randint(0, n_samples, max_samples) else: indices = sample_without_replacement(n_samples, max_samples, random_state=random_state) sample_counts = bincount(indices, minlength=n_samples) new_X=np.vstack((X_positives, X_unlabeled[indices])) new_y=np.concatenate((y_positives, y_unlabeled[indices])) estimator.fit(new_X[:, features], new_y) samples = sample_counts > 0. estimators.append(estimator) estimators_samples.append(samples) estimators_features.append(features) return estimators, estimators_samples, estimators_features
def label_ranking_loss(y_true, y_score, sample_weight=None):
    """Compute Ranking loss measure

    Compute the average number of label pairs that are incorrectly ordered
    given y_score weighted by the size of the label set and the number of
    labels not in the label set.

    This is similar to the error set size, but weighted by the number of
    relevant and irrelevant labels. The best performance is achieved with
    a ranking loss of zero.

    Read more in the :ref:`User Guide <label_ranking_loss>`.

    .. versionadded:: 0.17
       A function *label_ranking_loss*

    Parameters
    ----------
    y_true : array or sparse matrix, shape = [n_samples, n_labels]
        True binary labels in binary indicator format.

    y_score : array, shape = [n_samples, n_labels]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    loss : float

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.
    """
    y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    y_type = type_of_target(y_true)
    if y_type not in ("multilabel-indicator",):
        raise ValueError("{0} format is not supported".format(y_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    n_samples, n_labels = y_true.shape

    y_true = csr_matrix(y_true)

    loss = np.zeros(n_samples)
    for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
        # Sort and bin the label scores
        unique_scores, unique_inverse = np.unique(y_score[i],
                                                  return_inverse=True)
        true_at_reversed_rank = bincount(
            unique_inverse[y_true.indices[start:stop]],
            minlength=len(unique_scores))
        all_at_reversed_rank = bincount(unique_inverse,
                                        minlength=len(unique_scores))
        false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

        # if the scores are ordered, it's possible to count the number of
        # incorrectly ordered pairs in linear time by cumulatively counting
        # how many false labels of a given score have a score higher than the
        # accumulated true labels with lower score.
        loss[i] = np.dot(true_at_reversed_rank.cumsum(),
                         false_at_reversed_rank)

    n_positives = count_nonzero(y_true, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        loss /= ((n_labels - n_positives) * n_positives)

    # When there is no positive or no negative labels, those values should
    # be considered correct, i.e. the ranking doesn't matter.
    loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.

    return np.average(loss, weights=sample_weight)
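# Hedged usage sketch for label_ranking_loss above, via the public
# sklearn.metrics API (the snippet above is the same computation with the
# bincount bookkeeping spelled out). Values are small enough to check by hand.
import numpy as np
from sklearn.metrics import label_ranking_loss

y_true_demo = np.array([[1, 0, 0], [0, 0, 1]])
y_score_demo = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
# sample 1: one of the two (true, false) label pairs is mis-ordered -> 0.5
# sample 2: both false labels outscore the true label               -> 1.0
print(label_ranking_loss(y_true_demo, y_score_demo))  # -> 0.75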