def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each cluster: sum the feature vectors of all
    # samples assigned to that cluster, then divide by the cluster size.
    for sample_idx in range(n_samples):
        centers[labels[sample_idx]] += X[sample_idx]

    # Normalize by the size of each cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
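A minimal usage sketch on toy data, assuming numpy is imported as np and the bincount helper resolves to np.bincount:

X = np.array([[0.0, 0.0],
              [2.0, 2.0],
              [4.0, 4.0]])
labels = np.array([0, 1, 1])
centers = _recompute_centers(X, labels, n_clusters=2)
# centers[0] == [0., 0.]; centers[1] == [3., 3.], the mean of the two samples in cluster 1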
Example #2
def check_min_weight_fraction_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain at least min_weight_fraction_leaf of the
    # training set
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # check a range of min_weight_fraction_leaf values
    for frac in np.linspace(0, 0.5, 6):
        est = ForestEstimator(min_weight_fraction_leaf=frac,
                              n_estimators=1,
                              random_state=0)
        if "RandomForest" in name:
            est.bootstrap = False

        est.fit(X, y, sample_weight=weights)
        out = est.estimators_[0].tree_.apply(X)
        node_weights = bincount(out, weights=weights)
        # drop inner nodes
        leaf_weights = node_weights[node_weights != 0]
        assert_greater_equal(
            np.min(leaf_weights), total_weight * est.min_weight_fraction_leaf,
            "Failed with {0} "
            "min_weight_fraction_leaf={1}".format(
                name, est.min_weight_fraction_leaf))
    def fit(self, X):
        """Fit the BM25 model on the given corpus of documents.

        Computes and stores the IDF weights, per-document lengths and the
        average document length; returns nothing.
        """
        X = self.tf_vectorizer.fit_transform(X).toarray()
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        n_samples, n_features = X.shape

        if sp.isspmatrix_csr(X):
            df = bincount(X.indices, minlength=X.shape[1])
        else:
            df = np.diff(sp.csc_matrix(X, copy=False).indptr)

        # Compute IDF weights (the commented-out line is the classic BM25 IDF variant)
        # idf = np.log((float(n_samples) - df + 0.5) / (df + 0.5))
        idf = np.log(float(n_samples) / df) + 1.0
        self._idf_diag = sp.spdiags(idf,
                                    diags=0,
                                    m=n_features,
                                    n=n_features,
                                    format='csr')
        # Compute each document's length and the average length of the corpus
        doc_len = np.sum(X, axis=1)
        self._doc_len = np.reshape(doc_len, (n_samples, 1))
        self._avgdl = np.sum(X) / n_samples
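fit() above only stores the corpus statistics (_idf_diag, _doc_len, _avgdl); a hedged sketch of how a BM25 score could be computed from them for one document and a set of query term indices, with hypothetical k1 and b parameters and a flat idf vector for simplicity (the class itself keeps the IDF weights as a sparse diagonal matrix):

import numpy as np

def bm25_score(query_terms, tf_row, idf, doc_len, avgdl, k1=1.5, b=0.75):
    # Standard BM25 weighting per query term t:
    #   idf[t] * tf[t] * (k1 + 1) / (tf[t] + k1 * (1 - b + b * doc_len / avgdl))
    tf = tf_row[query_terms]
    denom = tf + k1 * (1.0 - b + b * doc_len / avgdl)
    return np.sum(idf[query_terms] * tf * (k1 + 1.0) / denom)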
    def _make_test_folds(self, X, y=None, groups=None):
        if self.shuffle:
            rng = check_random_state(self.random_state)
        else:
            rng = self.random_state
        y = np.asarray(y)
        n_samples = len(X)
        y = ','.join(y).split(',')
        unique_y, y_inversed = np.unique(y, return_inverse=True)
        y_counts = bincount(y_inversed)
        min_groups = np.min(y_counts)
        if np.all(self.n_splits > y_counts):
            raise ValueError("All the n_groups for individual classes"
                             " are less than n_splits=%d."
                             % (self.n_splits))
        if self.n_splits > min_groups:
            warnings.warn(("The least populated class in y has only %d"
                           " members, which is too few. The minimum"
                           " number of groups for any class cannot"
                           " be less than n_splits=%d."
                           % (min_groups, self.n_splits)), Warning)

        # pre-assign each sample to a test fold index using individual KFold
        # splitting strategies for each class so as to respect the balance of
        # classes

        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
        # will break when the data is not 100% stratifiable for all classes.
        # So we pass np.zeros(max(c, n_splits)) as data to the KFold
        test_folds = iterative_stratification(X, set(y), self.n_splits, rng)
        return test_folds
def check_min_weight_fraction_leaf(name, X, y):
    # Test if leaves contain at least min_weight_fraction_leaf of the
    # training set
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        for frac in np.linspace(0, 0.5, 6):
            est = ForestEstimator(min_weight_fraction_leaf=frac,
                                  max_leaf_nodes=max_leaf_nodes,
                                  random_state=0)
            if isinstance(est, (RandomForestClassifier,
                                RandomForestRegressor)):
                est.bootstrap = False
            est.fit(X, y, sample_weight=weights)
            out = est.estimators_[0].tree_.apply(X)
            node_weights = bincount(out, weights=weights)
            # drop inner nodes
            leaf_weights = node_weights[node_weights != 0]
            assert_greater_equal(
                np.min(leaf_weights),
                total_weight * est.min_weight_fraction_leaf,
                "Failed with {0} "
                "min_weight_fraction_leaf={1}".format(
                    name, est.min_weight_fraction_leaf))
Example #6
    def transform(self, X, y=None):
        n = X.shape[0]
        # check if column is in X
        if self.var not in X.columns:
            raise ValueError("X does not contain variable {}".format(self.var))

        # check if column is categorical, if not categorize
        classes, y_indices = np.unique(y, return_inverse=True)
        class_counts = bincount(y_indices)
        if np.min(class_counts) < 2:
            X['%s__cat' % self.var] = pd.cut(X[self.var], self.nbins).cat.codes
            var = '%s__cat' % self.var
        else:
            var = self.var

        # compute budget per bin
        if isinstance(self.sample_size, int):
            budget_per_bin = int(self.sample_size / self.nbins)
        else:
            budget_per_bin = int(self.sample_size / self.nbins * X.shape[0])

        obs_by_cat = X.groupby(var).count().loc[:, self.var]

        keep_indexes = []
        for i in obs_by_cat.index:
            if obs_by_cat.loc[i] <= budget_per_bin:
                keep_indexes += list(np.where(X[var] == i)[0])
            else:
                keep_indexes += random.sample(list(np.where(X[var] == i)[0]),
                                              budget_per_bin)

        return X.iloc[keep_indexes]
Example #7
def check_min_samples_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y)
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y)

    est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_leaf=0.25,
                          n_estimators=1,
                          random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count),
                   len(X) * 0.25 - 1, "Failed with {0}".format(name))
Example #8
    def transform(self, X, y=None):
        # n = X.shape[0]

        # check if column is in X
        if self.var not in X.columns:
            raise ValueError("X does not contain variable {}".format(self.var))

        # check if column is categorical, if not categorize
        classes, y_indices = np.unique(y, return_inverse=True)
        class_counts = bincount(y_indices)
        if np.min(class_counts) < 2:
            X['%s__cat' % self.var] = pd.cut(X[self.var], self.nbins).cat.codes
            var = '%s__cat' % self.var
        else:
            var = self.var

        split = StratifiedShuffleSplit(n_splits=1,
                                       train_size=self.sample_size,
                                       random_state=self.random_state)
        for train_index, test_index in split.split(X, X[var]):
            XX = X.iloc[train_index, :]
            # XX = X.loc[test_index]

        if '%s__cat' % self.var in X.columns:
            XX.drop('%s__cat' % self.var, axis=1, inplace=True)

        return XX
def check_min_samples_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=-1).fit, X, y)
    assert_raises(ValueError, ForestEstimator(min_samples_leaf=0).fit, X, y)

    est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), len(X) * 0.25 - 1, "Failed with {0}".format(name))
def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
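A small toy illustration of the bootstrap re-weighting step above: indices drawn with replacement are turned into per-sample draw counts, which then scale the sample weights (np.bincount used here in place of the bincount compatibility helper):

import numpy as np

rng = np.random.RandomState(0)
n_samples = 5
indices = rng.randint(0, n_samples, n_samples)           # bootstrap draw with replacement
sample_counts = np.bincount(indices, minlength=n_samples)
curr_sample_weight = np.ones(n_samples, dtype=np.float64) * sample_counts
# A sample drawn k times gets weight k; samples never drawn get weight 0
# and form the out-of-bag set for this tree.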
Example #11
def _parallel_build_trees(n_trees, forest, X, y, sample_weight, seeds,
                          verbose):
    """Private function used to build a batch of trees within a job."""
    trees = []

    for i in range(n_trees):
        random_state = check_random_state(seeds[i])
        if verbose > 1:
            print("building tree %d of %d" % (i + 1, n_trees))
        seed = random_state.randint(MAX_INT)

        tree = forest._make_estimator(append=False)
        tree.set_params(random_state=check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            indices = random_state.randint(0, n_samples, n_samples)
            sample_counts = bincount(indices, minlength=n_samples)
            curr_sample_weight *= sample_counts

            tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)

            tree.indices_ = sample_counts > 0.

        else:
            tree.fit(X, y, sample_weight=sample_weight, check_input=False)

        trees.append(tree)

    return trees
def _generate_unsampled_indices(random_state, n_samples):
    '''Return the indices of the out-of-bag (unsampled) observations.'''
    sample_indices = _generate_sample_indices(random_state, n_samples)
    sample_counts = bincount(sample_indices, minlength=n_samples)
    unsampled_mask = sample_counts == 0
    indices_range = np.arange(n_samples)
    unsampled_indices = indices_range[unsampled_mask]
    return unsampled_indices
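For reference, a sketch of the companion _generate_sample_indices helper that the function above relies on, written in the shape it takes in older scikit-learn forest code (treat it as illustrative, not the exact library source):

from sklearn.utils import check_random_state

def _generate_sample_indices(random_state, n_samples):
    # Draw a bootstrap sample: n_samples indices drawn uniformly with replacement
    random_instance = check_random_state(random_state)
    sample_indices = random_instance.randint(0, n_samples, n_samples)
    return sample_indices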
Example #13
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight,
                                   sample_mask, X_argsorted, seed, verbose):
    """Private function used to build a batch of trees within a job"""
    from sklearn.utils import check_random_state
    from sklearn.utils.fixes import bincount
    import random
    MAX_INT = numpy.iinfo(numpy.int32).max
    random_state = check_random_state(seed)

    trees = []
    for i in xrange(n_trees):
        if verbose > 1:
            print("building tree %d of %d" % (i + 1, n_trees))
        seed = random_state.randint(MAX_INT)

        tree = forest._make_estimator(append=False)
        tree.set_params(compute_importances=forest.compute_importances)
        tree.set_params(random_state=check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = numpy.ones((n_samples, ),
                                                dtype=numpy.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            ty = list(enumerate(y))
            indices = DataUtils.FilterData(ty,
                                           val=1,
                                           frac=0.5,
                                           col=1,
                                           indicesToUse=0,
                                           indicesOnly=1)[0]
            indices2 = random_state.randint(0, len(indices), len(indices))
            indices = [indices[j] for j in indices2]
            sample_counts = bincount(indices, minlength=n_samples)

            curr_sample_weight *= sample_counts
            curr_sample_mask = sample_mask.copy()
            curr_sample_mask[sample_counts == 0] = False

            tree.fit(X,
                     y,
                     sample_weight=curr_sample_weight,
                     sample_mask=curr_sample_mask,
                     X_argsorted=X_argsorted,
                     check_input=False)
            tree.indices = curr_sample_mask
        else:
            tree.fit(X,
                     y,
                     sample_weight=sample_weight,
                     sample_mask=sample_mask,
                     X_argsorted=X_argsorted,
                     check_input=False)
        trees.append(tree)
    return trees
def _generate_unsampled_indices(random_state, n_samples):
    """Private function used to forest._set_oob_score function."""
    sample_indices = _generate_sample_indices(random_state, n_samples)
    sample_counts = bincount(sample_indices, minlength=n_samples)
    unsampled_mask = sample_counts == 0
    indices_range = np.arange(n_samples)
    unsampled_indices = indices_range[unsampled_mask]

    return unsampled_indices
    def entropy(samples):
        n_samples = len(samples)
        entropy = 0.

        for count in bincount(samples):
            p = 1. * count / n_samples
            if p > 0:
                entropy -= p * np.log2(p)

        return entropy
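For instance, an even two-way split of the labels yields an entropy of one bit (a toy check, assuming the helper is accessible at call time and bincount resolves to np.bincount):

print(entropy([0, 0, 1, 1]))   # -> 1.0
print(entropy([0, 0, 0, 0]))   # -> 0.0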
Example #17
def compute_class_weight(labelPath):
    with open(labelPath, 'r') as text_file:
        content = text_file.readlines()
    content = np.asarray(content)
    y = np.asarray(
        [int(sample.split(' ')[2].strip('\n')) for sample in content])

    classes = np.asarray(list(set(y)))
    le = LabelEncoder()
    y_ind = le.fit_transform(y)
    recip_freq = len(y) / (len(le.classes_) *
                           bincount(y_ind).astype(np.float64))
    weight = recip_freq[le.transform(classes)]
    return weight
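The reciprocal-frequency expression above is the same heuristic scikit-learn uses for 'balanced' class weights; a toy check with an in-memory label array (hypothetical values, with np.bincount standing in for the bincount import):

import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array([0, 0, 0, 1])                      # imbalanced toy labels
le = LabelEncoder()
y_ind = le.fit_transform(y)
recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
# recip_freq == [4 / (2 * 3), 4 / (2 * 1)] == [0.667, 2.0]; rarer classes get larger weights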
Example #18
def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 200)

    clf = DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_almost_equal(clf.tree_.threshold[internal],
                              clf2.tree_.threshold[internal])
def check_min_samples_leaf(name, X, y):
    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        est = ForestEstimator(min_samples_leaf=5,
                              max_leaf_nodes=max_leaf_nodes,
                              random_state=0)
        est.fit(X, y)
        out = est.estimators_[0].tree_.apply(X)
        node_counts = bincount(out)
        # drop inner nodes
        leaf_count = node_counts[node_counts != 0]
        assert_greater(np.min(leaf_count), 4, "Failed with {0}".format(name))
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each cluster: for every sample, add its feature
    # vector to the running total of the cluster it is assigned to.
    for sample_idx in xrange(n_samples):
        label = labels[sample_idx]
        centers[label] += X[sample_idx]


    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
def check_min_samples_leaf(name, X, y):
    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        est = ForestEstimator(min_samples_leaf=5,
                              max_leaf_nodes=max_leaf_nodes,
                              random_state=0)
        est.fit(X, y)
        out = est.estimators_[0].tree_.apply(X)
        node_counts = bincount(out)
        # drop inner nodes
        leaf_count = node_counts[node_counts != 0]
        assert_greater(np.min(leaf_count), 4,
                       "Failed with {0}".format(name))
Example #23
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each cluster by summing the samples assigned to it
    # For each sample
    for sample_idx in xrange(n_samples):
        # What label is it? Let's say its label is 'label'
        label = labels[sample_idx]
        # Add feature i to label X's feature value i
        centers[label] += X[sample_idx]
        # for j in xrange(n_features):
        #     centers[label][j] += X[sample_idx][j]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Example #24
    def fit(self, X, y, tol=None):
        """Fit the model according to the given training data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array, shape = [n_samples]
            Target values (integers)
        """

        # X, y = check_X_y(X, y)
        if type_of_target(y) not in ['binary', 'multiclass']:
            raise ValueError("Unknown label type: %r" % type_of_target(y))
        self.classes_, y = np.unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        self.startprob_ = (bincount(y) + 1.0) / (len(y) + n_classes)
        transmat = np.zeros((n_classes, n_classes))
        for i in xrange(len(y) - 1):
            transmat[y[i], y[i + 1]] += 1
        transmat = (transmat.transpose() / np.sum(transmat, 1)).transpose()
        self.transmat_ = transmat
        pseudo_rows = np.tile(self.pseudo_rssi_list,
                              (X.shape[1], 1)).transpose()
        means = []
        covars = []
        miss_probs = []
        for cl in xrange(n_classes):
            X_cl = np.concatenate((X[y == cl, :], pseudo_rows), 0)
            miss_probs_cl = np.mean(np.isnan(X_cl), 0)
            mean_cl = np.nanmean(X_cl, 0)
            covar_cl = np.diag(np.nanvar(X_cl, 0, ddof=1))
            miss_probs.append(miss_probs_cl)
            means.append(mean_cl)
            covars.append(covar_cl)
        self.miss_probs_ = np.asarray(miss_probs)
        self.means_ = np.asarray(means)
        self.covars_ = np.asarray(covars)
        return self
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight, sample_mask, X_argsorted, seed, verbose):
    """Private function used to build a batch of trees within a job"""
    from sklearn.utils import check_random_state
    from sklearn.utils.fixes import bincount
    import random
    MAX_INT = numpy.iinfo(numpy.int32).max
    random_state = check_random_state(seed)

    trees = []
    for i in xrange(n_trees):
        if verbose > 1:
            print("building tree %d of %d" % (i+1, n_trees))
        seed = random_state.randint(MAX_INT)

        tree = forest._make_estimator(append=False)
        tree.set_params(compute_importances=forest.compute_importances)
        tree.set_params(random_state=check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = numpy.ones((n_samples,), dtype=numpy.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            ty = list(enumerate(y))
            indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1, indicesToUse=0, indicesOnly=1)[0]
            indices2 = random_state.randint(0, len(indices), len(indices))
            indices = [indices[j] for j in indices2]
            sample_counts = bincount(indices, minlength=n_samples)

            curr_sample_weight *= sample_counts
            curr_sample_mask = sample_mask.copy()
            curr_sample_mask[sample_counts==0] = False

            tree.fit(X, y, sample_weight=curr_sample_weight, sample_mask=curr_sample_mask, X_argsorted=X_argsorted, check_input=False)
            tree.indices = curr_sample_mask
        else:
            tree.fit(X, y, sample_weight=sample_weight, sample_mask=sample_mask, X_argsorted=X_argsorted, check_input=False)
        trees.append(tree)
    return trees
    def _iter_indices(self):
        rng = np.random.RandomState(self.random_state)
        cls_count = bincount(self.y_indices)

        for n in range(self.n_iter):
            train = []
            test = []

            for i, cls in enumerate(self.classes):
                sample_size = int(cls_count[i]*(1-self.test_size))
                randint = rng.randint(cls_count[i], size=sample_size)
                aidx = np.where((self.y == cls))[0]
                iidx = aidx[randint]
                oidx = aidx[list(set(range(cls_count[i])).difference(set(randint)))]

                train.extend(iidx)
                test.extend(oidx)

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each cluster:
    # 1. For each sample, look up its assigned label.
    # 2. Add the sample's feature vector to that cluster's running total.
    for sample_idx in range(n_samples):
        label = labels[sample_idx]
        centers[label] += X[sample_idx]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Example #28
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """

    n_samples = X.shape[0]
    n_features = X.shape[1]
   
    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)


    # Compute a center for each cluster by summing the features of its samples
    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Example #29
    def _prefit(self, X, y):
        '''Doc String'''

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = bincount(y) / float(n_samples)
        else:
            self.priors_ = self.priors

        if (self.priors_ < 0).any():
            raise ValueError("priors must be non-negative")
        if self.priors_.sum() != 1:
            warnings.warn("The priors do not sum to 1. Renormalizing",
                          UserWarning)
            self.priors_ = self.priors_ / self.priors_.sum()

        return X, y
    def _iter_indices(self):
        rng = np.random.RandomState(self.random_state)
        cls_count = bincount(self.y_indices)

        for n in range(self.n_iter):
            train = []
            test = []

            for i, cls in enumerate(self.classes):
                sample_size = int(cls_count[i] * (1 - self.test_size))
                randint = rng.randint(cls_count[i], size=sample_size)
                aidx = np.where((self.y == cls))[0]
                iidx = aidx[randint]
                oidx = aidx[list(
                    set(range(cls_count[i])).difference(set(randint)))]

                train.extend(iidx)
                test.extend(oidx)

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
def sensitivity_specificity_support(y_true,
                                    y_pred,
                                    labels=None,
                                    pos_label=1,
                                    average=None,
                                    warn_for=('sensitivity', 'specificity'),
                                    sample_weight=None):
    """Compute sensitivity, specificity, and support for each class

    The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
    of true positives and ``fn`` the number of false negatives. The sensitivity
    quantifies the ability to avoid false negatives [1]_.

    The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number
    of true negatives and ``fp`` the number of false positives. The specificity
    quantifies the ability to avoid false positives [1]_.

    The support is the number of occurrences of each class in ``y_true``.

    If ``pos_label is None`` and the target is binary, this function returns
    the average sensitivity and specificity when ``average`` is
    ``'weighted'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, optional (default=None)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).
    warn_for : tuple or set, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    Returns
    -------
    sensitivity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    specificity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    support : int (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )
        The number of occurrences of each label in ``y_true``.

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.metrics import sensitivity_specificity_support
    >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    >>> sensitivity_specificity_support(y_true, y_pred, average='macro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='micro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='weighted')
    (0.33333333333333331, 0.66666666666666663, None)

    References
    ----------
    .. [1] `Wikipedia entry for the Sensitivity and specificity
           <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_

    """
    average_options = (None, 'micro', 'macro', 'weighted', 'samples')
    if average not in average_options and average != 'binary':
        raise ValueError('average has to be one of ' + str(average_options))

    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    present_labels = unique_labels(y_true, y_pred)

    if average == 'binary':
        if y_type == 'binary':
            if pos_label not in present_labels:
                if len(present_labels) < 2:
                    # Only negative labels
                    return (0., 0., 0)
                else:
                    raise ValueError("pos_label=%r is not a valid label: %r" %
                                     (pos_label, present_labels))
            labels = [pos_label]
        else:
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn("Note that pos_label (set to %r) is ignored when "
                      "average != 'binary' (got %r). You may use "
                      "labels=[pos_label] to specify a single positive class."
                      % (pos_label, average), UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels, np.setdiff1d(
                present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum

    if y_type.startswith('multilabel'):
        raise ValueError('imblearn does not support multilabel')
    elif average == 'samples':
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = bincount(
                tp_bins, weights=tp_bins_weights, minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = bincount(
                y_pred, weights=sample_weight, minlength=len(labels))
        if len(y_true):
            true_sum = bincount(
                y_true, weights=sample_weight, minlength=len(labels))

        # Compute the true negative
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #

    with np.errstate(divide='ignore', invalid='ignore'):
        # Divide, and on zero-division, set scores to 0 and warn:

        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum,
                                  'specificity', 'predicted', average,
                                  warn_for)
        sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true',
                                  average, warn_for)

    # Average the results

    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
Example #32
# -*- coding: utf-8 -*-
def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(sp.csc_matrix(X, copy=False).indptr)
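A quick check of the helper on a small sparse matrix (with np.bincount standing in for the bincount compatibility import used above):

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[1, 0, 2],
                            [0, 0, 3],
                            [4, 5, 0]]))
print(_document_frequency(X))   # [2 1 2]: number of documents containing each term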
Example #35
    def _iter_indices(self, frame, y):
        """Iterate the indices with stratification.

        Parameters
        ----------

        frame : H2OFrame
            The frame to split

        y : string
            The column to stratify.

        Returns
        -------

        train : np.ndarray, shape=(n_samples,)
            The train indices

        test : np.ndarray, shape=(n_samples,)
            The test indices
        """
        n_samples = frame.shape[0]
        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  self.test_size, self.train_size)

        # need to validate y...
        y = _val_y(y)
        target = np.asarray(frame[y].as_data_frame(use_pandas=True)[y].tolist())

        classes, y_indices = np.unique(target, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = bincount(y_indices)
        if np.min(class_counts) < 2:
            raise ValueError('The least populated class in y has only 1 '
                             'member, which is too few. The minimum number of labels '
                             'for any class cannot be less than 2.')

        if n_train < n_classes:
            raise ValueError('The train_size=%d should be greater than or '
                             'equal to the number of classes=%d' % (n_train, n_classes))

        if n_test < n_classes:
            raise ValueError('The test_size=%d should be greater than or '
                             'equal to the number of classes=%d' % (n_test, n_classes))

        rng = check_random_state(self.random_state)
        p_i = class_counts / float(n_samples)
        n_i = np.round(n_train * p_i).astype(int)
        t_i = np.minimum(class_counts - n_i, np.round(n_test * p_i).astype(int))

        for _ in range(self.n_splits):
            train = []
            test = []

            for i, class_i in enumerate(classes):
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = np.where((target == class_i))[0][permutation]

                train.extend(perm_indices_class_i[:n_i[i]])
                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

            # Might end up here with less samples in train and test than we asked
            # for, due to rounding errors.
            if len(train) + len(test) < n_train + n_test:
                missing_indices = np.where(bincount(train + test, minlength=len(target)) == 0)[0]
                missing_indices = rng.permutation(missing_indices)
                n_missing_train = n_train - len(train)
                n_missing_test = n_test - len(test)

                if n_missing_train > 0:
                    train.extend(missing_indices[:n_missing_train])
                if n_missing_test > 0:
                    test.extend(missing_indices[-n_missing_test:])

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test
Example #36
def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 1000)

    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    from sklearn.utils.fixes import bincount
    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = tree.DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_equal(clf.tree_.threshold[internal],
                       clf2.tree_.threshold[internal])

    # Test negative weights
    X = iris.data
    y = iris.target

    sample_weight = -np.ones(X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(X.shape[0])
    sample_weight[0] = -1
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    # Check that predict_proba returns valid probabilities in the presence of
    # samples with negative weight
    X = iris.data
    y = iris.target

    sample_weight = rng.normal(.5, 1.0, X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)
    proba = clf.predict_proba(X)
    assert (proba >= 0).all() and (proba <= 1).all()
Example #37
def geometric_mean_score(y_true,
                         y_pred,
                         labels=None,
                         pos_label=1,
                         average='multiclass',
                         sample_weight=None,
                         correction=0.0):
    """Compute the geometric mean

    The geometric mean (G-mean) is the root of the product of class-wise
    sensitivity. This measure tries to maximize the accuracy on each of the
    classes while keeping these accuracies balanced. For binary classification
    G-mean is the square root of the product of the sensitivity
    and specificity. For multi-class problems it is a higher root of the
    product of sensitivity for each class.

    For compatibility with other imbalance performance measures, G-mean can be
    calculated for each class separately on a one-vs-rest basis when
    ``average != 'multiclass'``.

    The best value is 1 and the worst value is 0. Traditionally, if at least
    one class is unrecognized by the classifier, G-mean resolves to zero. To
    alleviate this property, for highly multi-class problems the sensitivity
    of unrecognized classes can be "corrected" to a user-specified value
    (instead of zero). This option works only if ``average == 'multiclass'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, optional (default=``'multiclass'``)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    correction: float, optional (default=0.0)
        Substitutes sensitivity of unrecognized classes from zero to a given
        value.

    Returns
    -------
    geometric_mean : float

    Examples
    --------
    >>> from imblearn.metrics import geometric_mean_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> geometric_mean_score(y_true, y_pred)
    0.0
    >>> geometric_mean_score(y_true, y_pred, correction=0.001)
    0.010000000000000004
    >>> geometric_mean_score(y_true, y_pred, average='macro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='micro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='weighted')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average=None)
    array([ 0.8660254,  0.       ,  0.       ])

    References
    ----------
    .. [1] Kubat, M. and Matwin, S. "Addressing the curse of
       imbalanced training sets: one-sided selection" ICML (1997)

    .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies
       for learning in class imbalance problems", Pattern Recognition,
       36(3), (2003), pp 849-851.

    """
    if average is None or average != 'multiclass':
        sen, spe, _ = sensitivity_specificity_support(
            y_true,
            y_pred,
            labels=labels,
            pos_label=pos_label,
            average=average,
            warn_for=('sensitivity', 'specificity'),
            sample_weight=sample_weight)

        LOGGER.debug('The sensitivity and specificity are : %s - %s' %
                     (sen, spe))

        return np.sqrt(sen * spe)
    else:
        present_labels = unique_labels(y_true, y_pred)

        if labels is None:
            labels = present_labels
            n_labels = None
        else:
            n_labels = len(labels)
            labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
                                                     assume_unique=True)])

        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]

        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = bincount(tp_bins, weights=tp_bins_weights,
                              minlength=len(labels))
        else:
            # Pathological case
            true_sum = tp_sum = np.zeros(len(labels))
        if len(y_true):
            true_sum = bincount(y_true, weights=sample_weight,
                                minlength=len(labels))

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]

        recall = _prf_divide(tp_sum, true_sum, "recall", "true", None,
                             "recall")
        recall[recall == 0] = correction

        return sp.stats.mstats.gmean(recall)
def lda(X, y, tol=0.00001, n_components=None):
    """SVD solver.
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.
    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Target values.
    """
    X, y = check_X_y(X, y)
    _, y_t = np.unique(y, return_inverse=True)
    priors_ = bincount(y_t) / float(len(y))
    classes_ = np.unique(y)
    n_samples, n_features = X.shape
    n_classes = len(classes_)

    print_dynamic('Calculate Class Mean')
    means_ = _class_means(X, y)

    Xc = []
    for idx, group in enumerate(classes_):
        Xg = X[y == group, :]
        Xc.append(Xg - means_[idx])

    xbar_ = np.dot(priors_, means_)

    Xc = np.concatenate(Xc, axis=0)

    print_dynamic('# 1) within (univariate) scaling by within-class std-dev')

    std = Xc.std(axis=0)
    # avoid division by zero in normalization
    std[std == 0] = 1.
    fac = 1. / (n_samples - n_classes)

    print_dynamic('# 2) Within variance scaling')
    X = np.sqrt(fac) * (Xc / std)
    # SVD of centered (within)scaled data
    U, S, V = linalg.svd(X, full_matrices=False)

    rank = np.sum(S > tol)
    if rank < n_features:
        warnings.warn("Variables are collinear.")
    # Scaling of within covariance is: V' 1/S
    scalings = (V[:rank] / std).T / S[:rank]

    print_dynamic('# 3) Between variance scaling')
    # Scale weighted centers
    X = np.dot(((np.sqrt((n_samples * priors_) * fac)) * (means_ - xbar_).T).T,
               scalings)
    # Centers are living in a space with n_classes-1 dim (maximum)
    # Use SVD to find projection in the space spanned by the
    # (n_classes) centers
    _, S, V = linalg.svd(X, full_matrices=0)

    rank = np.sum(S > tol * S[0])
    scalings_ = np.dot(scalings, V.T[:, :rank])
    coef = np.dot(means_ - xbar_, scalings_)
    intercept_ = (-0.5 * np.sum(coef**2, axis=1) + np.log(priors_))
    coef_ = np.dot(coef, scalings_.T)
    intercept_ -= np.dot(xbar_, coef_.T)

    return intercept_, coef_, classes_
Example #39
def sensitivity_specificity_support(y_true,
                                    y_pred,
                                    labels=None,
                                    pos_label=1,
                                    average=None,
                                    warn_for=('sensitivity', 'specificity'),
                                    sample_weight=None):
    """Compute sensitivity, specificity, and support for each class

    The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
    of true positives and ``fn`` the number of false negatives. The sensitivity
    quantifies the ability to avoid false negatives [1]_.

    The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number
    of true negatives and ``fp`` the number of false positives. The specificity
    quantifies the ability to avoid false positives [1]_.

    The support is the number of occurrences of each class in ``y_true``.

    In binary classification, when ``pos_label is None``, this function
    returns the average sensitivity and specificity if ``average`` is
    ``'weighted'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, optional (default=None)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).
    warn_for : tuple or set, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    Returns
    -------
    sensitivity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    specificity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    support : int (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )
        The number of occurrences of each label in ``y_true``.

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.metrics import sensitivity_specificity_support
    >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    >>> sensitivity_specificity_support(y_true, y_pred, average='macro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='micro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='weighted')
    (0.33333333333333331, 0.66666666666666663, None)

    References
    ----------
    .. [1] `Wikipedia entry for the Sensitivity and specificity
           <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_

    """
    average_options = (None, 'micro', 'macro', 'weighted', 'samples')
    if average not in average_options and average != 'binary':
        raise ValueError('average has to be one of ' + str(average_options))

    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    present_labels = unique_labels(y_true, y_pred)

    if average == 'binary':
        if y_type == 'binary':
            if pos_label not in present_labels:
                if len(present_labels) < 2:
                    # Only negative labels
                    return (0., 0., 0)
                else:
                    raise ValueError("pos_label=%r is not a valid label: %r" %
                                     (pos_label, present_labels))
            labels = [pos_label]
        else:
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn("Note that pos_label (set to %r) is ignored when "
                      "average != 'binary' (got %r). You may use "
                      "labels=[pos_label] to specify a single positive class."
                      % (pos_label, average), UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels, np.setdiff1d(
                present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum ###

    if y_type.startswith('multilabel'):
        raise ValueError('imblearn does not support multilabel')
    elif average == 'samples':
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = bincount(
                tp_bins, weights=tp_bins_weights, minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = bincount(
                y_pred, weights=sample_weight, minlength=len(labels))
        if len(y_true):
            true_sum = bincount(
                y_true, weights=sample_weight, minlength=len(labels))

        # Compute the true negative
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #

    with np.errstate(divide='ignore', invalid='ignore'):
        # Divide, and on zero-division, set scores to 0 and warn:

        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum,
                                  'specificity', 'predicted', average,
                                  warn_for)
        sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true',
                                  average, warn_for)

    # Average the results

    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
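The true-negative identity used above, ``tn = n - (pred + true - tp)``, can be checked by hand on a tiny made-up label vector; this is a sketch using plain ``np.bincount`` rather than the compat ``bincount`` imported elsewhere in this module.

# Hedged hand check of the true-negative identity; labels are made up.
import numpy as np

y_true_demo = np.array([0, 0, 1, 1, 2])
y_pred_demo = np.array([0, 1, 1, 2, 2])

n = y_true_demo.size
tp = np.bincount(y_true_demo[y_true_demo == y_pred_demo], minlength=3)
pred = np.bincount(y_pred_demo, minlength=3)
true = np.bincount(y_true_demo, minlength=3)
tn = n - (pred + true - tp)
# tp = [1, 1, 1], pred = [1, 2, 2], true = [2, 2, 1] -> tn = [3, 2, 3]:
# for each class, everything neither predicted as it nor actually it.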
Exemple #40
0
def _parallel_build_balanced_estimators(n_estimators, ensemble, X, y, seeds,
                                        verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer))
            and (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer))
            and (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = random_state.randint(MAX_INT)
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        if bootstrap:
            indices = random_state.randint(0, n_samples, max_samples)
        else:
            indices = sample_without_replacement(n_samples,
                                                 max_samples,
                                                 random_state=random_state)

        sample_counts = bincount(indices, minlength=n_samples)

        # TEF: Main change in this next call to _downsample
        (Xbal, ybal) = _downsample((X[indices])[:, features], y[indices])
        estimator.fit(Xbal, ybal)
        samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
def grow_forest(forest, X, y, seeds, labels=None):
    """Grow a forest of random trees"""
    # Convert data
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    # Make a list container for grown trees
    n_trees = forest.n_estimators
    trees = []
    # For each tree in the forest
    for i in range(n_trees):
        # Make a np.random.RandomState instance from the tree's planting seed
        random_state = check_random_state(seeds[i])
        # generate a random seed for a branching seed
        seed = random_state.randint(MAX_INT)
        # Make a decision tree object
        tree = forest._make_estimator(append=False)
        # Init the tree's RandomState instance with generated seed
        # this will randomize what features the tree will use
        tree.set_params(random_state=check_random_state(seed))
        # If we are bootstrapping
        if forest.bootstrap:
            # If we are given labels
            if labels is not None:
                # Then need to bootstrap via labels
                # We can do this by using StratifiedShuffleSplit
                # to gain a random sample from each label
                sss = cross_validation.StratifiedShuffleSplit(labels, 
                                             n_iter=1, 
                                             test_size=np.unique(labels).size, 
                                             random_state=check_random_state(seed))
                # Then we'll bootstrap our X and y for the label samples chosen
                for train, test in sss:
                    X_lbs = X[test]
                    y_lbs = y[test]
                    break
                
                # Then get the number of samples
                n_samples = X_lbs.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples, with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to turn the drawn indices into a random binning
                # histogram whose counts sum to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X_lbs, y_lbs, sample_weight=curr_sample_weight, check_input=False)
                # Then set the indices of the tree only to the samples that had non-zero weights
                tree.indices_ = sample_counts > 0.
            else:
                # Then get the number of samples
                n_samples = X.shape[0]
                # To generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples, with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to turn the drawn indices into a random binning
                # histogram whose counts sum to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
                # Then set the indices of the tree only to the samples that had non-zero weights
                tree.indices_ = sample_counts > 0.
        # If we aren't bootstrapping
        else:
            # This just fits the data without random weights
            tree.fit(X, y, check_input=False)
        # Add the grown tree to the container 
        trees.append(tree)
    # return all of the trained trees
    return trees
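The bootstrap-by-weights trick used in both branches above can be sketched in isolation: drawing n_samples indices with replacement and binning them gives per-sample counts that sum to n_samples, so multiplying a uniform weight vector by those counts is equivalent to fitting on the duplicated bootstrap sample, and ``counts > 0`` marks the in-bag samples.

# Hedged stand-alone sketch of the bootstrap sample-weight trick used above.
import numpy as np

rng = np.random.RandomState(0)
n_samples = 10
indices = rng.randint(0, n_samples, n_samples)             # draw with replacement
sample_counts = np.bincount(indices, minlength=n_samples)  # histogram over samples
assert sample_counts.sum() == n_samples

curr_sample_weight = np.ones(n_samples) * sample_counts    # 0 for out-of-bag samples
in_bag = sample_counts > 0                                  # samples the tree actually saw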
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q, sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job.
    It now supports queries and query-wise sampling.
    It also breaks the PEP8 line length constraint."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features
    uQueries = np.unique(Q)

    sample_whole_queries = False
    if hasattr(ensemble, "sample_whole_queries"):
        sample_whole_queries = ensemble.sample_whole_queries

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and (0.0 < max_samples <= 1.0):
        if sample_whole_queries:
            max_samples = int(max_samples * len(uQueries))
        else:
            max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap

    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_, "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features, random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                if sample_whole_queries:
                    notQindices = uQueries[random_state.randint(0, len(uQueries), len(uQueries) - max_samples)]
                    notQindices.sort()
                    not_indices = reduce(np.append, [np.where(Q == i) for i in notQindices])
                else:
                    not_indices = sample_without_replacement(
                        n_samples, n_samples - max_samples, random_state=random_state
                    )

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, Q=Q, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries), max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = random_state.randint(0, n_samples, max_samples)
            else:
                if sample_whole_queries:
                    Qindices = uQueries[
                        sample_without_replacement(len(uQueries), max_samples, random_state=random_state)
                    ]
                    Qindices.sort()
                    indices = reduce(np.append, [np.where(Q == i) for i in Qindices])

                else:
                    indices = sample_without_replacement(n_samples, max_samples, random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices])
            samples = sample_counts > 0.0

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
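The ``sample_whole_queries`` branches above first draw query ids and then expand them to row indices, so that every sampled query is kept intact; a minimal sketch of that expansion on a made-up ``Q`` vector:

# Hedged sketch of expanding sampled query ids into row indices (Q is made up).
import numpy as np
from functools import reduce

Q = np.array([0, 0, 1, 1, 1, 2, 2])      # query id of each row
uQueries = np.unique(Q)
rng = np.random.RandomState(0)

Qindices = uQueries[rng.randint(0, len(uQueries), 2)]  # sample 2 queries with replacement
Qindices.sort()
indices = reduce(np.append, [np.where(Q == i) for i in Qindices])
# indices now lists every row of every sampled query, so X[indices], y[indices]
# and Q[indices] keep whole queries together.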
Exemple #43
0
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = ("sample_weight" in
                             getargspec(ensemble.base_estimator_.fit)[0])

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Exemple #44
0
    def _make_test_folds(self, frame, y):
        if self.shuffle:
            rng = check_random_state(self.random_state)
        else:
            rng = self.random_state

        # validate that it's a string
        y = _val_y(y)  # gets a string back or None
        if y is None:
            raise ValueError('H2OStratifiedKFold requires a target name (got None)')

        target = frame[y].as_data_frame(use_pandas=True)[y].values
        n_samples = target.shape[0]
        unique_y, y_inversed = np.unique(target, return_inverse=True)
        y_counts = bincount(y_inversed)
        min_labels = np.min(y_counts)

        if np.all(self.n_folds > y_counts):
            raise ValueError(('All the n_labels for individual classes'
                              ' are less than %d folds.'
                              % self.n_folds), Warning)
        if self.n_folds > min_labels:
            warnings.warn(('The least populated class in y has only %d'
                           ' members, which is too few. The minimum'
                           ' number of labels for any class cannot'
                           ' be less than n_folds=%d.'
                           % (min_labels, self.n_folds)), Warning)

        # NOTE FROM SKLEARN:

        # pre-assign each sample to a test fold index using individual KFold
        # splitting strategies for each class so as to respect the balance of
        # classes
        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
        # will break when the data is not 100% stratifiable for all classes.
        # So we pass np.zeros(max(c, n_folds)) as data to the KFold.

        # Remember, however, that we might be using the pre-0.18 KFold, which
        # doesn't have a split method...
        if SK18:
            per_cls_cvs = [
                KFold(self.n_folds,  # using sklearn's KFold here
                      shuffle=self.shuffle,
                      random_state=rng).split(np.zeros(max(count, self.n_folds)))
                for count in y_counts
                ]
        else:
            per_cls_cvs = [
                KFold(max(count, self.n_folds),  # using sklearn's KFold here
                      self.n_folds,
                      shuffle=self.shuffle,
                      random_state=rng)
                for count in y_counts
                ]

        test_folds = np.zeros(n_samples, dtype=int)
        for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
            for cls, (_, test_split) in zip(unique_y, per_cls_splits):
                cls_test_folds = test_folds[target == cls]

                # the test split can be too big because we used
                # KFold(...).split(X[:max(c, n_folds)]) when data is not 100%
                # stratifiable for all the classes
                # (we use a warning instead of raising an exception)
                # If this is the case, let's trim it:
                test_split = test_split[test_split < len(cls_test_folds)]
                cls_test_folds[test_split] = test_fold_indices
                test_folds[target == cls] = cls_test_folds

        return test_folds
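A compact, runnable sketch of the per-class fold assignment performed above (sklearn >= 0.18 KFold API, made-up target vector): each class gets its own KFold over a dummy array of length max(count, n_folds), and the k-th test chunk of every class is tagged with fold index k.

# Hedged sketch of stratified fold assignment; target vector is made up.
import numpy as np
from sklearn.model_selection import KFold

target = np.array([0, 0, 0, 0, 1, 1])
n_folds = 2
unique_y, y_inversed = np.unique(target, return_inverse=True)
y_counts = np.bincount(y_inversed)

per_cls_cvs = [KFold(n_folds).split(np.zeros(max(count, n_folds)))
               for count in y_counts]

test_folds = np.zeros(target.shape[0], dtype=int)
for fold_idx, per_cls_splits in enumerate(zip(*per_cls_cvs)):
    for cls, (_, test_split) in zip(unique_y, per_cls_splits):
        cls_test_folds = test_folds[target == cls]
        # trim splits that overshoot when a class has fewer members than folds
        test_split = test_split[test_split < len(cls_test_folds)]
        cls_test_folds[test_split] = fold_idx
        test_folds[target == cls] = cls_test_folds
# test_folds == [0, 0, 1, 1, 0, 1]: each fold holds samples from both classes.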
Exemple #45
0
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer))
            and (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer))
            and (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = ("sample_weight"
                             in getargspec(ensemble.base_estimator_.fit)[0])

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples, ))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Exemple #46
0
def _parallel_build_estimators(n_estimators, ensemble, all_X, all_y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""

    positives = np.where(all_y == 1)[0]
    unlabeled = np.where(all_y == 0)[0]
    
    X_positives = all_X[positives]
    X_unlabeled = all_X[unlabeled]
    y_positives = all_y[positives]
    y_unlabeled = all_y[unlabeled]

    # Retrieve settings
    n_samples, n_features = X_unlabeled.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    
    # Can't currently support sample weights
    if sample_weight is not None:
        raise ValueError("Can't currently support sample weight with PUBagging")

    support_sample_weight = False
    #support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
     #                                         "sample_weight")
    #if not support_sample_weight and sample_weight is not None:
     #   raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts

            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)

                curr_sample_weight[not_indices] = 0

            estimator.fit(all_X[:, features], all_y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            new_X = np.vstack((X_positives, X_unlabeled[indices]))
            new_y = np.concatenate((y_positives, y_unlabeled[indices]))

            estimator.fit(new_X[:, features], new_y)
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Exemple #47
0
def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 1000)

    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    from sklearn.utils.fixes import bincount
    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = tree.DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_equal(clf.tree_.threshold[internal],
                       clf2.tree_.threshold[internal])

    # Test negative weights
    X = iris.data
    y = iris.target

    sample_weight = -np.ones(X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(X.shape[0])
    sample_weight[0] = -1
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    # Check that predict_proba returns valid probabilities in the presence of
    # samples with negative weight
    X = iris.data
    y = iris.target

    sample_weight = rng.normal(.5, 1.0, X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)
    proba = clf.predict_proba(X)
    assert (proba >= 0).all() and (proba <= 1).all()
def label_ranking_loss(y_true, y_score, sample_weight=None):
    """Compute Ranking loss measure

    Compute the average number of label pairs that are incorrectly ordered
    given y_score weighted by the size of the label set and the number of
    labels not in the label set.

    This is similar to the error set size, but weighted by the number of
    relevant and irrelevant labels. The best performance is achieved with
    a ranking loss of zero.

    Read more in the :ref:`User Guide <label_ranking_loss>`.

    .. versionadded:: 0.17
       A function *label_ranking_loss*

    Parameters
    ----------
    y_true : array or sparse matrix, shape = [n_samples, n_labels]
        True binary labels in binary indicator format.

    y_score : array, shape = [n_samples, n_labels]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    loss : float

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.

    """
    y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    y_type = type_of_target(y_true)
    if y_type not in ("multilabel-indicator", ):
        raise ValueError("{0} format is not supported".format(y_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    n_samples, n_labels = y_true.shape

    y_true = csr_matrix(y_true)

    loss = np.zeros(n_samples)
    for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
        # Sort and bin the label scores
        unique_scores, unique_inverse = np.unique(y_score[i],
                                                  return_inverse=True)
        true_at_reversed_rank = bincount(
            unique_inverse[y_true.indices[start:stop]],
            minlength=len(unique_scores))
        all_at_reversed_rank = bincount(unique_inverse,
                                        minlength=len(unique_scores))
        false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

        # if the scores are ordered, it's possible to count the number of
        # incorrectly ordered pairs in linear time by cumulatively counting
        # how many false labels of a given score have a score higher than the
        # accumulated true labels with lower score.
        loss[i] = np.dot(true_at_reversed_rank.cumsum(),
                         false_at_reversed_rank)

    n_positives = count_nonzero(y_true, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        loss /= ((n_labels - n_positives) * n_positives)

    # When there are no positive or no negative labels, those values should
    # be considered correct, i.e. the ranking doesn't matter.
    loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.

    return np.average(loss, weights=sample_weight)
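The cumulative-bincount pair count inside the loop above can be checked by hand on a single hypothetical row with three labels, where the only relevant label receives the lowest score:

# Hedged hand check of the pair-counting step for one made-up row.
import numpy as np

y_true_row = np.array([1, 0, 0])          # one relevant label
y_score_row = np.array([0.2, 0.7, 0.7])   # the relevant label scores lowest

unique_scores, unique_inverse = np.unique(y_score_row, return_inverse=True)
true_at_reversed_rank = np.bincount(unique_inverse[y_true_row == 1],
                                    minlength=len(unique_scores))
all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores))
false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

pairs = np.dot(true_at_reversed_rank.cumsum(), false_at_reversed_rank)
# Both irrelevant labels outrank the relevant one, so pairs == 2 wrongly
# ordered pairs; the loop then normalizes by n_pos * (n_labels - n_pos).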