def feature_importance(rf, X, y, type='oob', normalized=False, balanced=False, demean=False, normal_fX=False):
    n_samples, n_features = X.shape
    if len(y.shape) != 2:
        raise ValueError('y must be 2d array (n_samples, 1) if numerical or (n_samples, n_categories).')
    out = np.zeros((n_features,))
    SE = np.zeros((n_features,))
    if demean:
        # demean y
        y = y - np.mean(y, axis=0)
        
    for tree in rf.estimators_:
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
        elif type == 'test':
            indices = np.arange(n_samples)
        elif type == 'classic':
            if rf.bootstrap:
                indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            else:
                indices = np.arange(n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        _, _, contributions = _predict_tree(tree, X[indices,:])
        if balanced and (type == 'oob' or type == 'test'):
            base_indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
            ids = tree.apply(X[indices, :])
            base_ids = tree.apply(X[base_indices, :])
            tmp1, tmp2 = np.unique(ids, return_counts = True)
            weight1 = {key: 1. / value for key, value in zip(tmp1, tmp2)}
            tmp1, tmp2 = np.unique(base_ids, return_counts = True)
            weight2 = {key: value for key, value in zip(tmp1, tmp2)}
            final_weights = np.array([[weight1[id] * weight2[id]] for id in ids])
            final_weights /= np.mean(final_weights)
        else:
            final_weights = 1
        if len(contributions.shape) == 2:
            contributions = contributions[:,:,np.newaxis]
        #print(contributions.shape, y[indices,:].shape)
        if normal_fX:
            for k in range(contributions.shape[-1]):
                contributions[:, :, k] = scale(contributions[:, :, k]) 
        tmp = np.tensordot(np.array(y[indices, :]) * final_weights, contributions,
                           axes=([0, 1], [0, 2]))
        if normalized:
            tmp = tmp / np.sum(tmp)
        else:
            tmp = tmp / len(indices)
        out += tmp
        SE += tmp ** 2
    out /= rf.n_estimators
    SE /= rf.n_estimators
    SE = ((SE - out ** 2) / rf.n_estimators) ** .5 
    return out, SE
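
A hypothetical usage sketch (hedged: _predict_tree, scale, and the scikit-learn private index helpers are assumed to come from the surrounding module; y must be passed as a 2-D column for a regression target):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=8, random_state=0)
rf = RandomForestRegressor(n_estimators=50, bootstrap=True, random_state=0).fit(X, y)

# out-of-bag, demeaned importances together with one standard error per feature
imp, se = feature_importance(rf, X, y.reshape(-1, 1), type='oob', demean=True)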
Example 2
def _parallel_build_trees(tree,
                          forest,
                          X,
                          y,
                          sample_weight,
                          tree_idx,
                          n_trees,
                          verbose=0,
                          class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        tree.used_indices = indices
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight is not None:
            raise RuntimeError(
                "not compatible with the hacked parallel_build_trees")

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
def rf_accuracy(rf, X, y, type='oob', metric='accuracy'):
    if metric == 'accuracy':
        score = accuracy_score
    elif metric == 'mse':
        score = neg_mse
    else:
        raise ValueError('metric type not understood')

    n_samples, n_features = X.shape
    tmp = 0
    count = 0
    if type == 'test':
        return score(y, rf.predict(X))
    elif type == 'train' and not rf.bootstrap:
        return score(y, rf.predict(X))

    for tree in rf.estimators_:
        if type == 'oob':
            if rf.bootstrap:
                indices = _generate_unsampled_indices(tree.random_state, n_samples, n_samples)
            else:
                raise ValueError('Without bootstrap, it is not possible to calculate oob.')
        elif type == 'train':
            indices = _generate_sample_indices(tree.random_state, n_samples, n_samples)
        else:
            raise ValueError('type is not recognized. (%s)'%(type))
        tmp +=  score(y[indices,:], tree.predict(X[indices, :])) * len(indices) 
        count += len(indices)
    return tmp / count
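
A short usage sketch (hedged: neg_mse and the private scikit-learn helpers are assumed to be defined in this module; the per-tree branches index y as 2-D, so labels are passed as a column vector):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

Xc, yc = make_classification(n_samples=300, n_features=10, random_state=0)
rfc = RandomForestClassifier(n_estimators=50, bootstrap=True, random_state=0).fit(Xc, yc)

# out-of-bag accuracy averaged over trees, weighted by the size of each tree's OOB set
oob_acc = rf_accuracy(rfc, Xc, yc.reshape(-1, 1), type='oob', metric='accuracy')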
def test_class_prob():
    """ testing class probabilities from Random forests """
    n_trees = 100
    num_classes = 20
    X, y = make_blobs(n_samples=1000,
                      centers=num_classes,
                      random_state=2,
                      cluster_std=2.0)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    forest = get_models('RandomForest', 'classify')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    y_pred_forest = forest.predict(X_test)
    prob_val_all = np.zeros(shape=(len(y_test), num_classes))
    n_samples = X_train.shape[0]
    print('Diff over all trees :')
    for t, estimator in enumerate(forest):
        sample_indices = _generate_sample_indices(estimator.random_state,
                                                  n_samples)
        y_tree_predict = estimator.predict(X_test)
        class_prob = get_class_prob(estimator)
        test_leaves_id = estimator.apply(X_test)
        y_tree_mine = class_prob[test_leaves_id, :]
        diff = np.linalg.norm(y_tree_predict - np.argmax(y_tree_mine, axis=1))
        print("%.2f" % round(diff, 2), end=', ')
        prob_val_all += y_tree_mine  #n_nodes, num_classes
    print('')
    prob_val_all = prob_val_all / n_trees
    y_pred_mine_rf = np.argmax(prob_val_all, axis=1)
    print('% Predictions diff = ')
    print(np.linalg.norm(y_pred_forest - y_pred_mine_rf))
    return
def calc_inbag(n_samples, forest):
    """
    Derive samples used to create trees in scikit-learn RandomForest objects.

    Recovers the samples in each tree from the random state of that tree using
    :func:`forest._generate_sample_indices`.

    Parameters
    ----------
    n_samples : int
        The number of samples used to fit the scikit-learn RandomForest object.

    forest : RandomForest
        Regressor or Classifier object that is already fit by scikit-learn.

    Returns
    -------
    Array of shape (n_samples, n_trees) that records how many times each sample
    was drawn for each tree. Rows correspond to samples and columns to individual
    trees; each entry is the number of times that sample was used to fit that tree.
    """
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    sample_idx = []
    for t_idx in range(n_trees):
        sample_idx.append(
          _generate_sample_indices(forest.estimators_[t_idx].random_state,
                                   n_samples))
        inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
    return inbag
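
A short usage sketch (assuming a forest fit with bootstrap=True, the default bootstrap size of n_samples, and the two-argument _generate_sample_indices of older scikit-learn releases, as used above):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
rf = RandomForestRegressor(n_estimators=10, bootstrap=True, random_state=0).fit(X, y)

inbag = calc_inbag(X.shape[0], rf)
print(inbag.shape)               # (100, 10): rows are samples, columns are trees
print(inbag.sum(axis=0))         # each column sums to the 100 bootstrap draws
print((inbag[:, 0] == 0).sum())  # samples never drawn by tree 0, i.e. its OOB set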
Example 6
def calculate_inbag(forest, n_samples):
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    for t_idx in range(n_trees):
        sample_idx = _generate_sample_indices(
            forest.estimators_[t_idx].random_state, n_samples)
        inbag[:, t_idx] = np.bincount(sample_idx, minlength=n_samples)
    return inbag
def test_regress_forest():
    """ testing Random forests regression predict function """
    n_trees = 4
    boston = load_boston()
    X = boston.data
    y = boston.target

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)

    #    X_train = np.array([range(1,4),range(4,7)])
    #    y_train = np.array([9,5])
    #    X_test = X_train
    #    y_test = y_train
    print('Single regression tree test : ')
    estimator = DecisionTreeRegressor()
    estimator.fit(X_train, y_train)
    y_pred_dt = estimator.predict(X_test)

    node_indicator = estimator.decision_path(X_train)
    mean_vals, _ = get_node_means(node_indicator, y_train)

    test_leaves_id = estimator.apply(X_test)
    y_pred_mine_dt = mean_vals[test_leaves_id]
    diff = np.linalg.norm(y_pred_dt - y_pred_mine_dt)
    print('Tree predictions diff :' + repr(diff))

    print('Regression Forest Test : ')
    forest = get_models('RandomForest', 'regress')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    y_pred_all = np.zeros(shape=(len(y_test)))
    n_samples = X_train.shape[0]
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    for t, estimator in enumerate(forest):
        t_idx = _generate_sample_indices(estimator.random_state, n_samples)
        y_tree_predict = estimator.predict(X_test)
        print('Num nodes = ' + repr(estimator.tree_.node_count))
        node_indicator = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
        #        node_indicator = estimator.decision_path(X_train)
        mean_vals, _ = get_node_means(node_indicator, y_train[t_idx])
        leaves_id = estimator.apply(X_test)
        y_tree_mine = mean_vals[leaves_id]
        diff = np.linalg.norm(y_tree_predict - y_tree_mine)
        #        print(y_tree_predict, y_tree_mine)
        print('Tree#' + repr(t) + ': Diff = ' + repr(diff))
        y_pred_all += y_tree_mine
    y_pred_rf = forest.predict(X_test)
    y_pred_mine_rf = y_pred_all / n_trees
    diff = np.linalg.norm(y_pred_rf - y_pred_mine_rf)
    print('Forest predictions difference :' + repr(diff))
    print("#BUG#--> Trees in the forest don't match my tree predictions")
    return
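
The #BUG printout above is consistent with an indexing mismatch: node_indicator comes from forest.decision_path(X_train), whose rows follow the original training order, while get_node_means is given y_train[t_idx], whose rows follow the bootstrap order, so paths and targets no longer line up. One possible fix, sketched under that assumption (not the author's code), is to build the indicator from the bootstrap sample itself:

# recompute the decision path on the bootstrap draws so that the rows of the
# indicator and the rows of the targets refer to the same observations
# (duplicates included, which is what the tree's stored leaf means are based on)
t_idx = _generate_sample_indices(estimator.random_state, n_samples)
node_indicator_boot = estimator.decision_path(X_train[t_idx, :])
mean_vals, _ = get_node_means(node_indicator_boot, y_train[t_idx])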
Example 8
def calc_inbag(n_samples, forest):
    """

    """
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    sample_idx = []
    for t_idx in range(n_trees):
        sample_idx.append(_generate_sample_indices(forest.estimators_[t_idx].random_state,
                                                   n_samples))
        inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
    return inbag
Example 9
    def _parallel_build_trees(self, tree, forest, X, y, sample_weight, tree_idx, n_trees,
                              verbose=0, class_weight=None):
        """
        Private function used to fit a single tree in parallel.

        Copied from sklearn.ensemble.forest and converted to a class function to perform undersampling prior to
        fitting the single tree

        :param tree: base_estimator {default=DecisionTreeClassifier()}
        :param forest: self {BalancedRandomForestClassifier object}
        :param X:{array-like, sparse matrix}, shape (n_samples, n_features)
               Matrix containing the training data.
        :param y: array-like, shape (n_samples,)
               Corresponding label for each sample in X.
        :param sample_weight: array-like of shape = [n_samples], optional
               Sample weights.
        :param tree_idx: index for specific tree
        :param n_trees: total number of trees
        :param verbose: int, optional (default=0)
               Controls the verbosity of the building process.
        :param class_weight: dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
               Weights associated with classes in the form ``{class_label: weight}``.
               If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts
               can be provided in the same order as the columns of y.
        :return: fitted tree
        """
        if verbose > 1:
            print("building tree %d of %d" % (tree_idx + 1, n_trees))

        X_res, y_res, indices = self.rus.fit_sample(X, y)
        if forest.bootstrap:
            n_samples = X_res.shape[0]
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
            else:
                curr_sample_weight = sample_weight[indices]

            indices = _generate_sample_indices(tree.random_state, n_samples)
            sample_counts = np.bincount(indices, minlength=n_samples)
            curr_sample_weight *= sample_counts

            if class_weight == 'subsample':
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', DeprecationWarning)
                    curr_sample_weight *= compute_sample_weight('auto', y, indices)
            elif class_weight == 'balanced_subsample':
                curr_sample_weight *= compute_sample_weight('balanced', y, indices)

            tree.fit(X_res, y_res, sample_weight=curr_sample_weight, check_input=False)
        else:
            tree.fit(X_res, y_res, sample_weight=sample_weight, check_input=False)

        return tree
Example 10
def calc_inbag(n_samples, forest):
    """

    """
    n_trees = forest.n_estimators
    inbag = np.zeros((n_samples, n_trees))
    sample_idx = []
    for t_idx in range(n_trees):
        sample_idx.append(
            _generate_sample_indices(forest.estimators_[t_idx].random_state,
                                     n_samples))
        inbag[:, t_idx] = np.bincount(sample_idx[-1], minlength=n_samples)
    return inbag
Example 11
def _parallel_build_trees(tree,
                          forest,
                          X,
                          y,
                          sample_weight,
                          tree_idx,
                          n_trees,
                          verbose=0,
                          class_weight=None,
                          n_samples_bootstrap=None):
    """Private function used to fit a single tree in parallel, adjusted for pipeline trees."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    # name of step of final estimator in pipeline
    estimator = tree.steps[-1][0]

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with catch_warnings():
                simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        fit_params = {
            f'{estimator}__sample_weight': curr_sample_weight,
            f'{estimator}__check_input': True
        }
        tree.fit(X, y, **fit_params)

    else:
        fit_params = {
            f'{estimator}__sample_weight': sample_weight,
            f'{estimator}__check_input': True
        }
        tree.fit(X, y, **fit_params)

    return tree
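
The f'{estimator}__sample_weight' keys rely on scikit-learn's Pipeline convention of routing fit parameters to a named step via <step_name>__<param>. A minimal sketch of that routing, with a hypothetical two-step pipeline whose final step is named 'clf':

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', DecisionTreeClassifier(random_state=0))])
X_demo = np.random.RandomState(0).rand(20, 3)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
w_demo = np.ones(len(y_demo))

step_name = pipe.steps[-1][0]  # 'clf', analogous to tree.steps[-1][0] above
pipe.fit(X_demo, y_demo, **{f'{step_name}__sample_weight': w_demo})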
def _parallel_build_trees_under(tree,
                                forest,
                                X,
                                y,
                                sample_weight,
                                tree_idx,
                                n_trees,
                                verbose=0,
                                class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    # Undersample X and y first
    if forest.undersample is not None:
        rus = RandomUnderSampler(
            ratio=lambda y: {0: int(Counter(y)[0] / forest.undersample),
                             1: Counter(y)[1]},
            return_indices=True)
        X, y, indices_under = rus.fit_sample(X, y)
        if sample_weight is not None:
            sample_weight = sample_weight[indices_under]

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
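
The ratio callable above keeps every class-1 sample and shrinks class 0 by the undersample factor. A small worked sketch of the target counts it produces (hypothetical counts; the old imblearn ratio-callable API is assumed):

from collections import Counter

undersample = 2
y_demo = [0] * 1000 + [1] * 100
ratio = lambda y: {0: int(Counter(y)[0] / undersample), 1: Counter(y)[1]}
print(ratio(y_demo))  # {0: 500, 1: 100}: class 0 halved, class 1 kept in full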
Example 13
    def check_oob(self, x, y):
        n_samples = y.shape[0]
        in_sample_tensor = numpy.zeros(shape=(
            len(self.dt_classifier.estimators_),
            x.shape[0],
        ))
        out_sample_tensor = numpy.zeros(shape=(
            len(self.dt_classifier.estimators_),
            x.shape[0],
        ))

        for i, estimator in enumerate(self.dt_classifier.estimators_):
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state, n_samples)
            sampled_indices = _generate_sample_indices(
                estimator.random_state, n_samples)

            assert len(set(unsampled_indices) & set(sampled_indices)) == 0

            unsampled_estimated = estimator.predict(x[unsampled_indices, :])
            unsampled_real = y[unsampled_indices]
            sample_estimated = estimator.predict(x[sampled_indices, :])
            sample_real = y[sampled_indices]

            out_sample_success = numpy.where(unsampled_estimated.astype(int) == unsampled_real)
            out_sample_fail = numpy.where(unsampled_estimated.astype(int) != unsampled_real)
            out_sample_success_indices = unsampled_indices[out_sample_success]
            out_sample_fail_indices = unsampled_indices[out_sample_fail]
            out_sample_tensor[i, out_sample_success_indices] = 1.0
            out_sample_tensor[i, out_sample_fail_indices] = -1.0

            in_sample_success = numpy.where(sample_estimated.astype(int) == sample_real)
            in_sample_fail = numpy.where(sample_estimated.astype(int) != sample_real)
            in_sample_success_indices = sampled_indices[in_sample_success]
            in_sample_fail_indices = sampled_indices[in_sample_fail]
            in_sample_tensor[i, in_sample_success_indices] = 1.0
            in_sample_tensor[i, in_sample_fail_indices] = -1.0

        return in_sample_tensor, out_sample_tensor, y
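
The tensors returned above encode +1 for a correct prediction, -1 for an incorrect one, and 0 where a tree did not score that sample. A minimal sketch, under that encoding, of turning the out-of-bag tensor into a per-sample OOB accuracy:

import numpy

def per_sample_oob_accuracy(out_sample_tensor):
    # +1 = correct OOB prediction, -1 = incorrect, 0 = sample was in-bag for that tree
    correct = (out_sample_tensor == 1.0).sum(axis=0)
    evaluated = (out_sample_tensor != 0.0).sum(axis=0)
    # NaN for samples that were never out-of-bag in any tree
    return numpy.divide(correct, evaluated,
                        out=numpy.full(correct.shape, numpy.nan),
                        where=evaluated > 0)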
Example 14
# generate a toy classification problem (4 features, to match clf.apply below)
X, y = make_classification(n_samples=1000,
                           n_features=4,
                           n_informative=2,
                           n_redundant=0,
                           random_state=0,
                           shuffle=False)

feature_names = ['x' + str(i) for i in range(X.shape[1])]
data = pd.DataFrame(data=X, columns=feature_names)
data['y'] = y

# fit a random forest
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=2,
                             random_state=0,
                             oob_score=True)

clf.fit(data[feature_names], data['y'])

# print some stuff about it
print(clf.feature_importances_)
print()
print(clf.apply([[0, 0, 0, 0]]))

# this can be used as an estimate of the propensity score. It is only produced
# if oob_score=True is passed to the constructor. An entry can be NaN if an
# observation was drawn into the bootstrap sample of every tree and so was
# never out-of-bag
print(clf.oob_decision_function_)

# We can find out which rows of X were/were not used to fit a tree from the
# random_state attribute of that tree, like so:
print(_generate_sample_indices(clf.estimators_[0].random_state, X.shape[0]))
print(_generate_unsampled_indices(clf.estimators_[0].random_state, X.shape[0]))
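
As a sanity check (a sketch, assuming the two-argument form of the private helpers used above, where "unsampled" means a row never drawn into a tree's bootstrap sample), the two index sets of a tree are disjoint and together cover every row of X:

rs = clf.estimators_[0].random_state
sampled = set(_generate_sample_indices(rs, X.shape[0]))
unsampled = set(_generate_unsampled_indices(rs, X.shape[0]))
assert sampled.isdisjoint(unsampled)
assert sampled | unsampled == set(range(X.shape[0]))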
        forest = RandomForestRegressor(n_estimators=n_trees, oob_score=True)

    oob_indices, oob_leaves_id, OOB_tree_indicator = {}, {}, {}
    #fit
    forest.fit(X_train, y_train)
    forest_oob_score = forest.oob_score_

    n_trees, train_size = forest.n_estimators, len(y_train)
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    node_indicator = {}
    sample_index = {}
    for t, estimator in enumerate(forest):
        oob_indices[t] = _generate_unsampled_indices(estimator.random_state,
                                                     X_train.shape[0])
        oob_leaves_id[t] = estimator.apply(X_train[oob_indices[t], :])
        sample_index[t] = _generate_sample_indices(estimator.random_state,
                                                   n_samples)
        node_indicator[t] = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
    mean_vals = {}
    for t in range(n_trees):
        mean_vals[t] = np.zeros(node_indicator[t].shape[1])
        for node in range(node_indicator[t].shape[1]):
            r, c = node_indicator[t][:, node].nonzero()
            mean_vals[t][node] = np.mean(y_train[sample_index[t]][r])

    alpha_list, _, node_score = get_alpha(forest, X_train, y_train,
                                          predicttype)

    y_pred_oob = np.zeros(len(y_train))
    print('Forest size, trees : ' + repr(get_forest_size(forest)) + ',' +
          repr(n_trees))
    if (predicttype == 'classify'):