Example #1
def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
    """Private function used to fit the stacking model."""
    self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                     max_iter=max_iter)
    X_stacking = _create_stacking_set(self.estimators_, self.estimators_features_,
                                      self.estimators_weight_, X, self.combination)
    self.f_staking.fit(X_stacking, y, cost_mat)
    return self
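costcla's _create_stacking_set helper is not shown in this example. A minimal sketch of what such a builder plausibly does, assuming each estimator with a non-zero out-of-bag weight contributes one column of predictions (or positive-class probabilities, for the _proba combinations) computed on its own feature subset:

import numpy as np

def create_stacking_set(estimators, estimators_features, estimators_weight,
                        X, combination):
    # One column per estimator with a non-zero OOB weight; zero-weight
    # members contribute nothing useful to the second-level model.
    valid = np.nonzero(estimators_weight)[0]
    X_stacking = np.zeros((X.shape[0], valid.shape[0]))
    for col, e in enumerate(valid):
        X_sub = X[:, estimators_features[e]]  # this estimator's feature subset
        if combination in ('stacking', 'stacking_bmr'):
            X_stacking[:, col] = estimators[e].predict(X_sub)
        else:  # 'stacking_proba', 'stacking_proba_bmr'
            X_stacking[:, col] = estimators[e].predict_proba(X_sub)[:, 1]
    return X_stacking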
Example #2
def train(class_index):
    docs_bin = read_train("../Data/train-data.dat")
    X_train = tfIdf(docs_bin)

    y_train = load_labels("../Data/train-label.dat", class_index)

    cost_mat_train = calculate_cost_matrix(y_train)

    f = CostSensitiveLogisticRegression()
    f.fit(X_train, y_train, cost_mat_train)
    return f
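read_train, tfIdf, load_labels and calculate_cost_matrix are the example author's own helpers. As a hedged sketch, calculate_cost_matrix presumably builds the per-example [C_FP, C_FN, C_TP, C_TN] matrix that costcla expects; one plausible version scales the false-negative cost by class imbalance (the real helper may differ):

import numpy as np

def calculate_cost_matrix(y):
    # Hypothetical sketch: rows are examples, columns are
    # [false positive, false negative, true positive, true negative] costs.
    y = np.asarray(y)
    n = y.shape[0]
    fn_cost = n / max(y.sum(), 1)  # rarer positives -> costlier misses
    cost_mat = np.zeros((n, 4))
    cost_mat[:, 0] = 1.0           # false positives
    cost_mat[:, 1] = fn_cost       # false negatives
    # correct predictions (columns 2 and 3) are left at zero cost
    return cost_mat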
class BaseBagging(with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    @abstractmethod
    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 max_samples=1.0,
                 max_features=1.0,
                 bootstrap=True,
                 bootstrap_features=False,
                 combination='majority_voting',
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(BaseBagging, self).__init__(base_estimator=base_estimator,
                                          n_estimators=n_estimators)

        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.combination = combination
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y, cost_mat, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like, shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).

        cost_mat : array-like of shape = [n_samples, 4]
            Cost matrix of the classification problem, where the columns
            represent the costs of false positives, false negatives,
            true positives and true negatives, for each example.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Convert data
        # X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])  # Not in sklearn version 0.15

        # Remap output
        n_samples, self.n_features_ = X.shape
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if isinstance(self.max_samples, (numbers.Integral, np.integer)):
            max_samples = self.max_samples
        else:  # float
            max_samples = int(self.max_samples * X.shape[0])

        if not (0 < max_samples <= X.shape[0]):
            raise ValueError("max_samples must be in (0, n_samples]")

        if isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        # Free allocated memory, if any
        self.estimators_ = None

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            self.n_estimators, self.n_jobs)
        seeds = random_state.randint(MAX_INT, size=self.n_estimators)

        all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_build_estimators)(n_estimators[i],
                                                self,
                                                X,
                                                y,
                                                cost_mat,
                                                seeds[starts[i]:starts[i + 1]],
                                                verbose=self.verbose)
            for i in range(n_jobs))

        # Reduce
        self.estimators_ = list(
            itertools.chain.from_iterable(t[0] for t in all_results))
        self.estimators_samples_ = list(
            itertools.chain.from_iterable(t[1] for t in all_results))
        self.estimators_features_ = list(
            itertools.chain.from_iterable(t[2] for t in all_results))

        self._evaluate_oob_savings(X, y, cost_mat)

        if self.combination in [
                'stacking', 'stacking_proba', 'stacking_bmr',
                'stacking_proba_bmr'
        ]:
            self._fit_stacking_model(X, y, cost_mat)

        if self.combination in [
                'majority_bmr', 'weighted_bmr', 'stacking_bmr',
                'stacking_proba_bmr'
        ]:
            self._fit_bmr_model(X, y)

        return self

    def _fit_bmr_model(self, X, y):
        """Private function used to fit the BayesMinimumRisk model."""
        self.f_bmr = BayesMinimumRiskClassifier()
        X_bmr = self.predict_proba(X)
        self.f_bmr.fit(y, X_bmr)
        return self

    def _fit_stacking_model(self, X, y, cost_mat, max_iter=100):
        """Private function used to fit the stacking model."""
        self.f_staking = CostSensitiveLogisticRegression(verbose=self.verbose,
                                                         max_iter=max_iter)
        X_stacking = _create_stacking_set(self.estimators_,
                                          self.estimators_features_,
                                          self.estimators_weight_, X,
                                          self.combination)
        self.f_staking.fit(X_stacking, y, cost_mat)
        return self

    #TODO: _evaluate_oob_savings in parallel
    def _evaluate_oob_savings(self, X, y, cost_mat):
        """Private function used to calculate the OOB Savings of each estimator."""
        estimators_weight = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):
            # Test if all examples were used for training
            if not np.any(~samples):
                # Then use training
                oob_pred = estimator.predict(X[:, features])
                oob_savings = max(0, savings_score(y, oob_pred, cost_mat))
            else:
                # Then use OOB
                oob_pred = estimator.predict((X[~samples])[:, features])
                oob_savings = max(
                    0, savings_score(y[~samples], oob_pred,
                                     cost_mat[~samples]))

            estimators_weight.append(oob_savings)

        # Guard against the case where all weights are 0
        if sum(estimators_weight) == 0:
            self.estimators_weight_ = np.ones(
                len(estimators_weight)) / len(estimators_weight)
        else:
            self.estimators_weight_ = (np.array(estimators_weight) /
                                       sum(estimators_weight)).tolist()

        return self

    def _validate_y(self, y):
        # Default implementation
        return column_or_1d(y, warn=True)
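_evaluate_oob_savings weights every estimator by its out-of-bag savings, so it helps to know what savings_score measures. Assuming costcla's usual definition, savings compare the cost of the predictions against the cheapest trivial policy (predict all negatives or all positives); a minimal sketch:

import numpy as np

def savings_score_sketch(y_true, y_pred, cost_mat):
    fp, fn, tp, tn = cost_mat[:, 0], cost_mat[:, 1], cost_mat[:, 2], cost_mat[:, 3]

    def total_cost(pred):
        # Per-example cost picked from the [C_FP, C_FN, C_TP, C_TN] columns.
        return np.sum(y_true * (pred * tp + (1 - pred) * fn) +
                      (1 - y_true) * (pred * fp + (1 - pred) * tn))

    # Baseline: the cheaper of predicting everything 0 or everything 1.
    cost_base = min(total_cost(np.zeros_like(y_true)),
                    total_cost(np.ones_like(y_true)))
    return 1.0 - total_cost(y_pred) / cost_base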
Example #4
cost_mat_train, cost_mat_test = cost_mat[:ratio], cost_mat[ratio:]

y_train, y_test = np.argmax(y_train, axis=1), np.argmax(y_test, axis=1)

print(y_train.shape, y_test.shape)

#random forest
rfc = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred_test_rf = rfc.predict(x_test)

print(evaluate(y_pred_test_rf, y_test, cost_mat_test))

#logistic regression
lr = LogisticRegression(random_state=0).fit(x_train, y_train)
y_pred_test_lr = lr.predict(x_test)

print(evaluate(y_pred_test_lr, y_test, cost_mat_test))

#cost-sensitive decision trees
CSDT = CostSensitiveDecisionTreeClassifier().fit(x_train, y_train,
                                                 cost_mat_train)
y_pred_test_csdt = CSDT.predict(x_test)

print(evaluate(y_pred_test_csdt, y_test, cost_mat_test))

#cost-sensitive lr
CSLR = CostSensitiveLogisticRegression()
CSLR.fit(x_train, y_train, cost_mat_train)
y_pred_test_cslr = CSLR.predict(x_test)

print(evaluate(y_pred_test_cslr, y_test, cost_mat_test))
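evaluate is a user-defined helper not shown in this example. A hypothetical stand-in that reports plain accuracy next to the cost savings, which is the quantity this comparison actually cares about:

from sklearn.metrics import accuracy_score
from costcla.metrics import savings_score

def evaluate(y_pred, y_true, cost_mat):
    # Accuracy ignores the costs; savings is the cost-aware metric.
    return {'accuracy': accuracy_score(y_true, y_pred),
            'savings': savings_score(y_true, y_pred, cost_mat)}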
print('The auc_score of RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
print('*' * 90)

y_prob_test = RandomForestClassifier(random_state=0).fit(
    X_train, y_train).predict_proba(X_test)

f_bmr = BayesMinimumRiskClassifier(calibration=True)
f_bmr.fit(y_test, y_prob_test)
y_pred_test_bmr = f_bmr.predict(y_prob_test, cost_mat_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_bmr)
print('The auc_score of using RandomForest and BayesMinimumRiskClassifier is {:.2f}'
      .format(metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of CostSensitiveLogisticRegression is {:.2f}'.format(
    metrics.auc(fpr, tpr)))
print('*' * 90)

f = CostSensitiveDecisionTreeClassifier()
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_csdt = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_csdt)
print('The auc_score of using CostSensitiveDecisionTreeClassifier is {:.2f}'.
      format(metrics.auc(fpr, tpr)))
print('*' * 90)
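The BayesMinimumRiskClassifier step above turns (calibrated) probabilities into decisions by picking the class with the lower expected cost. A sketch of that decision rule for the [C_FP, C_FN, C_TP, C_TN] cost-matrix layout used throughout:

import numpy as np

def bmr_decision(p1, cost_mat):
    # p1 is P(y=1|x) for each example.
    fp, fn, tp, tn = cost_mat[:, 0], cost_mat[:, 1], cost_mat[:, 2], cost_mat[:, 3]
    risk_pos = p1 * tp + (1 - p1) * fp   # expected cost of predicting 1
    risk_neg = p1 * fn + (1 - p1) * tn   # expected cost of predicting 0
    return (risk_pos <= risk_neg).astype(int)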
print('The auc_score of using only RandomForest is {:.2f}'.format(metrics.auc(fpr, tpr)))
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_rf_t)
print('The auc_score of using RandomForest and ThresholdingOptimization is {:.2f}'.format(metrics.auc(fpr, tpr)))
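y_pred_test_rf_t comes from a thresholding step whose code is not shown here. The idea can be re-implemented directly (this is a plain sketch, not costcla's ThresholdingOptimization class): scan probability cutoffs and keep the one with the lowest total example-dependent cost.

import numpy as np

def best_cost_threshold(y_prob, y_true, cost_mat,
                        grid=np.linspace(0.01, 0.99, 99)):
    fp, fn, tp, tn = cost_mat[:, 0], cost_mat[:, 1], cost_mat[:, 2], cost_mat[:, 3]

    def total_cost(t):
        pred = (y_prob >= t).astype(int)
        # Same per-example cost bookkeeping as the savings metric.
        return np.sum(y_true * (pred * tp + (1 - pred) * fn) +
                      (1 - y_true) * (pred * fp + (1 - pred) * tn))

    return min(grid, key=total_cost)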


#CostSensitiveLogisticRegression
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from costcla.datasets import load_creditscoring2
from costcla.models import CostSensitiveLogisticRegression
from costcla.metrics import savings_score
data = load_creditscoring2()
sets = train_test_split(data.data, data.target, data.cost_mat, test_size=0.33, random_state=44)
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = sets
y_pred_test_lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train).predict(X_test)
f = CostSensitiveLogisticRegression(solver='ga')
f.fit(X_train, y_train, cost_mat_train)
y_pred_test_cslr = f.predict(X_test)
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_lr)
print('The auc_score of using only LogisticRegression is {:.2f}'.format(metrics.auc(fpr, tpr)))
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_test_cslr)
print('The auc_score of using CostSensitiveLogisticRegression is {:.2f}'.format(metrics.auc(fpr, tpr)))
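Note that savings_score is imported above but never called, and AUC ignores the cost matrix entirely; the savings comparison is the more telling check here:

print('Savings of LogisticRegression: {:.2f}'.format(
    savings_score(y_test, y_pred_test_lr, cost_mat_test)))
print('Savings of CostSensitiveLogisticRegression: {:.2f}'.format(
    savings_score(y_test, y_pred_test_cslr, cost_mat_test)))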


#CostSensitiveDecisionTreeClassifier
#example-dependent
'''
How the example-dependent cost-sensitive decision tree works:
1. While growing the tree, cost sensitivity acts mainly on the impurity,
   Ic(S) = min(Cost(f0(S)), Cost(f1(S))), which is substituted into the
   information gain Gain(xj, lj) so that the best split feature is chosen
   in a cost-sensitive way.
2. During pruning, the cost after deleting a node is computed and compared.
3. Prediction is cost-sensitive as well: for each leaf, after training, the
   cost of predicting every example in it as 0 is compared with the cost of
   predicting them all as 1, and the cheaper label is used.
'''
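The cost-based impurity in point 1 is easy to make concrete. A sketch computing Ic(S) for one node under the usual [C_FP, C_FN, C_TP, C_TN] layout:

import numpy as np

def cost_impurity(y, cost_mat):
    # Ic(S) = min(Cost(f0(S)), Cost(f1(S))): the cheaper of labelling the
    # whole node 0 (pay C_FN on positives, C_TN on negatives) or 1
    # (pay C_TP on positives, C_FP on negatives).
    fp, fn, tp, tn = cost_mat[:, 0], cost_mat[:, 1], cost_mat[:, 2], cost_mat[:, 3]
    cost_all_0 = np.sum(y * fn + (1 - y) * tn)
    cost_all_1 = np.sum(y * tp + (1 - y) * fp)
    return min(cost_all_0, cost_all_1)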
Example #8
# Count the positives in each label column (train, col, X, nrow_train and
# preds come from the surrounding script and are not shown here).
num = {}

loss = []

for i, j in enumerate(col):
    print('=== Fit ' + j)
    num[j] = train[j].sum()
    print('The # of ' + j + ' is ' + str(num[j]))
    model = CostSensitiveLogisticRegression(C=3)
    fn = nrow_train / num[j]  # candidate false-negative cost for this label
    cost_mat = np.ones((nrow_train, 4))
    #cost_mat[:, 1] = fn
    model.fit(X[:nrow_train], train[j], cost_mat)
    preds[:, i] = model.predict_proba(X[nrow_train:])[:, 1]
    pred_train = model.predict_proba(X[:nrow_train])[:, 1]
    logloss = log_loss(train[j], pred_train)
    print('log loss:', logloss)
    print('Avg_loss:', logloss / num[j])
    loss.append(logloss)

print('mean column-wise log loss:', np.mean(loss))
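As written, the cost matrix inside the loop is all ones, so the model above is effectively cost-neutral; the commented-out line is presumably where the cost sensitivity was meant to come in. A sketch of what enabling it does, reusing the loop's names:

# Hypothetical: enable the commented-out line so that missing a positive of a
# rare label costs nrow_train / num[j] while a false alarm still costs 1.
cost_mat = np.ones((nrow_train, 4))
cost_mat[:, 1] = nrow_train / num[j]   # column 1 is the false-negative cost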