def dt_log(m, features, label):
    trainDT, testDT = train_test_split(data_load, test_size=0.2, random_state=1)
#     trainDT, cvDT = train_test_split(trainDT, test_size=0.2, random_state=1)

    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(trainDT[features[:m]], trainDT[label])

    leaf = dt.apply(trainDT[features[:m]])
    leafNode = leaf.reshape(-1, 1)
    
    coder = OneHotEncoder()
    coder.fit(leafNode)

    newFeature = np.c_[
        coder.transform(dt.apply(trainDT[features[:m]]).reshape(-1, 1)).toarray(),
        trainDT[features[m:]]]
    logit = LogisticRegression()
    logit.fit(newFeature[:, 1:], trainDT[label].values.ravel())
    
    testFeature = np.c_[
        coder.transform(dt.apply(testDT[features[:m]]).reshape(-1, 1)).toarray(),
        testDT[features[m:]]]
    y_predprob = logit.predict_proba(testFeature[:, 1:])
    y_pred = np.argmax(y_predprob, axis=1)

    print(confusion_matrix(testDT[label]['retention_status'].values, y_pred))
    print("Accuracy : %.4g" % accuracy_score(testDT[label]['retention_status'].values, y_pred))
    print("AUC Score (Test): %f" % roc_auc_score(testDT[label]['retention_status'].values, y_predprob[:, 1]))    
    
    res = roc_curve(testDT[label], y_predprob[:, 1])
    plot_roc(res)
class TreeDiscretizer:
    """A DecisionTreeClassifier of which the leaf indices are used as cluster indices.
    
    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features), floats
        The feature matrix of the training set.
    
    y_train : array-like, shape (n_samples), nonnegative ints
        The labels of X_train.
    
    J : int
        The maximum number of desired clusters.
    
    criterion : string, optional
        The split criterion of the underlying DecisionTreeClassifier, 'gini' by default.
    
    seed : nonnegative int, optional
        This seed for the random number generator makes random splits reproducible.
    """
    def __init__(self, X_train, y_train, J, criterion='gini', seed=None):
        self.classifier = DecisionTreeClassifier(max_leaf_nodes=J,
                                                 criterion=criterion,
                                                 random_state=seed)
        self.classifier.fit(X_train, y_train)
        x_train = self.classifier.apply(X_train)
        self.indexmap = dict(
            zip(np.unique(x_train), range(len(np.unique(x_train)))))

    def discretize(self, X):
        x_raw = self.classifier.apply(X)  # return the raw leaf indices of X
        return np.array([self.indexmap[x] for x in x_raw])
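
# Usage sketch for TreeDiscretizer (illustration only, not part of the original
# snippet): the tiny matrix below is made up; numpy and DecisionTreeClassifier
# are assumed to be imported as in the class above.
import numpy as np

X_demo = np.array([[0.1], [0.2], [0.9], [1.1], [2.0], [2.1]])
y_demo = np.array([0, 0, 1, 1, 0, 0])

disc = TreeDiscretizer(X_demo, y_demo, J=3, seed=0)
print(disc.discretize(X_demo))  # contiguous cluster indices, e.g. values in {0, 1, 2}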
Example #3
def compute_new_features(X_train,y_train,X_test,y_test):
    classifier = DecisionTreeClassifier(max_leaf_nodes=50)
    classifier.fit(X_train,y_train)
    idx_train = classifier.apply(X_train)
    idx_train = idx_train.reshape([-1,1])
    enc = OneHotEncoder()
    enc.fit(idx_train)
    new_features_train = enc.transform(idx_train).toarray()
    
    idx_test = classifier.apply(X_test)
    idx_test = idx_test.reshape([-1,1])
    new_features_test = enc.transform(idx_test).toarray()

    return ([np.hstack([X_train,new_features_train]), np.hstack([X_test,new_features_test])])
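
# Usage sketch for compute_new_features (illustration only, not part of the
# original snippet): the synthetic data is made up; numpy, OneHotEncoder and
# DecisionTreeClassifier are assumed to be imported as in the function above.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=200, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25, random_state=0)

X_tr_new, X_te_new = compute_new_features(X_tr, y_tr, X_te, y_te)
print(X_tr_new.shape, X_te_new.shape)  # original 5 columns plus one indicator column per leaf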
Example #4
class DecisionTreeCalibration(BaseEstimator, TransformerMixin):
    """
    Выполнение калибровки решеающим деревом и линейными моделями в листах
    """
    def __init__(self, model, tree_max_depth=3, rs=17):
        self.model = model
        self.rs = rs
        self.dt_calib = DecisionTreeClassifier(max_depth=tree_max_depth,
                                               random_state=rs)
        self.logits = {}

    def fit(self, X: pd.DataFrame, y: pd.Series):

        # Fit the decision tree
        self.dt_calib.fit(X[self.model.used_features], y)
        leafs = self.dt_calib.apply(X[self.model.used_features])

        # Fit a logistic regression for each leaf
        for leaf in np.unique(leafs):
            lr = LogisticRegression(random_state=self.rs)

            X_sub = X[leafs == leaf]
            y_pred_sub = self.model.transform(X_sub)
            y_sub = y[leafs == leaf]

            lr.fit(y_pred_sub.reshape(-1, 1), y_sub)
            self.logits[leaf] = lr

    def transform(self, X: pd.DataFrame):

        pred_df = pd.DataFrame(
            {
                "y_pred": self.model.transform(X),
                "leaf": self.dt_calib.apply(X[self.model.used_features])
            },
            index=X.index)

        calib_parts = []

        # apply each leaf's own logistic regression
        for lf in np.unique(pred_df.leaf):
            idx_sub = pred_df[pred_df.leaf == lf].index
            y_pred_sub = np.array(pred_df[pred_df.leaf == lf].y_pred).reshape(
                -1, 1)

            y_calib_sub = pd.Series(
                self.logits[lf].predict_proba(y_pred_sub)[:, 1], index=idx_sub)

            calib_parts.append(y_calib_sub)
        return pd.concat(calib_parts)
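
# Usage sketch for DecisionTreeCalibration (illustration only, not part of the
# original snippet). The wrapped model is assumed to expose `used_features`
# (a list of column names) and `transform(X)` returning uncalibrated scores;
# DummyScorer below is a hypothetical stand-in for such a model.
import numpy as np
import pandas as pd

class DummyScorer:
    used_features = ["f0", "f1"]

    def transform(self, X):
        # made-up raw score: simply the sum of the two features
        return X["f0"].values + X["f1"].values

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(200, 2), columns=["f0", "f1"])
y_demo = pd.Series(rng.binomial(1, 0.3 + 0.4 * (X_demo["f0"] > 0.5)))

calib = DecisionTreeCalibration(DummyScorer(), tree_max_depth=1)
calib.fit(X_demo, y_demo)
print(calib.transform(X_demo).head())  # calibrated probabilities, one per row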
Example #5
class TreeClassificationTransformer(BaseTransformer):
    """
    A class used to transform data from a category to a specialized representation.

    Parameters
    ----------
    kwargs : dict, default={}
        A dictionary to contain parameters of the tree.

    Attributes
    ----------
    transformer : sklearn.tree.DecisionTreeClassifier
        an internal sklearn DecisionTreeClassifier
    """

    def __init__(self, kwargs={}):
        self.kwargs = kwargs

    def fit(self, X, y):
        """
        Fits the transformer to data X with labels y.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        y : ndarray
            Output (i.e. response data matrix).

        Returns
        -------
        self : TreeClassificationTransformer
            The object itself.
        """
        X, y = check_X_y(X, y)
        self.transformer_ = DecisionTreeClassifier(**self.kwargs).fit(X, y)
        return self

    def transform(self, X):
        """
        Performs inference using the transformer.

        Parameters
        ----------
        X : ndarray
            Input data matrix.

        Returns
        -------
        X_transformed : ndarray
            The transformed input.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        check_is_fitted(self)
        X = check_array(X)
        return self.transformer_.apply(X)
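
# Usage sketch for TreeClassificationTransformer (illustration only, not part of
# the original snippet): the "transformed" representation is simply the leaf id
# the fitted tree assigns to each sample.
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
transformer = TreeClassificationTransformer(kwargs={"max_depth": 2}).fit(X_iris, y_iris)
print(transformer.transform(X_iris)[:10])  # one leaf id per sample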
Example #6
class DecisionTreeModel:
    # initialize a DecisionTreeModel object with a "model" attribute containing
    # an actual DecisionTreeClassifier object from the sklearn module
    def __init__(self,*args,**kwargs):
        self.model = DecisionTreeClassifier(*args, **kwargs)

    def get_model(self):
        return self.model

    def apply(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.apply(X,check_input)

    def cost_complexity_pruning_path(self,X,y,sample_weight=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        return self.model.cost_complexity_pruning_path(X,y,sample_weight)        
    def decision_path(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.decision_path(X,check_input)
    
    def fit(self,X,y,sample_weight=None,check_input=True,X_idx_sorted=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        self.model.fit(X,y,sample_weight,check_input,X_idx_sorted)
        return self

    def predict(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict(X,check_input)

    def predict_log_proba(self,X):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict_log_proba(X)

    def predict_proba(self,X,check_input=True):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        return self.model.predict_proba(X,check_input)

    def score(self,X,y,sample_weight=None):
        if (isinstance(X,TabularData)):
            X=DataConversion.extract_X(X)
        if (isinstance(y,TabularData)):
            y=DataConversion.extract_y(y)
        return self.model.score(X,y,sample_weight)

    def __getattribute__(self,item):
        try:
            return super().__getattribute__(item)
        except AttributeError:
            pass
        return getattr(self.model,item)
    def findLeafDepthWithoutPrunning(self, df, classes):

        values = np.empty([0, 0])

        estimator = DecisionTreeClassifier()
        estimator.fit(df, classes)

        n_nodes = estimator.tree_.node_count
        # print(n_nodes)
        children_left = estimator.tree_.children_left
        children_right = estimator.tree_.children_right

        node_depth = np.zeros(shape=n_nodes, dtype=np.int64)

        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            node_depth[node_id] = parent_depth + 1

            # If we have a test node
            if children_left[node_id] != children_right[node_id]:
                stack.append((children_left[node_id], parent_depth + 1))
                stack.append((children_right[node_id], parent_depth + 1))

        leafsIndexes = estimator.apply(df, check_input=True)
        for index in leafsIndexes:
            values = np.append(values, np.full((1, 1), node_depth[index]))

        return values
Example #8
    def fit_function(self, comb, model):
        """ 
        Fits a single logistic regression or decision tree model

        comb: feature combination to fit model over
        model: fit logistic regression or a decision tree
        """
        X_val = self.X_val.iloc[:, list(comb)]
        if np.shape(X_val)[0] == 1:
            X_val = X_val.values.reshape(-1, 1)

        # fit decision tree or logistic regression or knn
        if model == 'dt':
            dt = DecisionTreeClassifier(max_depth=len(comb))
            dt.fit(X_val, self.Y_val)

            # calculate val_acc
            Y_pred = dt.apply(self.X_val.iloc[:, list(comb)])
            from sklearn.metrics import accuracy_score
            #  print("va - ", comb, "\t", accuracy_score(self.Y_val, Y_pred))

            return dt

        elif model == 'lr':
            lr = LogisticRegression(multi_class='auto', n_jobs=self.n_jobs)
            lr.fit(X_val, self.Y_val)
            return lr

        elif model == 'nn':
            nn = KNeighborsClassifier(algorithm='kd_tree', n_jobs=self.n_jobs)
            nn.fit(X_val, self.Y_val)
            return nn
Example #9
def get_decision_paths(model: tree.DecisionTreeClassifier, data, selection):
    selected_rows = data.loc[selection.astype(bool), :]

    d_path = model.decision_path(selected_rows)
    paths = set()

    leaf_id = model.apply(selected_rows)
    feature = model.tree_.feature
    threshold = model.tree_.threshold

    for sample_id in range(len(selected_rows.index)):
        node_idx = d_path.indices[d_path.indptr[sample_id]:d_path.
                                  indptr[sample_id + 1]]

        rules = []

        for node_id in node_idx:
            if leaf_id[sample_id] == node_id:
                continue

            sign = None
            if selected_rows.iloc[sample_id,
                                  feature[node_id]] <= threshold[node_id]:
                sign = " <= "
            else:
                sign = " > "

            rule = (data.columns[feature[node_id]] + sign +
                    str(round(threshold[node_id], 2)))
            rules.append(rule)
        paths.add(tuple(rules))

    paths = [[rule for rule in path] for path in paths]
    return paths
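
# Usage sketch for get_decision_paths (illustration only, not part of the
# original snippet): fit a small tree on a made-up frame and print the split
# rules along the paths taken by two selected rows.
import numpy as np
import pandas as pd
from sklearn import tree

df_demo = pd.DataFrame({"a": [0.1, 0.4, 0.6, 0.9], "b": [1.0, 0.2, 0.8, 0.3]})
labels = np.array([0, 0, 1, 1])
clf_demo = tree.DecisionTreeClassifier(max_depth=2).fit(df_demo, labels)

mask = np.array([1, 0, 0, 1])  # select the first and last rows
for path in get_decision_paths(clf_demo, df_demo, mask):
    print(path)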
Example #10
def get_leaf(train_x, train_y, val_x):
    from sklearn.tree import DecisionTreeClassifier
    train_x, train_y, val_x = map(np.array, [train_x, train_y, val_x])
    train_x = train_x.reshape(-1, 1)
    train_y = train_y.reshape(-1, 1)
    val_x = val_x.reshape(-1, 1)
    m = DecisionTreeClassifier(min_samples_leaf=0.001, max_leaf_nodes=25)
    m.fit(train_x, train_y)
    return m.apply(val_x)
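
# Usage sketch for get_leaf (illustration only, not part of the original
# snippet): the leaf ids returned for val_x act as bin labels learned from
# the relationship between train_x and train_y.
import numpy as np

rng = np.random.RandomState(0)
x_tr = rng.rand(500)
y_tr = (x_tr + rng.normal(scale=0.2, size=500) > 0.5).astype(int)
x_val = rng.rand(100)

print(np.unique(get_leaf(x_tr, y_tr, x_val)))  # distinct leaf ids used as bins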
Example #11
    def fit_preproc(self, x_train, y_train):
        tree_list = []
        ohe_list = []
        var_names = []

        n_predictors = (self.n_predictors
                        if self.n_predictors is not None else x_train.shape[1])
        self.n_predictors = n_predictors  # number of trees

        if isinstance(x_train, pd.DataFrame):
            self.initial_var_names = x_train.columns

        for i in range(n_predictors):
            for j in range(i + 1, n_predictors):
                # Input data with 2 variables (column)
                # print(i,j)
                x_input = x_train.iloc[:, [i, j]]
                tree_t = DecisionTreeClassifier(max_depth=2,
                                                max_leaf_nodes=3).fit(
                                                    x_input, y_train)
                # Output of tree data
                x_processed = tree_t.apply(x_input)
                # onehot encoder transform
                ohe = OneHotEncoder().fit(x_processed.reshape((-1, 1)))
                # Setting var names : for understanding

                c_names = [str(i), str(j)]
                categ = ohe.categories_[0]
                feature = tree_t.tree_.feature
                feature_used = feature[feature >= 0]
                if len(feature_used) > 1:
                    one = feature_used[0]
                    two = feature_used[1]
                    var = [
                        c_names[one],
                        '(' + c_names[one] + ',' + c_names[two] + ')'
                    ]  # First name is the variable used at the root; the second is the pair of variables used
                elif len(feature_used) == 1:
                    # print(i,j)
                    var = [c_names[feature_used[0]]]
                else:  # variables are useless (both)
                    var = ['ij']
                # Record trees and encoder
                tree_list.append(tree_t)
                ohe_list.append(ohe)
                var_names.append(var)

        self.tree_list = tree_list
        self.ohe_list = ohe_list
        self.var_names = var_names

        return self
Example #12
def binning_opt(X, y, depth):

    var_name = X.name
    df = X.to_frame()
    target_df = y.to_frame()
    missing_df = df[df[var_name].isnull()]

    df = df.merge(missing_df,
                  left_index=True,
                  right_index=True,
                  how='outer',
                  indicator=True)
    df = df[df['_merge'] == 'left_only']
    df.drop([var_name + '_y', '_merge'], axis=1, inplace=True)
    df.rename(columns={var_name + '_x': var_name}, inplace=True)

    target_df = target_df.merge(missing_df,
                                left_index=True,
                                right_index=True,
                                how='outer',
                                indicator=True)
    target_df = target_df[target_df['_merge'] == 'left_only']
    target_df.drop([var_name, '_merge'], axis=1, inplace=True)
    dt = DecisionTreeClassifier(max_features=1,
                                max_depth=depth,
                                min_samples_leaf=0.1)
    dt.fit(df, target_df)
    df['nodo'] = dt.apply(df)
    df['nodo'] = df['nodo'].astype(str)
    bins = df.groupby('nodo').agg(['min', 'max'])[var_name]
    bins.sort_values('min', inplace=True)
    bins['min2'] = bins['max'].shift(1)
    bins['min'] = np.where(bins['min2'].isnull(), bins['min'], bins['min2'])
    bins.reset_index(inplace=True)
    bins['id'] = (bins.index +
                  1).map(lambda x: '0' + str(x) if x < 10 else str(x))
    bins['C_' + var_name] = bins['id'] + '. (' + bins['min'].astype(
        str) + ", " + bins['max'].astype(str) + "]"
    bins = bins[['nodo', 'C_' + var_name]]
    df['in'] = df.index
    df = df.merge(bins, how='inner', on='nodo')
    df.index = df['in']
    df.sort_index(inplace=True)
    df.drop(['in', 'nodo'], axis=1, inplace=True)

    missing_df['C_' + var_name] = 'Null'
    missing_df['in'] = missing_df.index
    missing_df.index = missing_df['in']
    missing_df.drop('in', axis=1, inplace=True)
    df = pd.concat([df, missing_df]).sort_index()
    df.index.name = None
    return df['C_' + var_name]
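
# Usage sketch for binning_opt (illustration only, not part of the original
# snippet): bin a numeric Series into tree-chosen intervals; missing values
# end up in a separate 'Null' category.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x_demo = pd.Series(rng.rand(200), name="score")
x_demo.iloc[:5] = np.nan
y_demo = (rng.rand(200) < x_demo.fillna(0.5)).astype(int).rename("target")

print(binning_opt(x_demo, y_demo, depth=2).value_counts())
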
def trainModel(data, features, label):
    """
    分别使用「逻辑回归」、「决策树」、「逻辑回归+决策树」建模
    :param data:
    :return:
    """
    res = {}
    trainData, testData = train_test_split(data, test_size=0.5)

    # Logistic regression alone
    logitModel = LogisticRegression()
    logitModel.fit(trainData[features], trainData[label])
    logitProb = logitModel.predict_proba(testData[features])[:, 1]
    res["logit"] = roc_curve(testData[label], logitProb)
    # Decision tree alone
    dtModel = DecisionTreeClassifier(max_depth=2)
    dtModel.fit(trainData[features], trainData[label])
    dtProb = dtModel.predict_proba(testData[features])[:, 1]
    res["DT"] = roc_curve(testData[label], dtProb)

    trainDT, trainLR = train_test_split(trainData, test_size=0.5)

    m = 2
    _dt = DecisionTreeClassifier(max_depth=2)
    _dt.fit(trainDT[features[:m]], trainDT[label])
    leafNode = _dt.apply(trainDT[features[: m]]).reshape(-1, 1)
    coder = OneHotEncoder()
    coder.fit(leafNode)
    newFeature = np.c_[
        coder.transform(_dt.apply(trainLR[features[:m]]).reshape(-1, 1)).toarray(),
        trainLR[features[m:]]]
    _logit = LogisticRegression()
    _logit.fit(newFeature[:, 1:], trainLR[label])
    testFeature = np.c_[
        coder.transform(_dt.apply(testData[features[:m]]).reshape(-1, 1)).toarray(),
        testData[features[m:]]]
    dtLogitProb = _logit.predict_proba(testFeature[:, 1:])[:, 1]
    res["DT + logit"] = roc_curve(testData[label], dtLogitProb)
    return res
Example #14
    def extract_table(cls, N: np.ndarray, y: np.ndarray,
                      model: DecisionTreeClassifier) -> np.ndarray:
        """Precompute ``model``, ``table`` and ``tree_depth``.

        Parameters
        ----------
        N : :obj:`np.ndarray`
            Attributes from fitted data.

        y : :obj:`np.ndarray`
            Target attribute from fitted data.

        random_state : :obj:`int`, optional
            If int, random_state is the seed used by the random number
            generator; If RandomState instance, random_state is the random
            number generator; If None, the random number generator is the
            RandomState instance used by np.random.

        kwargs:
            Additional arguments. May have previously precomputed before this
            method from other precomputed methods, so they can help speed up
            this precomputation.

        Returns
        -------
        :obj:`np.ndarray`
            Tree property table.
                - Each line represents a node.
                - Column 0: It is the id of the attributed splited in that
                  node.
                - Column 1: It is 1 if the node is a leaf node, otherwise 0.
                - Columns 2: It is the number of examples that fall on that
                  node.
                - Columns 3: It is 0 if the node is not a leaf, otherwise is
                  the class number represented by that leaf node.
        """
        table = np.zeros((model.tree_.node_count, 4))  # type: np.ndarray
        table[:, 0] = model.tree_.feature
        table[:, 2] = model.tree_.n_node_samples

        leaves = model.apply(N)  # type: np.ndarray
        if not isinstance(y, np.number):
            _, y = np.unique(y, return_inverse=True)
        tmp = np.array([leaves, y + 1])  # type: np.ndarray

        x = 0  # type: int
        for x in set(leaves):
            table[x, 3] = list(Counter(tmp[1, tmp[0, :] == x]).keys())[0] + 1
            table[x, 1] = 1

        return table
def beginWork(home, logger):
    constant = CONSTANT(home)
    logger.info("========== get subgroup ==========".center(CONSTANT.logLength, "="))
    data, featureName = loadData(constant.getDataFilteredPath(), logger)
    X, Y = data[:, :-1], data[:, -1]
    size = X.shape[0]

    subgroupSizes = [4, 8, 16]
    cartModelDir = constant.getCartModelDir()
    shutil.rmtree(cartModelDir)
    os.makedirs(cartModelDir)
    for subgroupSize in subgroupSizes:
        curCartModelDir = os.path.join(cartModelDir, "subgroup-"+str(subgroupSize))
        os.makedirs(curCartModelDir)
        dotFilePath = os.path.join(curCartModelDir, "dot.dot")
        paramsTxtPath = os.path.join(curCartModelDir, "params.txt")
        subgroupsSavedPath = os.path.join(curCartModelDir, "subgroups.pkl")
        
        subgroups = {}
        avgSize = size // subgroupSize
        param = {
            "criterion": "gini",
            "min_samples_leaf": avgSize*2 // 3,
            "min_samples_split": avgSize*2
        }
        clf = DecisionTreeClassifier(**param).fit(X, Y)

        # Save the tree structure & hyperparameters
        with open(dotFilePath, "w", encoding="utf-8") as file:
            file = export_graphviz(clf, out_file = file, feature_names = featureName, 
                    filled = True, rounded = True, special_characters = True)
        logger.info("cart structure saved in {}".format(dotFilePath))
        with open(paramsTxtPath, "w", encoding='utf-8') as file:
            file.write(str(clf.get_params()))
        logger.info("cart params saved in {}".format(paramsTxtPath))
        logger.info(param)
                            
        # Obtain and save the subgroups
        itemIndex = clf.apply(X)
        for sampleIndex, groupIndex in enumerate(itemIndex):
            if subgroups.get(groupIndex) is None:
                subgroups[groupIndex] = []
            subgroups[groupIndex].append(sampleIndex)
        with open(subgroupsSavedPath, "wb") as file:
            pickle.dump(subgroups, file)
        logger.info("the number of subgroup:{}".format(len(subgroups)))
        logger.info("subgroup data saved in {}".format(subgroupsSavedPath))
        logger.info("{0}{1}".format("subgroup index".center(20), "subgroup size".format(20)))
        for subgroupName, subgroup in subgroups.items():
            logger.info("{0}{1}".format(str(subgroupName).center(20), str(len(subgroup)).center(20)))
    logger.info("==================================".center(CONSTANT.logLength, "="))
Example #16
 def treeBinning(self, trainN, i=6, column="column", cutPoint=0.0):
     tree = DecisionTreeClassifier(max_leaf_nodes=i,
                                   min_samples_leaf=int(
                                       np.rint(trainN.shape[0] * 0.06)))
     X_select = pd.DataFrame(
         trainN[trainN[column] > cutPoint][column]
     )  # this filter assumes all values < 0 as special and bins separately
     tree.fit(X_select, self.y_train[X_select.index])
     X_select["Node"] = tree.apply(X_select)
     X_select = pd.concat([
         X_select,
         pd.DataFrame({
             column: trainN[trainN[column] <= cutPoint][column],
             "Node": -1
         })
     ])
     X_select = pd.concat([X_select, self.y_train], axis=1)
     test = pd.concat([
         X_select.pivot_table(index="Node",
                              values=column,
                              aggfunc=[np.min, np.max],
                              margins=True),
         X_select.pivot_table(index="Node",
                              values=self.target,
                              aggfunc=[np.sum, len],
                              margins=True)
     ],
                      axis=1)
     test.rename(columns={"sum": "badCnt", "len": "totalCnt"}, inplace=True)
     test["goodCnt"] = test["totalCnt"] - test["badCnt"]
     test["popPercentage"] = test["totalCnt"] / test.loc["All",
                                                         "totalCnt"] * 100
     test["badRate"] = test["badCnt"] / test["totalCnt"] * 100
     test.columns = [
         "amin", "amax", "badCnt", "totalCnt", "goodCnt", "popPercentage",
         "badRate"
     ]
     test = test.sort_values(by="amax")
     test["badSign"] = np.sign(test.badRate - test.badRate.shift(-1))
     test.iloc[test.shape[0] - 2, 7] = np.nan
     test["badDistribution"] = test["badCnt"] / test.loc["All", "badCnt"]
     test["goodDistribution"] = test["goodCnt"] / test.loc["All", "goodCnt"]
     test["distributedGoodBad"] = test["goodDistribution"] - test[
         "badDistribution"]
     test["WOE"] = np.log(test["goodDistribution"] /
                          test["badDistribution"])
     test["IV"] = test["WOE"] * test["distributedGoodBad"] * 100
     test.loc["All", "IV"] = np.sum(test["IV"])
     test["column"] = column
     return test[[
         "amin", "amax", "popPercentage", "IV", "badRate", "badSign",
         "column"
     ]]
def train_model(data, features, label):
    """
    分别使用逻辑回归、决策树和决策树+逻辑回归建模
    """
    res = {}
    train_data, test_data = train_test_split(data, test_size=0.5)
    # Logistic regression alone
    logit_model = LogisticRegression()
    logit_model.fit(train_data[features], train_data[label])
    logit_prob = logit_model.predict_proba(test_data[features])[:, 1]
    res["logit"] = roc_curve(test_data[label], logit_prob)
    # Decision tree alone
    dt_model = DecisionTreeClassifier(max_depth=2)
    dt_model.fit(train_data[features], train_data[label])
    dt_prob = dt_model.predict_proba(test_data[features])[:, 1]
    res["DT"] = roc_curve(test_data[label], dt_prob)
    # Combine the decision tree with logistic regression
    # To avoid overfitting, train the decision tree and the logistic regression on different data
    train_DT, train_LR = train_test_split(train_data, test_size=0.5)
    # Use the decision tree to transform the first two variables
    m = 2
    _dt = DecisionTreeClassifier(max_depth=2)
    _dt.fit(train_DT[features[:m]], train_DT[label])
    leaf_node = _dt.apply(train_DT[features[:m]]).reshape(-1, 1)
    coder = OneHotEncoder()
    coder.fit(leaf_node)
    new_feature = np.c_[coder.transform(
        _dt.apply(train_LR[features[:m]]).reshape(-1, 1)).toarray(),
                        train_LR[features[m:]]]
    _logit = LogisticRegression()
    _logit.fit(new_feature[:, 1:], train_LR[label])
    test_feature = np.c_[coder.transform(
        _dt.apply(test_data[features[:m]]).reshape(-1, 1)).toarray(),
                         test_data[features[m:]]]
    dt_logit_prob = _logit.predict_proba(test_feature[:, 1:])[:, 1]
    res["DT + logit"] = roc_curve(test_data[label], dt_logit_prob)
    return res
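
# Usage sketch for train_model (illustration only, not part of the original
# snippet): build a small synthetic DataFrame, then plot the ROC curve of each
# model returned in the result dict.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=1000, n_features=4, random_state=0)
cols = ["x1", "x2", "x3", "x4"]
df_demo = pd.DataFrame(X_demo, columns=cols)
df_demo["label"] = y_demo

curves = train_model(df_demo, cols, "label")
for name, (fpr, tpr, _) in curves.items():
    plt.plot(fpr, tpr, label=name)
plt.legend()
plt.show()
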
class TreeClassificationTransformer(BaseTransformer):
    def __init__(self, kwargs={}):
        """
        Doc strings here.
        """

        self.kwargs = kwargs

        self._is_fitted = False

    def fit(self, X, y):
        """
        Doc strings here.
        """

        X, y = check_X_y(X, y)

        # define the ensemble
        self.transformer = DecisionTreeClassifier(**self.kwargs).fit(X, y)

        self._is_fitted = True

        return self

    def transform(self, X):
        """
        Doc strings here.
        """

        if not self.is_fitted():
            msg = (
                "This %(name)s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this transformer."
            )
            raise NotFittedError(msg % {"name": type(self).__name__})

        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """
        Doc strings here.
        """

        return self._is_fitted
Example #19
    def countDCP(self, df, classes, minimum_impurity_split):

        values = np.empty([0, 0])

        estimator = DecisionTreeClassifier(min_impurity_split=minimum_impurity_split)
        estimator.fit(df, classes)
        leafsIndexes = estimator.apply(df, check_input=True)

        for index, _ in df.iterrows():
            suma = 0
            value = 0
            for leafIndex, _ in df.iterrows():
                if leafsIndexes[index] == leafsIndexes[leafIndex]:
                    suma += 1
                    if classes[index] == classes[leafIndex]:
                        value += 1
            values = np.append(values, np.full((1, 1), (value / suma) * -1))
            # print("Count DCP for " + repr(index) + ". row of data.")

        return values
Example #20
def learnTrees_and_return_segments(depth):
    global dt
    global features
    global targets
    features = list(df.columns)
    target_feature = features[-1]
    features = list(features[:len(features)-1])
    targets = df[target_feature].unique()
    print('targets:', targets)
    print('features:', features)
    y=df[target_feature]
    X=df[features]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.0,random_state=0)
    #dt = DecisionTreeRegressor(max_depth=depth)#, min_samples_split=20, random_state=99)
    dt = DecisionTreeClassifier(max_depth=depth)#, min_samples_split=20, random_state=99)
    dt.fit(X_train,y_train)
    prediction = dt.predict(X_train)
    print('accuracy:', accuracy_score(y_train,prediction))
    print('recall:', recall_score(y_train,prediction,average=None))
    print('classification report:')
    print(classification_report(y_train,prediction))
    print('num correct:', accuracy_score(y_train,prediction) * len(y_train))
    print('num incorrect:', (1-accuracy_score(y_train,prediction)) * len(y_train))
    print('R2 Score:', r2_score(y_train,prediction))
    print('absolute error:', mean_absolute_error(y_train,prediction)*len(X_train))
    
    app = dt.apply(X)
    
    uni=np.unique(app)
    
    segments_set=[[[]] for i in uni]
    
    for i in range(len(app)):
        
        index=int(np.where(uni==app[i])[0])
        
        segments_set[index][0].append(i)
        
    #segments_set=[copy.copy(segments_set) for i in uni]
        
    return targets, segments_set, mean_absolute_error(y_train,prediction)*len(X_train)
Example #21
 def fit(X0, W0, X1, W1, **kwargs):
     X = np.concatenate([as_features(X0), as_features(X1)])
     Y = np.array([0] * W0.size + [1] * W1.size)
     W = np.concatenate([W0, W1])
     T = DecisionTreeClassifier(class_weight="balanced", **kwargs)
     T.fit(X, Y, sample_weight=W)
     _, *shape = X0.shape
     tree = T.tree_
     feature = [
         np.unravel_index(f, shape) if f >= 0 else None
         for f in tree.feature
     ]
     leaf = T.apply(X)
     pred = np.empty(tree.node_count)
     for n in range(tree.node_count):
         mask = leaf == n
         w0 = (W * mask * (Y == 0)).sum() + 1e-3
         w1 = (W * mask * (Y == 1)).sum() + 1e-3
         pred[n] = np.log(w1 / w0) / 2
     return DTree(feature, tree.threshold, tree.children_left,
                  tree.children_right, pred)
    def countDS(self, df, classes):

        values = np.empty([0, 0])

        estimator = DecisionTreeClassifier()
        estimator.fit(df, classes)
        leafsIndexes = estimator.apply(df, check_input=True)
        leafs = np.zeros(estimator.tree_.node_count)
        # count number of instances for every leaf
        for leafIndex in leafsIndexes:
            leafs[leafIndex] += 1

        biggestDisjunct = max(leafs) - 1
        # count fraction for every instance
        for leafIndex in leafsIndexes:
            values = np.append(
                values,
                np.full((1, 1),
                        ((leafs[leafIndex] - 1) / biggestDisjunct) * -1))

        return values
Example #23
    def tree_bins_func(self, grps=None, pct_size=None):
        """
        基于决策树(信息熵)的分组
        1.max_grps控制最大分组的个数;
        2.pct_size控制每组最低的样本占比
        """
        tmp = self.raw.copy().dropna()
        if pct_size is None:
            smp_size = int(len(tmp) * self.argms['pct_size']) + 1
        else:
            smp_size = int(len(tmp) * pct_size) + 1
        if grps is None:
            grps = self.argms['max_grps']

        # When one feature value's share exceeds the threshold, do not split further; use only 2 groups
        # Use a decision tree as the binning tool
        clf = DecisionTreeClassifier(min_samples_leaf=smp_size,
                                     max_leaf_nodes=grps)
        clf.fit(tmp[[self.ft_name]], tmp['label'])

        tmp['grp_prd'] = clf.apply(tmp[[self.ft_name]])

        grp_info = tmp.groupby('grp_prd').min()
        grp_info.sort_values(self.ft_name, inplace=True, ascending=True)
        cuts = list(grp_info[self.ft_name]) + [tmp[self.ft_name].max() + 1]

        cuts = self._smpSizeCheck_real(tmp, cuts, smp_size)

        self.bins = {self.ft_name: cuts}
        self.cap_info = {
            'max': tmp[self.ft_name].max(),
            'min': tmp[self.ft_name].min()
        }
        if len(cuts) == 2:
            self.woe_check = {
                self.ft_name: 'tree_bins_func_failed!-value biased'
            }
        else:
            self.woe_check = {}
Example #24
def IV(df,
       var,
       target,
       n_levels_to_factor_threshold=5,
       calc_type='Categorical',
       Min_Category_Share=0.05,
       nbins=10):
    # cut numeric features
    if is_numeric(
            df[var]) and len(df[var].unique()) > n_levels_to_factor_threshold:
        if calc_type == 'Interval':
            df = pd.DataFrame({
                var: qcut(df[var], q=nbins, duplicates='drop'),
                target: df[target]
            })

        elif calc_type == 'Categorical':
            tree = DecisionTreeClassifier(
                criterion='entropy',
                min_samples_leaf=int(df.shape[0] * Min_Category_Share),
                random_state=1223)  # presort was removed in newer scikit-learn
            tree.fit(df.loc[~df[var].isna(), [var]], df.loc[~df[var].isna(),
                                                            target])

            tmp = tree.apply(df.loc[~df[var].isna(), [var]]).astype(str)
            tmp = ['leaf_' + e for e in tmp]

            tmp2 = ~df[var].isna()

            df = pd.DataFrame({var: ['NA'] * df.shape[0], target: df[target]})
            df.loc[tmp2, var] = tmp

    # calculate IV
    rez = WoE_full(df, var, target)
    rez = sum(rez['WoE'] * (rez['GR'] - rez['BR']))

    # return result
    return rez
OUT_FOLDER = "data/COMPAS/holdout/recidivism_%s.csv"

df = pd.read_csv("data/COMPAS/recidivism.csv")

X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.165,
                                                    random_state=69)

clf = DecisionTreeClassifier(random_state=0, min_samples_leaf=300)
clf.fit(X, y)

l_train = clf.apply(X_train)
l_test = clf.apply(X_test)
l_indexes = np.unique(l_train)
clusters = []
for l_index in l_indexes:
    c_train_indexes = np.where(l_train == l_index)
    c_test_indexes = np.where(l_test == l_index)
    cluster = (l_index, c_train_indexes[0], c_test_indexes[0])
    clusters.append(cluster)
clusters.sort(key=lambda x: len(x[1]))


def store_data(outfile, X, y):
    df = pd.concat([X, y], axis=1)
    df.to_csv(outfile)
# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = tree.decision_path(X)

# Similarly, we can also have the leaves ids reached by each sample.
leave_id = tree.apply(X)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

#sample_id = 0
#node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
#                                    node_indicator.indptr[sample_id + 1]]
#
#print('Rules used to predict sample %s: ' % sample_id)
#for node_id in node_index:
#    if leave_id[sample_id] == node_id:
#        continue
#
#    if (X[sample_id, feature[node_id]] <= threshold[node_id]):
#        threshold_sign = "<="
Example #27
clf.fit(X_train, y_train)

print("showing prediction results (first 10) [1=infected, 0 = non infected]:")
y_pred = clf.predict(X_test)
print(y_pred[:5])
a = X_test[:1]

print("a:")
print(a)

#print('sk_pred: {}'.format(clf.predict(a)))
#print('true: {}'.format(y_test[:3]))

# shows the end point of the tree traversal for a sample
print("Returns the index of the leaf that each sample is predicted as:")
index_of_leaf = clf.apply(a)
print(index_of_leaf)

# decision path shows the nodes of the tree that were traversed by the sample.
print("decision path:")
d_path = clf.decision_path(a)
print(d_path)

print("nodes in the decision path:")
n_d_path = np.unique(np.sort(d_path.indices))
print(n_d_path)

print("probability of each class:")
print(clf.predict_proba(a))

print("Feature importances:")
class TreeClassificationTransformer(BaseTransformer):
    """
    A class used to transform data from a category to a specialized representation.

    Attributes
    ----------
    kwargs : dict
        A dictionary to contain parameters of the tree.
    _is_fitted : bool
        A boolean to identify if the model is currently fitted.

    Methods
    ----------
    fit(X, y)
        Fits the transformer to data X with labels y.
    transform(X)
        Performs inference using the transformer.
    is_fitted()
        Indicates whether the transformer is fitted.
    """
    def __init__(self, kwargs={}):

        self.kwargs = kwargs

        self._is_fitted = False

    def fit(self, X, y):
        """
        Fits the transformer to data X with labels y.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        y : ndarray
            Output (i.e. response data matrix).
        """

        X, y = check_X_y(X, y)

        # define the ensemble
        self.transformer = DecisionTreeClassifier(**self.kwargs).fit(X, y)

        self._is_fitted = True

        return self

    def transform(self, X):
        """
        Performs inference using the transformer.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        """

        if not self.is_fitted():
            msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
                   "appropriate arguments before using this transformer.")
            raise NotFittedError(msg % {"name": type(self).__name__})

        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """
        Indicates whether the transformer is fitted.

        Parameters
        ----------
        None
        """

        return self._is_fitted
# ``decision_path`` method outputs an indicator matrix that allows us to
# retrieve the nodes the samples of interest traverse through. A non zero
# element in the indicator matrix at position ``(i, j)`` indicates that
# the sample ``i`` goes through the node ``j``. Or, for one sample ``i``, the
# positions of the non zero elements in row ``i`` of the indicator matrix
# designate the ids of the nodes that sample goes through.
#
# The leaf ids reached by samples of interest can be obtained with the
# ``apply`` method. This returns an array of the node ids of the leaves
# reached by each sample of interest. Using the leaf ids and the
# ``decision_path`` we can obtain the splitting conditions that were used to
# predict a sample or a group of samples. First, let's do it for one sample.
# Note that ``node_indicator`` is a sparse matrix.

node_indicator = clf.decision_path(X_test)
leaf_id = clf.apply(X_test)

sample_id = 0
# obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]

print("Rules used to predict sample {id}:\n".format(id=sample_id))
for node_id in node_index:
    # continue to the next node if it is a leaf node
    if leaf_id[sample_id] == node_id:
        continue

    # check if value of the split feature for sample 0 is below threshold
    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
Example #30
# @author: Samruddhi Somani
exec(open('Original.py').read())
exec(open('tfidf.py').read())

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

#fitting/examining tree
s2=DecisionTreeClassifier(max_depth=3, random_state=5)
s2.fit(x,cuisine)
leaves2=pd.Series(s2.apply(x),name='leaves')
idk=pd.concat([cuisine,leaves2],axis=1)
m=list(leaves2.value_counts().index.values) #[3, 6, 10, 4, 7, 13, 11, 14]
for y in m:
    print(y)
    print(idk[leaves2==y]['cuisine'].value_counts())
leaves2.value_counts()

#==============================================================================
# 3     33684: SGD SVM
# 6      2991: SGD SVM
# 10     1848: Naive Bayes/Logistic Regression
# 4       914: Naive Bayes/Logistic Regression
# 7       300: Logistic Regression
# 13       20: Naive Bayes
# 11       14: Naive Bayes

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    # skip the leaf node itself; it carries no split test
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
# -*- coding: utf-8 -*-

exec(open('Original.py').read())
exec(open('tfidf.py').read())

from sklearn.tree import DecisionTreeClassifier
import pandas as pd

s=DecisionTreeClassifier(max_depth=2, random_state=5)
s.fit(x,cuisine)
leaves=pd.Series(s.apply(x),name='leaves')
idk=pd.concat([cuisine,leaves],axis=1)
m=list(leaves.value_counts().index.values)
for y in m:
    print(y)
    print(idk[leaves==y]['cuisine'].value_counts())
leaves.value_counts()


s2=DecisionTreeClassifier(max_depth=3, random_state=5)
s2.fit(x,cuisine)
leaves2=pd.Series(s2.apply(x),name='leaves')
idk=pd.concat([cuisine,leaves2],axis=1)
m=list(leaves2.value_counts().index.values)
for y in m:
    print(y)
    print(idk[leaves2==y]['cuisine'].value_counts())
leaves2.value_counts()


s3=DecisionTreeClassifier(max_leaf_nodes=8, random_state=5,criterion='entropy')
s3.fit(x,cuisine)
        print(
            "%snode=%s test node: go to node %s if X[:, %s] <= %ss else to "
            "node %s." % (node_depth[i] * "\t", i, children_left[i], feature[i], threshold[i], children_right[i])
        )
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]]

print("Rules used to predict sample %s: " % sample_id)
for node_id in node_index:
    # skip the leaf node itself; it carries no split test
    if leave_id[sample_id] == node_id:
        continue

    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
Example #34
        #divide each column over sum of rows
        df_new=df.div(df.sum(axis=0),axis='columns').fillna(0)
    return df_new

def hmwrapper(cm,filename):
    h=heatmap(cm).get_figure()
    ax=h.add_subplot(111)
    ax.set_xlabel('Predictions')
    h.tight_layout()
    h.set_size_inches(8,5.5)
    h.savefig(filename,bbox_inches='tight',dpi=100)

#fitting/examining tree
s2=DecisionTreeClassifier(max_leaf_nodes=10, min_samples_leaf=500, random_state=5)
s2.fit(x,cuisine)
leaves2=pd.Series(s2.apply(x),name='leaves')
idk=pd.concat([cuisine,leaves2],axis=1)
m=list(leaves2.value_counts().index.values) #[3, 6, 10, 4, 7, 13, 11, 14]
for y in m:
    print(y)
    print(idk[leaves2==y]['cuisine'].value_counts())
leaves2.value_counts()

#==============================================================================
# 3     33684: SGD SVM
# 6      2991: SGD SVM
# 10     1848: Naive Bayes/Logistic Regression
# 4       914: Naive Bayes/Logistic Regression
# 7       300: Logistic Regression
# 13       20: Naive Bayes
# 11       14: Naive Bayes