def dt_log(m, features, label):
    trainDT, testDT = train_test_split(data_load, test_size=0.2, random_state=1)
    # trainDT, cvDT = train_test_split(trainDT, test_size=0.2, random_state=1)
    dt = DecisionTreeClassifier(max_depth=3)
    dt.fit(trainDT[features[:m]], trainDT[label])
    # use the leaf index each sample falls into as a new categorical feature
    leaf = dt.apply(trainDT[features[:m]])
    leafNode = leaf.reshape(-1, 1)
    coder = OneHotEncoder()
    coder.fit(leafNode)
    newFeature = np.c_[
        coder.transform(dt.apply(trainDT[features[:m]]).reshape(-1, 1)).toarray(),
        trainDT[features[m:]]]
    logit = LogisticRegression()
    logit.fit(newFeature[:, 1:], trainDT[label].values.ravel())
    testFeature = np.c_[
        coder.transform(dt.apply(testDT[features[:m]]).reshape(-1, 1)).toarray(),
        testDT[features[m:]]]
    y_predprob = logit.predict_proba(testFeature[:, 1:])
    y_pred = np.argmax(y_predprob, axis=1)
    print(confusion_matrix(testDT[label]['retention_status'].values, y_pred))
    print("Accuracy : %.4g" % accuracy_score(testDT[label]['retention_status'].values, y_pred))
    print("AUC Score (Test): %f" % roc_auc_score(testDT[label]['retention_status'].values, y_predprob[:, 1]))
    # roc_curve needs a 1-D target, not the label DataFrame
    res = roc_curve(testDT[label]['retention_status'].values, y_predprob[:, 1])
    plot_roc(res)

class TreeDiscretizer:
    """A DecisionTreeClassifier whose leaf indices are used as cluster indices.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features), floats
        The feature matrix of the training set.
    y_train : array-like, shape (n_samples,), nonnegative ints
        The labels of X_train.
    J : int
        The maximum number of desired clusters.
    criterion : string, optional
        The split criterion of the underlying DecisionTreeClassifier,
        'gini' by default.
    seed : nonnegative int, optional
        This seed for the random number generator makes random splits
        reproducible.
    """

    def __init__(self, X_train, y_train, J, criterion='gini', seed=None):
        self.classifier = DecisionTreeClassifier(max_leaf_nodes=J,
                                                 criterion=criterion,
                                                 random_state=seed)
        self.classifier.fit(X_train, y_train)
        x_train = self.classifier.apply(X_train)
        # map raw (non-contiguous) leaf ids to contiguous cluster indices
        self.indexmap = dict(
            zip(np.unique(x_train), range(len(np.unique(x_train)))))

    def discretize(self, X):
        x_raw = self.classifier.apply(X)  # the raw leaf indices of X
        return np.array([self.indexmap[x] for x in x_raw])

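# A minimal usage sketch for TreeDiscretizer above, assuming scikit-learn's
# iris data as a stand-in training set; the names X, y, and disc are
# illustrative only.
import numpy as np
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
disc = TreeDiscretizer(X, y, J=4, seed=0)
clusters = disc.discretize(X)
print(np.unique(clusters))  # contiguous cluster ids, at most J of them
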
def compute_new_features(X_train, y_train, X_test, y_test):
    classifier = DecisionTreeClassifier(max_leaf_nodes=50)
    classifier.fit(X_train, y_train)
    # leaf index of each training sample, one-hot encoded as new features
    idx_train = classifier.apply(X_train).reshape([-1, 1])
    enc = OneHotEncoder()
    enc.fit(idx_train)
    new_features_train = enc.transform(idx_train).toarray()
    # the same encoder handles the test set: its leaves come from the same tree
    idx_test = classifier.apply(X_test).reshape([-1, 1])
    new_features_test = enc.transform(idx_test).toarray()
    return [np.hstack([X_train, new_features_train]),
            np.hstack([X_test, new_features_test])]

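# A quick sketch of compute_new_features on synthetic data, assuming numpy,
# DecisionTreeClassifier, and OneHotEncoder are imported as above; all
# variable names here are illustrative.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
X_tr_new, X_te_new = compute_new_features(X_tr, y_tr, X_te, y_te)
print(X_tr.shape, X_tr_new.shape)  # original columns plus one column per leaf
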
class DecisionTreeCalibration(BaseEstimator, TransformerMixin):
    """Calibration via a decision tree with a logistic regression per leaf."""

    def __init__(self, model, tree_max_depth=3, rs=17):
        self.model = model
        self.rs = rs
        self.dt_calib = DecisionTreeClassifier(max_depth=tree_max_depth,
                                               random_state=rs)
        self.logits = {}

    def fit(self, X: pd.DataFrame, y: pd.Series):
        # fit the calibration tree
        self.dt_calib.fit(X[self.model.used_features], y)
        leafs = self.dt_calib.apply(X[self.model.used_features])
        # fit a logistic regression for each leaf on the base model's scores
        for leaf in np.unique(leafs):
            lr = LogisticRegression(random_state=self.rs)
            X_sub = X[leafs == leaf]
            y_pred_sub = self.model.transform(X_sub)
            y_sub = y[leafs == leaf]
            lr.fit(y_pred_sub.reshape(-1, 1), y_sub)
            self.logits[leaf] = lr
        return self

    def transform(self, X: pd.DataFrame):
        pred_df = pd.DataFrame(
            {
                "y_pred": self.model.transform(X),
                "leaf": self.dt_calib.apply(X[self.model.used_features])
            },
            index=X.index)
        calibrated = []
        # apply each leaf's own logistic model
        for lf in np.unique(pred_df.leaf):
            idx_sub = pred_df[pred_df.leaf == lf].index
            y_pred_sub = np.array(pred_df[pred_df.leaf == lf].y_pred).reshape(-1, 1)
            calibrated.append(pd.Series(
                self.logits[lf].predict_proba(y_pred_sub)[:, 1], index=idx_sub))
        # pd.Series.append was removed in pandas 2.0; concatenate instead
        return pd.concat(calibrated)

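# A hedged usage sketch for DecisionTreeCalibration. The wrapped base model
# only needs a `used_features` attribute and a `transform(X)` returning scores;
# the ScoreModel class below is a hypothetical stand-in, not part of the
# original code.
class ScoreModel:
    def __init__(self, clf, used_features):
        self.clf = clf                      # any fitted probabilistic classifier
        self.used_features = used_features  # feature names the model consumes

    def transform(self, X):
        return self.clf.predict_proba(X[self.used_features])[:, 1]

# base = ScoreModel(fitted_clf, used_features=["f1", "f2"])   # fitted_clf: hypothetical
# calib = DecisionTreeCalibration(base, tree_max_depth=3).fit(X_df, y_sr)
# p_calibrated = calib.transform(X_df)
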
class TreeClassificationTransformer(BaseTransformer):
    """
    A class used to transform data from a category to a specialized
    representation.

    Parameters
    ----------
    kwargs : dict, default={}
        A dictionary to contain parameters of the tree.

    Attributes
    ----------
    transformer : sklearn.tree.DecisionTreeClassifier
        An internal sklearn DecisionTreeClassifier.
    """

    def __init__(self, kwargs={}):
        self.kwargs = kwargs

    def fit(self, X, y):
        """
        Fits the transformer to data X with labels y.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        y : ndarray
            Output (i.e. response) data matrix.

        Returns
        -------
        self : TreeClassificationTransformer
            The object itself.
        """
        X, y = check_X_y(X, y)
        self.transformer_ = DecisionTreeClassifier(**self.kwargs).fit(X, y)
        return self

    def transform(self, X):
        """
        Performs inference using the transformer.

        Parameters
        ----------
        X : ndarray
            Input data matrix.

        Returns
        -------
        X_transformed : ndarray
            The transformed input.

        Raises
        ------
        NotFittedError
            When the model is not fitted.
        """
        check_is_fitted(self)
        X = check_array(X)
        return self.transformer_.apply(X)

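# A brief usage sketch, assuming the supporting names used by the class
# (BaseTransformer, check_X_y, check_array, check_is_fitted) are importable
# in this module; iris is an illustrative stand-in dataset.
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
leaf_ids = TreeClassificationTransformer(kwargs={"max_depth": 3}).fit(X, y).transform(X)
print(leaf_ids[:10])  # one leaf id per sample
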
class DecisionTreeModel:
    # Wraps an actual DecisionTreeClassifier from scikit-learn in the "model"
    # attribute, converting TabularData inputs where needed.

    def __init__(self, *args, **kwargs):
        self.model = DecisionTreeClassifier(*args, **kwargs)

    def get_model(self):
        return self.model

    def apply(self, X, check_input=True):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.apply(X, check_input)

    def cost_complexity_pruning_path(self, X, y, sample_weight=None):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        if isinstance(y, TabularData):
            y = DataConversion.extract_y(y)
        return self.model.cost_complexity_pruning_path(X, y, sample_weight)

    def decision_path(self, X, check_input=True):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.decision_path(X, check_input)

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        if isinstance(y, TabularData):
            y = DataConversion.extract_y(y)
        # note: the X_idx_sorted argument was removed in newer scikit-learn
        self.model.fit(X, y, sample_weight, check_input, X_idx_sorted)
        return self

    def predict(self, X, check_input=True):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.predict(X, check_input)

    def predict_log_proba(self, X):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.predict_log_proba(X)

    def predict_proba(self, X, check_input=True):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        return self.model.predict_proba(X, check_input)

    def score(self, X, y, sample_weight=None):
        if isinstance(X, TabularData):
            X = DataConversion.extract_X(X)
        if isinstance(y, TabularData):
            y = DataConversion.extract_y(y)
        return self.model.score(X, y, sample_weight)

    def __getattribute__(self, item):
        try:
            return super().__getattribute__(item)
        except AttributeError:
            # delegate anything not defined here to the wrapped model
            return getattr(self.model, item)

def findLeafDepthWithoutPrunning(self, df, classes):
    values = np.empty([0, 0])
    estimator = DecisionTreeClassifier()
    estimator.fit(df, classes)
    n_nodes = estimator.tree_.node_count
    # print(n_nodes)
    children_left = estimator.tree_.children_left
    children_right = estimator.tree_.children_right
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()
        node_depth[node_id] = parent_depth + 1
        # If we have a test node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
    leafsIndexes = estimator.apply(df, check_input=True)
    for index in leafsIndexes:
        values = np.append(values, np.full((1, 1), node_depth[index]))
    return values

def fit_function(self, comb, model):
    """
    Fits a single logistic regression, decision tree, or knn model.

    comb: feature combination to fit the model over
    model: 'dt', 'lr', or 'nn'
    """
    X_val = self.X_val.iloc[:, list(comb)]
    if np.shape(X_val)[0] == 1:
        # note: pandas DataFrames have no .reshape; this branch expects an ndarray
        X_val = X_val.reshape(-1, 1)
    # fit decision tree, logistic regression, or knn
    if model == 'dt':
        dt = DecisionTreeClassifier(max_depth=len(comb))
        dt.fit(X_val, self.Y_val)
        # calculate val_acc; note apply() returns leaf indices, not class
        # predictions -- use dt.predict() for an actual accuracy score
        Y_pred = dt.apply(self.X_val.iloc[:, list(comb)])
        from sklearn.metrics import accuracy_score
        # print("va - ", comb, "\t", accuracy_score(self.Y_val, Y_pred))
        return dt
    elif model == 'lr':
        lr = LogisticRegression(multi_class='auto', n_jobs=self.n_jobs)
        lr.fit(X_val, self.Y_val)
        return lr
    elif model == 'nn':
        nn = KNeighborsClassifier(algorithm='kd_tree', n_jobs=self.n_jobs)
        nn.fit(X_val, self.Y_val)
        return nn

def get_decision_paths(model: tree.DecisionTreeClassifier, data, selection):
    selected_rows = data.loc[selection.astype(bool), :]
    d_path = model.decision_path(selected_rows)
    paths = set()
    leaf_id = model.apply(selected_rows)
    feature = model.tree_.feature
    threshold = model.tree_.threshold
    for sample_id in range(len(selected_rows.index)):
        node_idx = d_path.indices[d_path.indptr[sample_id]:
                                  d_path.indptr[sample_id + 1]]
        rules = []
        for node_id in node_idx:
            # skip the leaf itself: only split nodes carry a rule
            if leaf_id[sample_id] == node_id:
                continue
            # a tree split sends samples left on "<=" and right on ">"
            if selected_rows.iloc[sample_id, feature[node_id]] <= threshold[node_id]:
                sign = " <= "
            else:
                sign = " > "
            rule = (data.columns[feature[node_id]] + sign +
                    str(round(threshold[node_id], 2)))
            rules.append(rule)
        paths.add(tuple(rules))
    return [list(path) for path in paths]

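# A small runnable sketch for get_decision_paths, assuming numpy and pandas
# are imported; iris serves as a stand-in DataFrame and model_ is an
# illustrative name.
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
data = iris.data
model_ = tree.DecisionTreeClassifier(max_depth=2).fit(data, iris.target)
paths = get_decision_paths(model_, data, selection=pd.Series(np.ones(len(data))))
for p in paths:
    print(" AND ".join(p))
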
def get_leaf(train_x, train_y, val_x):
    from sklearn.tree import DecisionTreeClassifier
    train_x, train_y, val_x = map(np.array, [train_x, train_y, val_x])
    train_x = train_x.reshape(-1, 1)
    train_y = train_y.reshape(-1, 1)
    val_x = val_x.reshape(-1, 1)
    m = DecisionTreeClassifier(min_samples_leaf=0.001, max_leaf_nodes=25)
    m.fit(train_x, train_y)
    # return the leaf index each validation sample lands in
    return m.apply(val_x)

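# Illustrative call for get_leaf: bucket a single numeric feature by the
# tree's leaves. The arrays below are synthetic stand-ins.
train_x = np.random.randn(10000)
train_y = (train_x + np.random.randn(10000) > 0).astype(int)
val_leaves = get_leaf(train_x, train_y, val_x=np.random.randn(100))
print(np.unique(val_leaves))  # at most 25 distinct leaf ids
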
def fit_preproc(self, x_train, y_train):
    tree_list = []
    ohe_list = []
    var_names = []
    n_predictors = (self.n_predictors if self.n_predictors is not None
                    else x_train.shape[1])
    self.n_predictors = n_predictors  # number of predictors considered
    if isinstance(x_train, pd.DataFrame):
        self.initial_var_names = x_train.columns
    # fit one small tree per pair of variables (i, j)
    for i in range(n_predictors):
        for j in range(i + 1, n_predictors):
            # input data with two variables (columns)
            x_input = x_train.iloc[:, [i, j]]
            tree_t = DecisionTreeClassifier(max_depth=2,
                                            max_leaf_nodes=3).fit(x_input, y_train)
            # leaf index of each sample under this tree
            x_processed = tree_t.apply(x_input)
            # one-hot encode the leaf indices
            ohe = OneHotEncoder().fit(x_processed.reshape((-1, 1)))
            # set variable names, for interpretability
            c_names = [str(i), str(j)]
            categ = ohe.categories_[0]
            feature = tree_t.tree_.feature
            feature_used = feature[feature >= 0]
            if len(feature_used) > 1:
                one = feature_used[0]
                two = feature_used[1]
                # first name is the variable used at the root, the second
                # is the couple of variables used
                var = [c_names[one],
                       '(' + c_names[one] + ',' + c_names[two] + ')']
            elif len(feature_used) == 1:
                var = [c_names[feature_used[0]]]
            else:
                # both variables are useless
                var = ['ij']
            # record trees and encoders
            tree_list.append(tree_t)
            ohe_list.append(ohe)
            var_names.append(var)
    self.tree_list = tree_list
    self.ohe_list = ohe_list
    self.var_names = var_names
    return self

def binning_opt(X, y, depth):
    var_name = X.name
    df = X.to_frame()
    target_df = y.to_frame()
    # split off rows where the variable is missing; they are binned as 'Null'
    missing_df = df[df[var_name].isnull()]
    df = df.merge(missing_df, left_index=True, right_index=True,
                  how='outer', indicator=True)
    df = df[df['_merge'] == 'left_only']
    df.drop([var_name + '_y', '_merge'], axis=1, inplace=True)
    df.rename(columns={var_name + '_x': var_name}, inplace=True)
    target_df = target_df.merge(missing_df, left_index=True, right_index=True,
                                how='outer', indicator=True)
    target_df = target_df[target_df['_merge'] == 'left_only']
    target_df.drop([var_name, '_merge'], axis=1, inplace=True)
    # fit a shallow tree on the single variable; its leaves define the bins
    dt = DecisionTreeClassifier(max_features=1, max_depth=depth,
                                min_samples_leaf=0.1)
    dt.fit(df, target_df)
    df['nodo'] = dt.apply(df)
    df['nodo'] = df['nodo'].astype(str)
    # derive bin edges from each leaf's min/max, closing the gaps between bins
    bins = df.groupby('nodo').agg(['min', 'max'])[var_name]
    bins.sort_values('min', inplace=True)
    bins['min2'] = bins['max'].shift(1)
    bins['min'] = np.where(bins['min2'].isnull(), bins['min'], bins['min2'])
    bins.reset_index(inplace=True)
    bins['id'] = (bins.index + 1).map(lambda x: '0' + str(x) if x < 10 else str(x))
    bins['C_' + var_name] = (bins['id'] + '. (' + bins['min'].astype(str) +
                             ", " + bins['max'].astype(str) + "]")
    bins = bins[['nodo', 'C_' + var_name]]
    df['in'] = df.index
    df = df.merge(bins, how='inner', on='nodo')
    df.index = df['in']
    df.sort_index(inplace=True)
    df.drop(['in', 'nodo'], axis=1, inplace=True)
    missing_df['C_' + var_name] = 'Null'
    missing_df['in'] = missing_df.index
    missing_df.index = missing_df['in']
    missing_df.drop('in', axis=1, inplace=True)
    df = pd.concat([df, missing_df]).sort_index()
    df.index.name = None
    return df['C_' + var_name]

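# A hedged usage sketch for binning_opt: X is a named Series (possibly with
# NaNs) and y the binary target. The names below are illustrative.
score = pd.Series(np.random.randn(1000), name="score")
score.iloc[:50] = np.nan  # missing values end up in the 'Null' bin
bad = pd.Series(np.random.randint(0, 2, 1000), name="bad")
binned = binning_opt(score, bad, depth=2)
print(binned.value_counts())
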
def trainModel(data, features, label):
    """
    Build models three ways: logistic regression, decision tree, and
    decision tree + logistic regression.
    :param data:
    :return:
    """
    res = {}
    trainData, testData = train_test_split(data, test_size=0.5)
    # logistic regression alone
    logitModel = LogisticRegression()
    logitModel.fit(trainData[features], trainData[label])
    logitProb = logitModel.predict_proba(testData[features])[:, 1]
    res["logit"] = roc_curve(testData[label], logitProb)
    # decision tree alone
    dtModel = DecisionTreeClassifier(max_depth=2)
    dtModel.fit(trainData[features], trainData[label])
    dtProb = dtModel.predict_proba(testData[features])[:, 1]
    res["DT"] = roc_curve(testData[label], dtProb)
    # decision tree + logistic regression, trained on disjoint halves
    trainDT, trainLR = train_test_split(trainData, test_size=0.5)
    m = 2
    _dt = DecisionTreeClassifier(max_depth=2)
    _dt.fit(trainDT[features[:m]], trainDT[label])
    leafNode = _dt.apply(trainDT[features[:m]]).reshape(-1, 1)
    coder = OneHotEncoder()
    coder.fit(leafNode)
    newFeature = np.c_[
        coder.transform(_dt.apply(trainLR[features[:m]]).reshape(-1, 1)).toarray(),
        trainLR[features[m:]]]
    _logit = LogisticRegression()
    _logit.fit(newFeature[:, 1:], trainLR[label])
    testFeature = np.c_[
        coder.transform(_dt.apply(testData[features[:m]]).reshape(-1, 1)).toarray(),
        testData[features[m:]]]
    dtLogitProb = _logit.predict_proba(testFeature[:, 1:])[:, 1]
    res["DT + logit"] = roc_curve(testData[label], dtLogitProb)
    return res

def extract_table(cls, N: np.ndarray, y: np.ndarray,
                  model: DecisionTreeClassifier) -> np.ndarray:
    """Precompute ``model``, ``table`` and ``tree_depth``.

    Parameters
    ----------
    N : :obj:`np.ndarray`
        Attributes from fitted data.
    y : :obj:`np.ndarray`
        Target attribute from fitted data.
    model : :obj:`DecisionTreeClassifier`
        Fitted decision tree from which the node table is extracted.

    Returns
    -------
    :obj:`np.ndarray`
        Tree property table.

        - Each line represents a node.
        - Column 0: id of the attribute split in that node.
        - Column 1: 1 if the node is a leaf node, otherwise 0.
        - Column 2: number of examples that fall on that node.
        - Column 3: 0 if the node is not a leaf, otherwise the class
          number represented by that leaf node.
    """
    table = np.zeros((model.tree_.node_count, 4))  # type: np.ndarray
    table[:, 0] = model.tree_.feature
    table[:, 2] = model.tree_.n_node_samples
    leaves = model.apply(N)  # type: np.ndarray
    if not isinstance(y, np.number):
        _, y = np.unique(y, return_inverse=True)
    tmp = np.array([leaves, y + 1])  # type: np.ndarray
    for x in set(leaves):
        # class recorded for leaf x (first key of the per-leaf class counter)
        table[x, 3] = list(Counter(tmp[1, tmp[0, :] == x]).keys())[0] + 1
        table[x, 1] = 1
    return table

def beginWork(home, logger):
    constant = CONSTANT(home)
    logger.info("========== get subgroup ==========".center(CONSTANT.logLength, "="))
    data, featureName = loadData(constant.getDataFilteredPath(), logger)
    X, Y = data[:, :-1], data[:, -1]
    size = X.shape[0]
    subgroupSizes = [4, 8, 16]
    cartModelDir = constant.getCartModelDir()
    shutil.rmtree(cartModelDir)
    os.makedirs(cartModelDir)
    for subgroupSize in subgroupSizes:
        curCartModelDir = os.path.join(cartModelDir, "subgroup-" + str(subgroupSize))
        os.makedirs(curCartModelDir)
        dotFilePath = os.path.join(curCartModelDir, "dot.dot")
        paramsTxtPath = os.path.join(curCartModelDir, "params.txt")
        subgroupsSavedPath = os.path.join(curCartModelDir, "subgroups.pkl")
        subgroups = {}
        avgSize = size // subgroupSize
        param = {
            "criterion": "gini",
            "min_samples_leaf": avgSize * 2 // 3,
            "min_samples_split": avgSize * 2
        }
        clf = DecisionTreeClassifier(**param).fit(X, Y)
        # save the tree structure & hyperparameters
        with open(dotFilePath, "w", encoding="utf-8") as file:
            export_graphviz(clf, out_file=file, feature_names=featureName,
                            filled=True, rounded=True, special_characters=True)
        logger.info("cart structure saved in {}".format(dotFilePath))
        with open(paramsTxtPath, "w", encoding='utf-8') as file:
            file.write(str(clf.get_params()))
        logger.info("cart params saved in {}".format(paramsTxtPath))
        logger.info(param)
        # obtain and save the subgroups: samples sharing a leaf form a subgroup
        itemIndex = clf.apply(X)
        for sampleIndex, groupIndex in enumerate(itemIndex):
            if subgroups.get(groupIndex) is None:
                subgroups[groupIndex] = []
            subgroups[groupIndex].append(sampleIndex)
        with open(subgroupsSavedPath, "wb") as file:
            pickle.dump(subgroups, file)
        logger.info("the number of subgroups: {}".format(len(subgroups)))
        logger.info("subgroup data saved in {}".format(subgroupsSavedPath))
        logger.info("{0}{1}".format("subgroup index".center(20),
                                    "subgroup size".center(20)))
        for subgroupName, subgroup in subgroups.items():
            logger.info("{0}{1}".format(str(subgroupName).center(20),
                                        str(len(subgroup)).center(20)))
    logger.info("==================================".center(CONSTANT.logLength, "="))

def treeBinning(self, trainN, i=6, column="column", cutPoint=0.0):
    tree = DecisionTreeClassifier(
        max_leaf_nodes=i,
        min_samples_leaf=int(np.rint(trainN.shape[0] * 0.06)))
    # values <= cutPoint (<= 0 by default) are treated as special and binned separately
    X_select = pd.DataFrame(trainN[trainN[column] > cutPoint][column])
    tree.fit(X_select, self.y_train[X_select.index])
    X_select["Node"] = tree.apply(X_select)
    # DataFrame.append was removed in pandas 2.0; concatenate instead
    X_select = pd.concat([
        X_select,
        pd.DataFrame({
            column: trainN[trainN[column] <= cutPoint][column],
            "Node": -1
        })
    ])
    X_select = pd.concat([X_select, self.y_train], axis=1)
    test = pd.concat([
        X_select.pivot_table(index="Node", values=column,
                             aggfunc=[np.min, np.max], margins=True),
        X_select.pivot_table(index="Node", values=self.target,
                             aggfunc=[np.sum, len], margins=True)
    ], axis=1)
    test.rename(columns={"sum": "badCnt", "len": "totalCnt"}, inplace=True)
    test["goodCnt"] = test["totalCnt"] - test["badCnt"]
    test["popPercentage"] = test["totalCnt"] / test.loc["All", "totalCnt"] * 100
    test["badRate"] = test["badCnt"] / test["totalCnt"] * 100
    test.columns = ["amin", "amax", "badCnt", "totalCnt", "goodCnt",
                    "popPercentage", "badRate"]
    test = test.sort_values(by="amax")
    test["badSign"] = np.sign(test.badRate - test.badRate.shift(-1))
    test.iloc[test.shape[0] - 2, 7] = np.nan
    test["badDistribution"] = test["badCnt"] / test.loc["All", "badCnt"]
    test["goodDistribution"] = test["goodCnt"] / test.loc["All", "goodCnt"]
    test["distributedGoodBad"] = test["goodDistribution"] - test["badDistribution"]
    test["WOE"] = np.log(test["goodDistribution"] / test["badDistribution"])
    test["IV"] = test["WOE"] * test["distributedGoodBad"] * 100
    test.loc["All", "IV"] = np.sum(test["IV"])
    test["column"] = column
    return test[["amin", "amax", "popPercentage", "IV", "badRate",
                 "badSign", "column"]]

def train_model(data, features, label):
    """
    Build models with logistic regression, a decision tree, and
    decision tree + logistic regression, respectively.
    """
    res = {}
    train_data, test_data = train_test_split(data, test_size=0.5)
    # logistic regression alone
    logit_model = LogisticRegression()
    logit_model.fit(train_data[features], train_data[label])
    logit_prob = logit_model.predict_proba(test_data[features])[:, 1]
    res["logit"] = roc_curve(test_data[label], logit_prob)
    # decision tree alone
    dt_model = DecisionTreeClassifier(max_depth=2)
    dt_model.fit(train_data[features], train_data[label])
    dt_prob = dt_model.predict_proba(test_data[features])[:, 1]
    res["DT"] = roc_curve(test_data[label], dt_prob)
    # combine the decision tree with logistic regression;
    # to avoid overfitting, train them on different halves of the data
    train_DT, train_LR = train_test_split(train_data, test_size=0.5)
    # use the decision tree to transform the first two variables
    m = 2
    _dt = DecisionTreeClassifier(max_depth=2)
    _dt.fit(train_DT[features[:m]], train_DT[label])
    leaf_node = _dt.apply(train_DT[features[:m]]).reshape(-1, 1)
    coder = OneHotEncoder()
    coder.fit(leaf_node)
    new_feature = np.c_[coder.transform(
        _dt.apply(train_LR[features[:m]]).reshape(-1, 1)).toarray(),
        train_LR[features[m:]]]
    _logit = LogisticRegression()
    _logit.fit(new_feature[:, 1:], train_LR[label])
    test_feature = np.c_[coder.transform(
        _dt.apply(test_data[features[:m]]).reshape(-1, 1)).toarray(),
        test_data[features[m:]]]
    dt_logit_prob = _logit.predict_proba(test_feature[:, 1:])[:, 1]
    res["DT + logit"] = roc_curve(test_data[label], dt_logit_prob)
    return res

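# A short sketch of consuming train_model's output: each entry of res is a
# (fpr, tpr, thresholds) triple from roc_curve, ready for plotting. The
# matplotlib usage is illustrative and assumes data/features/label exist.
import matplotlib.pyplot as plt

res = train_model(data, features, label)
for name, (fpr, tpr, _) in res.items():
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()
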
class TreeClassificationTransformer(BaseTransformer):
    def __init__(self, kwargs={}):
        """Stores the decision tree parameters."""
        self.kwargs = kwargs
        self._is_fitted = False

    def fit(self, X, y):
        """Fits the transformer to data X with labels y."""
        X, y = check_X_y(X, y)
        # define the tree
        self.transformer = DecisionTreeClassifier(**self.kwargs).fit(X, y)
        self._is_fitted = True
        return self

    def transform(self, X):
        """Performs inference using the transformer."""
        if not self.is_fitted():
            msg = (
                "This %(name)s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this transformer."
            )
            raise NotFittedError(msg % {"name": type(self).__name__})
        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """Indicates whether the transformer is fitted."""
        return self._is_fitted

def countDCP(self, df, classes, minimum_impurity_split):
    values = np.empty([0, 0])
    # note: min_impurity_split is deprecated and was removed in scikit-learn 1.0
    estimator = DecisionTreeClassifier(min_impurity_split=minimum_impurity_split)
    estimator.fit(df, classes)
    leafsIndexes = estimator.apply(df, check_input=True)
    # for each instance, the negated fraction of same-class instances among
    # the instances sharing its leaf (assumes a 0..n-1 integer index)
    for index, _ in df.iterrows():
        suma = 0
        value = 0
        for leafIndex, _ in df.iterrows():
            if leafsIndexes[index] == leafsIndexes[leafIndex]:
                suma += 1
                if classes[index] == classes[leafIndex]:
                    value += 1
        values = np.append(values, np.full((1, 1), (value / suma) * -1))
        # print("Count DCP for " + repr(index) + ". row of data.")
    return values

def learnTrees_and_return_segments(depth):
    global dt
    global features
    global targets
    features = list(df.columns)
    target_feature = features[-1]
    features = list(features[:len(features) - 1])
    targets = df[target_feature].unique()
    print('targets:', targets)
    print('features:', features)
    y = df[target_feature]
    X = df[features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0,
                                                        random_state=0)
    # dt = DecisionTreeRegressor(max_depth=depth)  # , min_samples_split=20, random_state=99)
    dt = DecisionTreeClassifier(max_depth=depth)  # , min_samples_split=20, random_state=99)
    dt.fit(X_train, y_train)
    prediction = dt.predict(X_train)
    print('accuracy:', accuracy_score(y_train, prediction))
    print('recall:', recall_score(y_train, prediction, average=None))
    print('classification report:')
    print(classification_report(y_train, prediction))
    print('num correct:', accuracy_score(y_train, prediction) * len(y_train))
    print('num incorrect:', (1 - accuracy_score(y_train, prediction)) * len(y_train))
    print('R2 Score:', r2_score(y_train, prediction))
    print('absolute error:', mean_absolute_error(y_train, prediction) * len(X_train))
    # group sample indices by the leaf they fall into
    app = dt.apply(X)
    uni = np.unique(app)
    segments_set = [[[]] for i in uni]
    for i in range(len(app)):
        index = int(np.where(uni == app[i])[0])
        segments_set[index][0].append(i)
    # segments_set = [copy.copy(segments_set) for i in uni]
    return targets, segments_set, mean_absolute_error(y_train, prediction) * len(X_train)

def fit(X0, W0, X1, W1, **kwargs):
    X = np.concatenate([as_features(X0), as_features(X1)])
    Y = np.array([0] * W0.size + [1] * W1.size)
    W = np.concatenate([W0, W1])
    T = DecisionTreeClassifier(class_weight="balanced", **kwargs)
    T.fit(X, Y, sample_weight=W)
    _, *shape = X0.shape
    tree = T.tree_
    feature = [
        np.unravel_index(f, shape) if f >= 0 else None for f in tree.feature
    ]
    leaf = T.apply(X)
    # per-leaf half log-odds of the weighted class masses, smoothed by 1e-3
    pred = np.empty(tree.node_count)
    for n in range(tree.node_count):
        mask = leaf == n
        w0 = (W * mask * (Y == 0)).sum() + 1e-3
        w1 = (W * mask * (Y == 1)).sum() + 1e-3
        pred[n] = np.log(w1 / w0) / 2
    return DTree(feature, tree.threshold, tree.children_left,
                 tree.children_right, pred)

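# DTree is not defined in this snippet; a minimal stand-in consistent with
# the constructor call above could be a namedtuple (an assumption, not the
# original definition).
from collections import namedtuple

DTree = namedtuple(
    "DTree", ["feature", "threshold", "children_left", "children_right", "pred"])
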
def countDS(self, df, classes):
    values = np.empty([0, 0])
    estimator = DecisionTreeClassifier()
    estimator.fit(df, classes)
    leafsIndexes = estimator.apply(df, check_input=True)
    leafs = np.zeros(estimator.tree_.node_count)
    # count number of instances for every leaf
    for leafIndex in leafsIndexes:
        leafs[leafIndex] += 1
    biggestDisjunct = max(leafs) - 1
    # count fraction for every instance
    for leafIndex in leafsIndexes:
        values = np.append(
            values,
            np.full((1, 1), ((leafs[leafIndex] - 1) / biggestDisjunct) * -1))
    return values

def tree_bins_func(self, grps=None, pct_size=None):
    """
    Entropy-based binning with a decision tree.
    1. max_grps caps the number of bins;
    2. pct_size sets the minimum sample share per bin.
    """
    tmp = self.raw.copy().dropna()
    if pct_size is None:
        smp_size = int(len(tmp) * self.argms['pct_size']) + 1
    else:
        smp_size = int(len(tmp) * pct_size) + 1
    if grps is None:
        grps = self.argms['max_grps']
    # when one value dominates the feature, don't split further: keep 2 bins;
    # the decision tree is the basic binning tool
    clf = DecisionTreeClassifier(min_samples_leaf=smp_size, max_leaf_nodes=grps)
    clf.fit(tmp[[self.ft_name]], tmp['label'])
    tmp['grp_prd'] = clf.apply(tmp[[self.ft_name]])
    grp_info = tmp.groupby('grp_prd').min()
    grp_info.sort_values(self.ft_name, inplace=True, ascending=True)
    cuts = list(grp_info[self.ft_name]) + [tmp[self.ft_name].max() + 1]
    cuts = self._smpSizeCheck_real(tmp, cuts, smp_size)
    self.bins = {self.ft_name: cuts}
    self.cap_info = {
        'max': tmp[self.ft_name].max(),
        'min': tmp[self.ft_name].min()
    }
    if len(cuts) == 2:
        self.woe_check = {
            self.ft_name: 'tree_bins_func_failed!-value biased'
        }
    else:
        self.woe_check = {}

def IV(df, var, target, n_levels_to_factor_threshold=5,
       calc_type='Categorical', Min_Category_Share=0.05, nbins=10):
    # discretize numeric features
    if is_numeric(df[var]) and len(df[var].unique()) > n_levels_to_factor_threshold:
        if calc_type == 'Interval':
            df = pd.DataFrame({
                var: qcut(df[var], q=nbins, duplicates='drop'),
                target: df[target]
            })
        elif calc_type == 'Categorical':
            # note: the presort argument was removed in scikit-learn 0.24
            tree = DecisionTreeClassifier(
                criterion='entropy',
                min_samples_leaf=int(df.shape[0] * Min_Category_Share),
                presort=True,
                random_state=1223)
            tree.fit(df.loc[~df[var].isna(), [var]],
                     df.loc[~df[var].isna(), target])
            # each leaf becomes one category; missing values become 'NA'
            tmp = tree.apply(df.loc[~df[var].isna(), [var]]).astype(str)
            tmp = ['leaf_' + e for e in tmp]
            tmp2 = ~df[var].isna()
            df = pd.DataFrame({var: ['NA'] * df.shape[0], target: df[target]})
            df.loc[tmp2, var] = tmp
    # calculate IV
    rez = WoE_full(df, var, target)
    rez = sum(rez['WoE'] * (rez['GR'] - rez['BR']))
    # return result
    return rez

OUT_FOLDER = "data/COMPAS/holdout/recidivism_%s.csv"

df = pd.read_csv("data/COMPAS/recidivism.csv")
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.165,
                                                    random_state=69)

clf = DecisionTreeClassifier(random_state=0, min_samples_leaf=300)
clf.fit(X, y)

# group train and test row indices by the leaf they fall into
l_train = clf.apply(X_train)
l_test = clf.apply(X_test)
l_indexes = np.unique(l_train)
clusters = []
for l_index in l_indexes:
    c_train_indexes = np.where(l_train == l_index)
    c_test_indexes = np.where(l_test == l_index)
    cluster = (l_index, c_train_indexes[0], c_test_indexes[0])
    clusters.append(cluster)
clusters.sort(key=lambda x: len(x[1]))


def store_data(outfile, X, y):
    df = pd.concat([X, y], axis=1)
    df.to_csv(outfile)

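# One plausible way to use store_data with OUT_FOLDER (an assumption; the
# original loop is not shown): write each leaf cluster's training rows to
# its own CSV keyed by the leaf id. Test rows could be stored analogously.
for leaf_id, tr_idx, te_idx in clusters:
    store_data(OUT_FOLDER % leaf_id, X_train.iloc[tr_idx], y_train.iloc[tr_idx])
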
            feature[i],
            threshold[i],
            children_right[i],
        ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.
node_indicator = tree.decision_path(X)

# Similarly, we can also have the leaves ids reached by each sample.
leave_id = tree.apply(X)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.
# sample_id = 0
# node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
#                                     node_indicator.indptr[sample_id + 1]]
#
# print('Rules used to predict sample %s: ' % sample_id)
# for node_id in node_index:
#     if leave_id[sample_id] == node_id:
#         continue
#
#     if (X[sample_id, feature[node_id]] <= threshold[node_id]):
#         threshold_sign = "<="

clf.fit(X_train, y_train)

print("showing prediction results (first 5) [1 = infected, 0 = non infected]:")
y_pred = clf.predict(X_test)
print(y_pred[:5])

a = X_test[:1]
print("a:")
print(a)
# print('sk_pred: {}'.format(clf.predict(a)))
# print('true: {}'.format(y_test[:3]))

# shows the end point of the tree traversal for a sample
print("Returns the index of the leaf that each sample is predicted as:")
index_of_leaf = clf.apply(a)
print(index_of_leaf)

# the decision path shows the nodes of the tree traversed by the sample
print("decision path:")
d_path = clf.decision_path(a)
print(d_path)
print("nodes in the decision path:")
n_d_path = np.unique(np.sort(d_path.indices))
print(n_d_path)

print("probability of each class:")
print(clf.predict_proba(a))

print("Feature importances:")

class TreeClassificationTransformer(BaseTransformer):
    """
    A class used to transform data from a category to a specialized
    representation.

    Attributes
    ----------
    kwargs : dict
        A dictionary to contain parameters of the tree.
    _is_fitted_ : bool
        A boolean to identify if the model is currently fitted.

    Methods
    -------
    fit(X, y)
        Fits the transformer to data X with labels y.
    transform(X)
        Performs inference using the transformer.
    is_fitted()
        Indicates whether the transformer is fitted.
    """

    def __init__(self, kwargs={}):
        self.kwargs = kwargs
        self._is_fitted = False

    def fit(self, X, y):
        """
        Fits the transformer to data X with labels y.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        y : ndarray
            Output (i.e. response) data matrix.
        """
        X, y = check_X_y(X, y)
        # define the tree
        self.transformer = DecisionTreeClassifier(**self.kwargs).fit(X, y)
        self._is_fitted = True
        return self

    def transform(self, X):
        """
        Performs inference using the transformer.

        Parameters
        ----------
        X : ndarray
            Input data matrix.
        """
        if not self.is_fitted():
            msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
                   "appropriate arguments before using this transformer.")
            raise NotFittedError(msg % {"name": type(self).__name__})
        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """
        Indicates whether the transformer is fitted.
        """
        return self._is_fitted

# ``decision_path`` method outputs an indicator matrix that allows us to
# retrieve the nodes the samples of interest traverse through. A non zero
# element in the indicator matrix at position ``(i, j)`` indicates that
# the sample ``i`` goes through the node ``j``. Or, for one sample ``i``, the
# positions of the non zero elements in row ``i`` of the indicator matrix
# designate the ids of the nodes that sample goes through.
#
# The leaf ids reached by samples of interest can be obtained with the
# ``apply`` method. This returns an array of the node ids of the leaves
# reached by each sample of interest. Using the leaf ids and the
# ``decision_path`` we can obtain the splitting conditions that were used to
# predict a sample or a group of samples. First, let's do it for one sample.
# Note that ``node_indicator`` is a sparse matrix.

node_indicator = clf.decision_path(X_test)
leaf_id = clf.apply(X_test)

sample_id = 0
# obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]

print("Rules used to predict sample {id}:\n".format(id=sample_id))
for node_id in node_index:
    # continue to the next node if it is a leaf node
    if leaf_id[sample_id] == node_id:
        continue

    # check if value of the split feature for sample 0 is below threshold
    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:

@author: Samruddhi Somani
"""

# execfile is Python 2 only; the Python 3 equivalent is exec(open(...).read())
exec(open('Original.py').read())
exec(open('tfidf.py').read())

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# fitting/examining tree
s2 = DecisionTreeClassifier(max_depth=3, random_state=5)
s2.fit(x, cuisine)
leaves2 = pd.Series(s2.apply(x), name='leaves')
idk = pd.concat([cuisine, leaves2], axis=1)
m = list(leaves2.value_counts().index.values)  # [3, 6, 10, 4, 7, 13, 11, 14]
for y in m:
    print(y)
    print(idk[leaves2 == y]['cuisine'].value_counts())
leaves2.value_counts()

# ==============================================================================
#  3    33684: SGD SVM
#  6     2991: SGD SVM
#  10    1848: Naive Bayes/Logistic Regression
#  4      914: Naive Bayes/Logistic Regression
#  7      300: Logistic Regression
#  13      20: Naive Bayes
#  11      14: Naive Bayes

            feature[i],
            threshold[i],
            children_right[i],
        ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    # skip the leaf itself: it carries no test (with "!=" every split node
    # would be skipped and the leaf's undefined feature id evaluated instead)
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="

# -*- coding: utf-8 -*-

# execfile is Python 2 only; the Python 3 equivalent is exec(open(...).read())
exec(open('Original.py').read())
exec(open('tfidf.py').read())

from sklearn.tree import DecisionTreeClassifier

s = DecisionTreeClassifier(max_depth=2, random_state=5)
s.fit(x, cuisine)
leaves = pd.Series(s.apply(x), name='leaves')
idk = pd.concat([cuisine, leaves], axis=1)
m = list(leaves.value_counts().index.values)
for y in m:
    print(y)
    print(idk[leaves == y]['cuisine'].value_counts())
leaves.value_counts()

s2 = DecisionTreeClassifier(max_depth=3, random_state=5)
s2.fit(x, cuisine)
leaves2 = pd.Series(s2.apply(x), name='leaves')
idk = pd.concat([cuisine, leaves2], axis=1)
m = list(leaves2.value_counts().index.values)
for y in m:
    print(y)
    print(idk[leaves2 == y]['cuisine'].value_counts())
leaves2.value_counts()

s3 = DecisionTreeClassifier(max_leaf_nodes=8, random_state=5, criterion='entropy')
s3.fit(x, cuisine)

        print(
            "%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
            "node %s."
            % (
                node_depth[i] * "\t",
                i,
                children_left[i],
                feature[i],
                threshold[i],
                children_right[i],
            )
        )
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]

print("Rules used to predict sample %s: " % sample_id)
for node_id in node_index:
    # skip the leaf itself: it carries no test
    if leave_id[sample_id] == node_id:
        continue

    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:

    # divide each column by its column sum
    df_new = df.div(df.sum(axis=0), axis='columns').fillna(0)
    return df_new


def hmwrapper(cm, filename):
    h = heatmap(cm).get_figure()
    ax = h.add_subplot(111)
    ax.set_xlabel('Predictions')
    h.tight_layout()
    h.set_size_inches(8, 5.5)
    h.savefig(filename, bbox_inches='tight', dpi=100)


# fitting/examining tree
s2 = DecisionTreeClassifier(max_leaf_nodes=10, min_samples_leaf=500, random_state=5)
s2.fit(x, cuisine)
leaves2 = pd.Series(s2.apply(x), name='leaves')
idk = pd.concat([cuisine, leaves2], axis=1)
m = list(leaves2.value_counts().index.values)  # [3, 6, 10, 4, 7, 13, 11, 14]
for y in m:
    print(y)
    print(idk[leaves2 == y]['cuisine'].value_counts())
leaves2.value_counts()

# ==============================================================================
#  3    33684: SGD SVM
#  6     2991: SGD SVM
#  10    1848: Naive Bayes/Logistic Regression
#  4      914: Naive Bayes/Logistic Regression
#  7      300: Logistic Regression
#  13      20: Naive Bayes
#  11      14: Naive Bayes