def test_regress_forest():
    """Compare hand-computed leaf means with tree and forest predictions.

    First a single ``DecisionTreeRegressor`` is checked, then every tree of
    a random forest and finally the averaged forest output. Nothing is
    asserted; the Euclidean norm of each difference is only printed.
    """
    n_trees = 4
    data = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.1, random_state=0)

    def gap(reference, candidate):
        # Euclidean distance between two prediction vectors.
        return np.linalg.norm(reference - candidate)

    print('Single regression tree test : ')
    single_tree = DecisionTreeRegressor()
    single_tree.fit(X_train, y_train)
    sklearn_pred = single_tree.predict(X_test)
    train_paths = single_tree.decision_path(X_train)
    node_means, _ = get_node_means(train_paths, y_train)
    manual_pred = node_means[single_tree.apply(X_test)]
    print(f'Tree predictions diff :{gap(sklearn_pred, manual_pred)!r}')

    print('Regression Forest Test : ')
    forest = get_models('RandomForest', 'regress')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)

    manual_total = np.zeros(len(y_test))
    n_samples = X_train.shape[0]
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    for t, member in enumerate(forest):
        bootstrap_idx = _generate_sample_indices(member.random_state,
                                                 n_samples)
        member_pred = member.predict(X_test)
        print(f'Num nodes = {member.tree_.node_count!r}')
        # Columns of the forest-wide indicator that belong to tree ``t``.
        first, last = n_nodes_ptr[t], n_nodes_ptr[t + 1]
        member_paths = indicator[:, first:last]
        # NOTE(review): rows of ``member_paths`` follow X_train order while
        # the targets are re-ordered by the bootstrap indices; this mismatch
        # looks like the reason for the "#BUG#" message below -- confirm.
        node_means, _ = get_node_means(member_paths, y_train[bootstrap_idx])
        member_manual = node_means[member.apply(X_test)]
        print(f'Tree#{t!r}: Diff = {gap(member_pred, member_manual)!r}')
        manual_total += member_manual

    forest_pred = forest.predict(X_test)
    manual_forest_pred = manual_total / n_trees
    print('Forest predictions difference :'
          + repr(gap(forest_pred, manual_forest_pred)))
    print('#BUG#-->Trees in the forest dont match my tree predictions')
    return
def fit(self, X, rels, qids):
    """Fit ``self.num_trees`` regression trees on per-document lambdas.

    For every boosting round the lambdas/omegas are computed query by query
    with ``self._calc_results``, a depth-limited tree is fitted to the
    lambdas, one gamma per leaf is stored in ``self.gamma[m, leaf]`` and the
    running scores ``F`` are updated with ``self.lr * gamma``.

    :param X: feature matrix, one row per document
    :param rels: relevance labels, indexable by a boolean mask
    :param qids: query id of every row of ``X``
    """
    n_rows = np.shape(X)[0]
    F = np.zeros(n_rows)  # base model i.e., F(x_i) = o_i
    query_ids = np.unique(qids)
    for m in range(self.num_trees):
        print(f'building {m}-th tree...')
        # Results are written back through the query mask so that entry i
        # always belongs to row i of X, even when the rows of one query are
        # not contiguous or the query ids are not sorted (appending in
        # np.unique order silently misaligned them in that case).
        Lambda = np.zeros(n_rows)
        Omega = np.zeros(n_rows)
        for q in query_ids:
            in_query = np.asarray(q == qids)
            Lambda_q, Omega_q = self._calc_results(
                q, rels[in_query], F[in_query])
            Lambda[in_query] = Lambda_q
            Omega[in_query] = Omega_q

        tree = DecisionTreeRegressor(max_depth=self.max_depth)
        tree.fit(X, Lambda)
        self.trees.append(tree)

        leaves = tree.apply(X)  # get R_jm to which x_i maps
        for leaf in np.unique(leaves):
            # compute scalar gamma for this leaf; eps avoids division by 0
            in_leaf = (leaves == leaf)
            gamma = np.sum(Lambda[in_leaf]) / (np.sum(Omega[in_leaf])
                                               + self.eps)
            # save gamma
            self.gamma[m, leaf] = gamma
            # improve the model (the mask zeroes the update elsewhere)
            F += self.lr * in_leaf * gamma

        # evaluate current training NDCGs
        self.evaluate(X, rels, qids)
def decision_tree_regressor(X, y, labels):
    """Fit a depth-3 regression tree, print diagnostics and visualise it.

    For every sample whose estimate exceeds the target by more than 0.05
    (with estimate > 0.6 and target > 0) the counter of the leaf it falls in
    is incremented; all other samples are counted in a separate trailing
    slot of ``leaves_hash``.

    :param X: training features
    :param y: training targets, indexable sequence
    :param labels: forwarded to ``visualize_tree``
    :returns: predictions of the fitted tree on ``X``
    """
    regressor = DecisionTreeRegressor(max_depth=3)
    regressor.fit(X, y)
    estimates_z = regressor.predict(X)
    leaves = regressor.apply(X)

    # One counter per node id plus one extra trailing slot for the samples
    # that do not match the condition. Without the extra slot, index -1
    # aliased the counter of the highest leaf id and mixed both counts.
    leaves_hash = np.zeros(np.max(leaves) + 2)
    for leaf, estimate, target in zip(leaves, estimates_z, y):
        if (estimate - target) > 0.05 and estimate > 0.6 and target > 0:
            leaves_hash[leaf] += 1
        else:
            leaves_hash[-1] += 1

    # print() calls instead of Python 2 print statements, which are a
    # syntax error on Python 3.
    print(regressor.tree_.feature)
    print(regressor.tree_.threshold)
    print(leaves_hash)
    print(regressor.feature_importances_)
    visualize_tree(regressor.tree_, labels)
    return estimates_z
def __clustering(self, X, y=None):
    """Cluster ``X`` with the method named by ``self.cluster_method``.

    The clustering procedure of the Optimal Weighted Clustering Gaussian
    Process. This function should not be called externally.

    Depending on the method this sets ``self.cluster_label`` (hard labels)
    and/or ``self.cluster_labels_proba`` (soft memberships) and always sets
    ``self.clusterer``. The 'tree' and 'flame' methods overwrite
    ``self.n_cluster`` with the number of clusters actually found. An
    unrecognised method name leaves the instance untouched.

    :param X: samples to cluster
    :param y: targets, only used by the 'tree' method
    """
    if self.cluster_method == 'k-mean':
        clusterer = KMeans(n_clusters=self.n_cluster)
        clusterer.fit(X)
        self.cluster_label = clusterer.labels_
        self.clusterer = clusterer
    elif self.cluster_method == 'tree':
        print("Warning: specified clustering count might be overwritten")
        minsamples = int(len(X) / (self.n_cluster + 1))
        tree = DecisionTreeRegressor(random_state=0,
                                     min_samples_leaf=minsamples)
        tree.fit(X, y)
        labels = tree.apply(X)
        clusters = np.unique(labels)
        k = len(clusters)
        print("leafs:", k)
        self.n_cluster = k
        self.leaf_labels = clusters
        self.cluster_label = labels
        self.clusterer = tree
    elif self.cluster_method == 'random':
        r = self.n_sample % self.n_cluster
        # Floor division: a float here makes the list repetition below
        # raise TypeError on Python 3.
        m = (self.n_sample - r) // self.n_cluster
        self.cluster_label = array(
            list(range(self.n_cluster)) * m + list(range(r)))
        self.clusterer = None
        shuffle(self.cluster_label)
    elif self.cluster_method == 'GMM':
        # GMM from sklearn
        self.clusterer = GMM(n_components=self.n_cluster, n_iter=1000)
        self.clusterer.fit(X)
        self.cluster_labels_proba = self.clusterer.predict_proba(X)
        self.cluster_label = self.clusterer.predict(X)
    elif self.cluster_method == 'fuzzy-c-mean':
        # Fuzzy C-means
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
            X.T, self.n_cluster, 2, error=0.000005, maxiter=10000,
            init=None)
        self.clusterer = cntr  # save the centers for cmeans_predict
        self.cluster_labels_proba = np.array(u.T)
        self.cluster_label = np.array(np.argmax(u, axis=0))
    elif self.cluster_method == 'flame':
        # Flame clustering, files are attached
        print("Warning: specified clustering count will be overwritten "
              "with Flame")
        tempdata = X.astype(np.float32)
        N = len(tempdata)
        # Created once; the original created a second object and dropped
        # the first one without clearing it.
        flameobject = flame.Flame_New()
        flame.Flame_SetDataMatrix(flameobject, tempdata, 0)
        # knn is number of neighbours
        flame.Flame_DefineSupports(flameobject, self.flame_knn,
                                   self.flame_threshold)
        cso_count = flameobject.cso_count
        k = cso_count + 1  # !!! overwrite k here
        self.n_cluster = k
        print("clusters:", k)
        flame.Flame_LocalApproximation(flameobject, 500, 1e-6)
        self.cluster_labels_proba = flame.Print_Clusters(
            flameobject, (cso_count + 1) * N)
        flame.Flame_Clear(flameobject)
        self.cluster_labels_proba = self.cluster_labels_proba.reshape(
            (N, cso_count + 1))
        self.clusterer = None  # we need to assign something
def time_zone(self, var, correct):
    """Automatic detection of which files might have had their time zone
    incorrectly set.

    Not guaranteed to work, and certainly not with only a few input files.
    Based on computing the phase of the daily cycle by projection onto a
    single daily complex exponential. The correction amount is fixed for
    now to the 5 hour difference between LA and Greenland.

    :param var: the input :class:`~pandas.DataFrame` of the type
        constructed for the :attr:`.var` attribute
    :param correct: either ``True`` or ``False`` to indicate whether
        correction should be attempted, or a list of integer column indexes
        corresponding to the columns in :attr:`.var` which should be
        corrected
    :type correct: :obj:`bool` or :obj:`list`
    :returns: a corrected DataFrame with added :class:`~pandas.MultiIndex`
        level containing the time correction in hours
    :rtype: :class:`~pandas.DataFrame`

    """
    level_names = ['time_adj'] + var.columns.names
    if correct is False:
        # ``axis`` is passed by keyword: pandas 2 rejects it positionally.
        return pd.concat((var, ), axis=1, keys=[0], names=level_names)
    elif isinstance(correct, list):
        # Select the listed *columns*; ``var.iloc[:correct]`` used the list
        # as a row-slice bound, which raises TypeError.
        a = var.iloc[:, correct]
        b = var.drop(columns=a.columns)
    else:
        phase = var.apply(self.phase, axis=0)
        i = np.arange(var.shape[1]).reshape((-1, 1))
        # A tree limited to two leaves splits the column positions into two
        # groups; node ids 1 and 2 are the children of the root.
        tr = DecisionTreeRegressor(max_leaf_nodes=2).fit(i, phase)
        cl = tr.apply(i)
        a = var.iloc[:, cl == 1]
        b = var.iloc[:, cl == 2]
    a.index = a.index + pd.Timedelta(5, 'h')
    print(
        "\nThe following files' timestamps have been changed by 5 hours:\n"
    )
    for f in a.columns.get_level_values('file'):
        print(f)
    return pd.concat((a, b), axis=1, keys=[5, 0], names=level_names)
def get_linjie_matrix(x_train, y_train):
    """Build one weighted same-leaf matrix per target column.

    For every column of ``y_train`` a sequence of regression trees is fitted
    to the running residual. Each tree yields a 0/1 matrix whose entry
    (i, j) is 1 when samples i and j land in the same leaf; the matrices are
    combined with weights ``(total - w_k) / total`` where ``w_k`` is the
    summed absolute residual left after tree k.

    :returns: ``(matrices, cos_result)`` -- the list of combined matrices
        and the value of ``target_cos(y_train)``
    """
    matrices = []
    for col in range(y_train.shape[1]):
        residual = y_train[:, col].copy()
        settings = config.all_config[text_file]
        n_rounds = settings.get('adjacency_num')
        same_leaf_per_round = []
        weight_per_round = []
        for _ in range(n_rounds):
            cart = DecisionTreeRegressor(
                max_depth=settings.get('adjacency_cart_depth'))
            cart = cart.fit(x_train, residual)
            leaf_of = cart.apply(x_train)
            # Pairwise "same leaf" indicator via broadcasting.
            same_leaf = (leaf_of[:, None] == leaf_of[None, :]).astype(int)
            residual = residual - cart.predict(x_train)
            weight_per_round.append(sum(abs(residual)))
            same_leaf_per_round.append(same_leaf)

        total = np.sum(weight_per_round)
        combined = ((total - weight_per_round[0]) / total
                    * same_leaf_per_round[0])
        for weight, same_leaf in zip(weight_per_round[1:n_rounds],
                                     same_leaf_per_round[1:n_rounds]):
            combined += (total - weight) / total * same_leaf
        matrices.append(combined)

    cos_result = target_cos(y_train)
    return matrices, cos_result
def train(joint_id, X, y, model_dir, min_samples_leaf=400, load_models=args.load_model):
    """Trains a regressor tree on the unit directions towards the joint.

    @params:
        joint_id : current joint id
        X : samples feature array (N x num_samples x num_feats)
        y : samples unit direction vectors (N x num_samples x 3)
        model_dir : directory the pickled regressor and L are read
            from / written to
        min_samples_leaf : minimum number of samples required at a leaf
        load_models : load trained models from disk (if exist)

    @returns:
        (regressor, L) : the fitted tree and the result of ``stochastic``
    """
    logger.debug('Start training %s model...', JOINT_NAMES[joint_id])

    regressor_path = os.path.join(model_dir,
                                  'regressor' + str(joint_id) + '.pkl')
    L_path = os.path.join(model_dir, 'L' + str(joint_id) + '.pkl')

    # Load saved model from disk
    if load_models and (os.path.isfile(regressor_path)
                        and os.path.isfile(L_path)):
        logger.debug('Loading model %s from files...',
                     JOINT_NAMES[joint_id])
        # NOTE: pickle executes arbitrary code on load; only point
        # model_dir at trusted files.
        with open(regressor_path, 'rb') as fh:
            regressor = pickle.load(fh)
        with open(L_path, 'rb') as fh:
            L = pickle.load(fh)
        return regressor, L

    # (N x num_samples, num_feats)
    X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2])
    # (N x num_samples, 3)
    y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2])

    # Count the number of valid (non-zero) samples
    valid_rows = np.logical_not(np.all(X_reshape == 0, axis=1))
    logger.debug('Model %s - Valid samples: %d / %d', JOINT_NAMES[joint_id],
                 X_reshape[valid_rows].shape[0], X_reshape.shape[0])

    # Fit decision tree to samples
    regressor = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
    regressor.fit(X_reshape[valid_rows], y_reshape[valid_rows])

    L = stochastic(regressor, X_reshape, y_reshape)

    # Print statistics on leafs
    leaf_ids = regressor.apply(X_reshape)
    leaf_counts = np.bincount(leaf_ids)
    # Node ids that received at least one sample, i.e. the leaves reached.
    occupied = np.flatnonzero(leaf_counts)
    biggest = np.argmax(leaf_counts)
    # Map the argmin over the non-empty counts back to a real node id; the
    # bare argmin is only a position inside the filtered array.
    smallest = occupied[np.argmin(leaf_counts[occupied])]
    total = np.sum(leaf_counts)

    logger.debug('Model %s - # Leaves: %d', JOINT_NAMES[joint_id],
                 occupied.shape[0])
    logger.debug('Model %s - Smallest Leaf ID: %d, # Samples: %d/%d',
                 JOINT_NAMES[joint_id], smallest, leaf_counts[smallest],
                 total)
    logger.debug('Model %s - Biggest Leaf ID: %d, # Samples: %d/%d',
                 JOINT_NAMES[joint_id], biggest, leaf_counts[biggest], total)
    logger.debug('Model %s - Average Leaf Size: %d', JOINT_NAMES[joint_id],
                 total / occupied.shape[0])

    # Save models to disk
    with open(regressor_path, 'wb') as fh:
        pickle.dump(regressor, fh)
    with open(L_path, 'wb') as fh:
        pickle.dump(L, fh)

    return regressor, L
def honestTree(self, treePredTrain, treeRespTrain, predTest):
    """Predict ``predTest`` with an "honest" tree.

    A random half of the training rows grows the tree; the other half
    supplies the per-leaf mean responses. Leaves that receive no sample
    from the second half are given the value 0.5.

    :returns: one predicted value per row of ``predTest``
    """
    n_rows = treePredTrain.shape[0]
    grow_rows = random.sample(range(n_rows), n_rows // 2)

    grow_X = treePredTrain[grow_rows, ...]
    grow_y = treeRespTrain[grow_rows]
    estimate_X = np.delete(treePredTrain, grow_rows, 0)
    estimate_y = np.delete(treeRespTrain, grow_rows, 0)

    tree = DecisionTreeRegressor(
        min_samples_split=2,
        min_samples_leaf=1,
        min_impurity_decrease=0.0001,
        random_state=self.randomState,
    )
    tree.fit(grow_X, grow_y)

    test_leaves = tree.apply(predTest)
    augmented = np.column_stack((predTest, test_leaves))

    # (leaf id, response) pairs of the estimation half.
    estimate_leaves = tree.apply(estimate_X)
    leaf_and_resp = np.column_stack((estimate_leaves, estimate_y))
    seen_leaves = np.unique(estimate_leaves)
    leaf_means = np.array([
        np.mean(leaf_and_resp[leaf_and_resp[..., 0] == leaf, 1])
        for leaf in seen_leaves
    ])
    table = np.column_stack((seen_leaves, leaf_means))

    # Leaves of the grown tree that the estimation half never reached.
    grown_leaves = np.unique(tree.apply(grow_X))
    if grown_leaves.size != seen_leaves.size:
        unseen = np.setdiff1d(grown_leaves, seen_leaves)
        filler = np.column_stack((unseen, np.full(unseen.size, 0.5)))
        table = np.vstack((table, filler))

    rows = np.array(
        [np.flatnonzero(table[..., 0] == leaf)[0] for leaf in test_leaves])
    augmented = np.column_stack((augmented, table[..., 1][rows]))
    return augmented[..., -1]
def test_leaf_node_kernel_matches_decision_tree():
    """Kernel-weighted targets must reproduce a single tree's predictions."""
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    model = DecisionTreeRegressor(max_depth=3, random_state=123)
    model = model.fit(features, target)
    leaf_column = model.apply(features).reshape(-1, 1)

    # predictions using tree kernel
    kernel = leaf_node_kernel(leaf_column)
    kernel /= kernel.sum(axis=1)
    kernel_prediction = np.dot(kernel, target)

    tree_prediction = model.predict(features)
    np.testing.assert_allclose(kernel_prediction, tree_prediction)
def trainModel(X, y, jointID, modelsDir, outDir, loadModels=False):
    """Train (or load) the regression tree and ``L`` for one joint.

    :param X: feature array, reshaped here from 3-D to
        (shape[0] * shape[1], shape[2])
    :param y: target array, reshaped the same way
    :param jointID: index into ``jointName``; also part of the file names
    :param modelsDir: sub-directory (appended to ``outDir``) for the pickles
    :param outDir: output root directory
    :param loadModels: when true and both pickles exist, load instead of
        training
    :returns: ``(regressor, L)``
    """
    regressor, L = None, None
    mkdir(outDir + modelsDir)

    regressorPath = outDir + modelsDir + '/regressor' + str(jointID) + '.pkl'
    LPath = outDir + modelsDir + '/L' + str(jointID) + '.pkl'

    if loadModels and os.path.isfile(regressorPath) and os.path.isfile(LPath):
        logger.debug('loading model %s from files...', jointName[jointID])
        # NOTE: pickle executes arbitrary code on load; only use trusted
        # model files.
        with open(regressorPath, 'rb') as fh:
            regressor = pickle.load(fh)
        with open(LPath, 'rb') as fh:
            L = pickle.load(fh)
    else:
        logger.debug('start training model %s...', jointName[jointID])
        regressor = DecisionTreeRegressor(min_samples_leaf=minSamplesLeaf)

        X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2])
        y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2])

        # Rows that are entirely zero are treated as invalid samples.
        rows = np.logical_not(np.all(X_reshape == 0, axis=1))
        regressor.fit(X_reshape[rows], y_reshape[rows])
        logger.debug('model %s - valid samples: %d/%d', jointName[jointID],
                     X_reshape[rows].shape[0], X_reshape.shape[0])

        leafIDs = regressor.apply(X_reshape)
        leafCounts = np.bincount(leafIDs)
        # Node ids that received at least one sample.
        occupied = np.flatnonzero(leafCounts)
        biggest = np.argmax(leafCounts)
        # Translate the argmin over non-empty counts back to a node id; the
        # bare argmin is only a position inside the filtered array.
        smallest = occupied[np.argmin(leafCounts[occupied])]
        total = np.sum(leafCounts)

        logger.debug('model %s - #leaves: %d', jointName[jointID],
                     occupied.shape[0])
        logger.debug('model %s - biggest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], biggest, leafCounts[biggest], total)
        logger.debug('model %s - smallest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], smallest, leafCounts[smallest],
                     total)
        logger.debug('model %s - average leaf size: %d', jointName[jointID],
                     total / occupied.shape[0])

        L = stochastic(regressor, X_reshape, y_reshape)

        with open(regressorPath, 'wb') as fh:
            pickle.dump(regressor, fh)
        with open(LPath, 'wb') as fh:
            pickle.dump(L, fh)

    return (regressor, L)
def _get_fitted_model(self, X, y):
    """Fit a ``DecisionTreeRegressor`` configured from this instance.

    Stores the fitted estimator in ``self.model_`` and the leaf index of
    every training row in ``self.train_leaf_indices_``.
    """
    hyper_parameters = (
        'criterion',
        'splitter',
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'min_weight_fraction_leaf',
        'max_features',
        'max_leaf_nodes',
        'min_impurity_decrease',
        'random_state',
        'ccp_alpha',
    )
    # Every constructor argument mirrors the attribute of the same name.
    settings = {name: getattr(self, name) for name in hyper_parameters}
    tree = DecisionTreeRegressor(**settings)
    self.model_ = tree.fit(X, y)
    self.train_leaf_indices_ = tree.apply(X)
def fit(self, X, y, verbose=False):
    """Fit ``self.n_estimators`` trees to the probability residuals.

    Starts from the log-odds / probability returned by
    ``self.initial_guess`` and, per round, fits a tree to
    ``observed - predicted``, derives the leaf outputs and refreshes the
    predictions. Trees and leaf outputs are appended to ``self.trees`` and
    ``self.tree_leaf_outputs``.

    :param verbose: when true, print the log-loss before each round
    """
    start_log_odds, start_probability = self.initial_guess(y)
    self.initial_prediction = start_log_odds

    sample_count = len(y)
    raw_scores = np.full((sample_count, ), start_log_odds)
    probabilities = np.full((sample_count, ), start_probability)

    # 1.0 for the first target class, 0.0 for everything else.
    positive = self.target_classes[0]
    targets = np.array([1.0 if label == positive else 0.0 for label in y])

    for round_no in range(1, self.n_estimators + 1):
        if verbose:
            log_likelihood = np.sum(
                targets * np.log(probabilities)
                + (1 - targets) * np.log(1 - probabilities))
            loss = -log_likelihood
            print(f"Building tree {round_no}, Loss: {loss}")

        # calculate the residuals
        residuals = targets - probabilities

        # fit a tree to the residuals, optionally on a sub-sample
        # (stochastic gradient boosting)
        learner = DecisionTreeRegressor(max_leaf_nodes=self.max_leaf_nodes,
                                        max_features=self.max_features)
        if self.subsample < 1.0:
            fit_X, fit_y = self.sub_sample(X, residuals, sample_count)
        else:
            fit_X, fit_y = X, residuals
        learner.fit(fit_X, fit_y)

        # leaf outputs are always computed over the full training set
        leaf_of_sample = learner.apply(X)
        outputs = self.calculate_leaf_outputs(residuals, leaf_of_sample,
                                              probabilities)
        self.trees.append(learner)
        self.tree_leaf_outputs.append(outputs)

        # make new prediction for each sample
        probabilities, raw_scores = self.calculate_new_predicitions(
            raw_scores, leaf_of_sample, outputs)
def trainModel(X, y, jointID, modelsDir, outDir, loadModels=False):
    """Train (or load) the regression tree and ``L`` for one joint.

    :param X: feature array, reshaped here from 3-D to
        (shape[0] * shape[1], shape[2])
    :param y: target array, reshaped the same way
    :param jointID: index into ``jointName``; also part of the file names
    :param modelsDir: sub-directory (appended to ``outDir``) for the pickles
    :param outDir: output root directory
    :param loadModels: when true and both pickles exist, load instead of
        training
    :returns: ``(regressor, L)``
    """
    regressor, L = None, None
    mkdir(outDir + modelsDir)

    regressorPath = outDir + modelsDir + '/regressor' + str(jointID) + '.pkl'
    LPath = outDir + modelsDir + '/L' + str(jointID) + '.pkl'

    if loadModels and os.path.isfile(regressorPath) and os.path.isfile(LPath):
        logger.debug('loading model %s from files...', jointName[jointID])
        # NOTE: pickle executes arbitrary code on load; only use trusted
        # model files.
        with open(regressorPath, 'rb') as fh:
            regressor = pickle.load(fh)
        with open(LPath, 'rb') as fh:
            L = pickle.load(fh)
    else:
        logger.debug('start training model %s...', jointName[jointID])
        regressor = DecisionTreeRegressor(min_samples_leaf=minSamplesLeaf)

        X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2])
        y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2])

        # Rows that are entirely zero are treated as invalid samples.
        rows = np.logical_not(np.all(X_reshape == 0, axis=1))
        regressor.fit(X_reshape[rows], y_reshape[rows])
        logger.debug('model %s - valid samples: %d/%d', jointName[jointID],
                     X_reshape[rows].shape[0], X_reshape.shape[0])

        leafIDs = regressor.apply(X_reshape)
        leafCounts = np.bincount(leafIDs)
        # Node ids that received at least one sample.
        occupied = np.flatnonzero(leafCounts)
        biggest = np.argmax(leafCounts)
        # Translate the argmin over non-empty counts back to a node id; the
        # bare argmin is only a position inside the filtered array.
        smallest = occupied[np.argmin(leafCounts[occupied])]
        total = np.sum(leafCounts)

        logger.debug('model %s - #leaves: %d', jointName[jointID],
                     occupied.shape[0])
        logger.debug('model %s - biggest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], biggest, leafCounts[biggest], total)
        logger.debug('model %s - smallest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], smallest, leafCounts[smallest],
                     total)
        logger.debug('model %s - average leaf size: %d', jointName[jointID],
                     total / occupied.shape[0])

        L = stochastic(regressor, X_reshape, y_reshape)

        with open(regressorPath, 'wb') as fh:
            pickle.dump(regressor, fh)
        with open(LPath, 'wb') as fh:
            pickle.dump(L, fh)

    return (regressor, L)
class TreeRegressionTransformer(BaseTransformer):
    """Transformer that maps samples to leaf indices of a regression tree."""

    def __init__(self, kwargs=None):
        """Store the keyword arguments for ``DecisionTreeRegressor``.

        Parameters
        ----------
        kwargs : dict, optional
            Forwarded unchanged to ``DecisionTreeRegressor`` in ``fit``.
            ``None`` (the default) means no arguments. A fresh dict is
            created per instance so instances never share a default dict.
        """
        self.kwargs = {} if kwargs is None else kwargs
        self._is_fitted = False

    def fit(self, X, y):
        """Validate ``X``/``y`` and fit the underlying regression tree.

        Returns
        -------
        self : TreeRegressionTransformer
        """
        X, y = check_X_y(X, y)

        # define the ensemble
        self.transformer = DecisionTreeRegressor(**self.kwargs).fit(X, y)
        self._is_fitted = True
        return self

    def transform(self, X):
        """Return the leaf index each row of ``X`` falls into.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if not self.is_fitted():
            msg = (
                "This %(name)s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this transformer."
            )
            raise NotFittedError(msg % {"name": type(self).__name__})

        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """Return ``True`` once ``fit`` has completed."""
        return self._is_fitted
def test_tree_regress():
    """Compare hand-computed leaf means with a single tree's predictions.

    Nothing is asserted; the Euclidean norm of the difference is printed.
    """
    data = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.1, random_state=0)

    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)

    # Per-node means derived from the training decision paths.
    train_paths = model.decision_path(X_train)
    node_means, _ = get_node_means(train_paths, y_train)

    sklearn_pred = model.predict(X_test)
    manual_pred = node_means[model.apply(X_test)]

    gap = np.linalg.norm(sklearn_pred - manual_pred)
    print(f'Tree predictions diff :{gap!r}')
    return
def fit(self, X, relevence, qid):
    """Fit ``self.num_trees`` regression trees on per-document lambdas.

    Per boosting round the lambdas/omegas are computed query by query with
    ``self._calculate_lambda``, a depth-limited tree is fitted to the
    lambdas, one gamma per leaf is stored in ``self.gamma[k, leaf]`` and the
    running scores are updated with ``self.lr * gamma``.

    :param X: feature matrix, one row per document
    :param relevence: relevance labels, indexable by a boolean mask
    :param qid: query id of every row of ``X``
    """
    n_rows = np.shape(X)[0]
    F = np.zeros(n_rows)
    eps = 0.000001  # keeps the gamma denominator away from zero
    query_ids = np.unique(qid)
    for k in range(self.num_trees):
        # Results are written back through the query mask so that entry i
        # always belongs to row i of X, even when the rows of one query are
        # not contiguous or the query ids are not sorted (appending in
        # np.unique order silently misaligned them in that case).
        lambda_arr = np.zeros(n_rows)
        omega_arr = np.zeros(n_rows)
        for unique_qid in query_ids:
            in_query = np.asarray(qid == unique_qid)
            qid_lambda, qid_omega = self._calculate_lambda(
                relevence[in_query], F[in_query], unique_qid)
            lambda_arr[in_query] = qid_lambda
            omega_arr[in_query] = qid_omega

        tree = DecisionTreeRegressor(max_depth=self.max_depth)
        tree.fit(X, lambda_arr)
        self.trees.append(tree)

        leaves = tree.apply(X)
        for leaf in np.unique(leaves):
            leaf_idx = (leaves == leaf)
            self.gamma[k, leaf] = np.sum(
                lambda_arr[leaf_idx]) / (np.sum(omega_arr[leaf_idx]) + eps)
            # the boolean mask zeroes the update outside this leaf
            F += self.lr * leaf_idx * self.gamma[k, leaf]
def train(X, y, ntrees = 10, alpha = 0.1, mode='gbdt', epoches=20):
    """
    Train the model.

    :param X: features
    :param y: labels
    :param ntrees: number of trees
    :param alpha: learning rate
    :param mode: learning mode, gbdt (first-order algorithm) or xgboost
        (second-order algorithm); not read by this function
    :param epoches: not read by this function
    :return: params, a list of ``(tree, leaf_val)`` tuples -- the fitted
        regression trees and their leaf values
    """
    # initialise f0
    f = np.log(1e-5 + (np.sum(y)/np.sum(1.0-y)))*np.ones((y.shape[0],))
    # initialise the residual with the sample values
    r = y
    # params keeps the built trees and their leaf values
    params = []
    for i in range(0, ntrees):
        # NOTE(review): the tree is fitted to the residual of the previous
        # round (``y`` itself in round 0) and only then is ``r`` refreshed;
        # confirm this ordering is intended.
        tmp_tree = DecisionTreeRegressor(max_depth=1)
        tmp_tree.fit(X, r)
        # compute the residual
        r = cal_residual(f, y)
        leaf_indexes = tmp_tree.apply(X)
        # compute the leaf values
        leaf_val = cal_leaf_val(r, leaf_indexes)
        params.append((tmp_tree, leaf_val))
        # Materialise as a list: on Python 3 ``map`` returns an iterator and
        # ``np.array(map(...))`` is a 0-d object array, not a numeric vector.
        tmp_val = [leaf_val[leaf] for leaf in leaf_indexes]
        # update f
        f += alpha * np.array(tmp_val)
        # compute the loss and print it
        loss=np.log(1+np.exp(-y*f))
        print ('print res:',r,'print loss:',np.mean(np.sum(loss)))
    # return the trees and leaf values
    return params
def fit(self, X_train, Y_train):
    """Fit ``self.n_estimators`` trees to the log-loss gradient.

    Labels given as {-1, 1} are remapped to {0, 1}. Every round fits a tree
    to ``Y - expit(log_odds)``, overwrites each leaf value with
    ``sum(residual) / sum((y - residual) * (1 - y + residual))`` (0.0 when
    the denominator is numerically zero) and adds the shrunken leaf values
    to the running log-odds. Fitted trees are kept in ``self.regressors``.
    """
    self.regressors = []
    self.init_log_odd = 0
    self.transform_y = False

    def to_zero_one(label):
        # -1 becomes 0, every other label becomes 1.
        return 0 if label == -1 else 1

    if 1 in Y_train.unique() and -1 in Y_train.unique():
        Y_train = Y_train.apply(to_zero_one)
        self.transform_y = True

    positives = np.sum(Y_train == 1)
    negatives = np.sum(Y_train == 0)
    initial_log_odds = np.log(positives / negatives)
    self.init_log_odd = initial_log_odds

    log_odds = pd.Series(initial_log_odds, index=Y_train.index)

    def gradient():
        # Negative gradient of the log-loss w.r.t. the log-odds.
        return Y_train - expit(log_odds.ravel())

    residuals = gradient()
    for _ in range(self.n_estimators):
        learner = DecisionTreeRegressor(max_depth=self.max_depth)
        learner.fit(X_train, residuals)
        leaf_of_sample = learner.apply(X_train).copy()

        leaf_nodes = np.flatnonzero(
            learner.tree_.children_left == TREE_LEAF)
        for leaf in leaf_nodes:
            members = np.flatnonzero(leaf_of_sample == leaf)
            leaf_residual = residuals.take(members, axis=0)
            leaf_target = Y_train.take(members, axis=0)
            numerator = np.sum(leaf_residual)
            denominator = np.sum((leaf_target - leaf_residual)
                                 * (1 - leaf_target + leaf_residual))
            learner.tree_.value[leaf, 0, 0] = (
                0.0 if abs(denominator) < 1e-150
                else numerator / denominator)

        self.regressors.append(learner)
        log_odds += (
            self.shrinkage_parameter
            * learner.tree_.value[:, 0, 0].take(leaf_of_sample, axis=0))
        residuals = gradient()
def fit(self, X, y, query_ids):
    """
    Fits the model on the training data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Feature representation of each document.
    y : array-like of shape (n_samples,)
        Relevance scores for each document in query. Must be numeric.
        Preferably {0, 1, 2, 3, 4}
    query_ids : array-like of shape (n_samples,)
        Query ids for given documents. Single query ids must go
        successively.

    Returns
    -------
    self : LambdaMART
        Fitted model.
    """
    assert X.shape[0] == len(y)
    n_samples = X.shape[0]

    y_by_query = group_by_ids(y, query_ids)
    model_scores_by_query = [np.zeros(len(scores)) for scores in y_by_query]
    max_dcg_by_query = [max_dcg_score(scores) for scores in y_by_query]

    for k in tqdm(range(self.n_trees)):
        lambdas, w = np.zeros(n_samples), np.zeros(n_samples)

        doc_idx = 0
        # ``y_query`` rather than ``y``: do not shadow the parameter.
        for y_query, model_scores, max_DCG in zip(
                y_by_query, model_scores_by_query, max_dcg_by_query):
            n_docs = len(y_query)
            if max_DCG == 0:
                # Normalising by a zero ideal DCG would yield NaN; leave
                # lambda and weight of these documents at 0.
                doc_idx += n_docs
                continue

            # rank (0 = best) of every document under the current scores
            doc_ranks_predicted = np.zeros(n_docs, dtype=np.int64)
            doc_ranks_predicted[(-model_scores).argsort()] = np.arange(n_docs)

            for y_i, s_i, rank_i in zip(y_query, model_scores,
                                        doc_ranks_predicted):
                # only pairs with a different relevance contribute
                indices_j = (y_query != y_i)
                y_j = y_query[indices_j]
                s_j = model_scores[indices_j]
                rank_j = doc_ranks_predicted[indices_j]

                delta_DCG = np.abs(
                    (np.power(2, y_i) - np.power(2, y_j))
                    * (1. / np.log2(rank_i + 2.) - 1. / np.log2(rank_j + 2.))
                )
                rho_i_j = 1. / (1. + np.exp(np.abs(s_i - s_j)))

                lambda_i_j = -rho_i_j * delta_DCG
                lambda_i = (np.sign(y_i - y_j) * lambda_i_j).sum() / max_DCG
                w_i = (rho_i_j * (1 - rho_i_j) * delta_DCG).sum() / max_DCG

                lambdas[doc_idx], w[doc_idx] = lambda_i, w_i
                doc_idx += 1

        tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                     min_samples_leaf=10)
        tree.fit(X, lambdas)

        model_scores = np.concatenate(model_scores_by_query)
        leaf_by_doc_index = tree.apply(X)
        for leaf in set(leaf_by_doc_index):
            one_leaf_docs_indices = np.where(leaf_by_doc_index == leaf)[0]
            weight_sum = w[one_leaf_docs_indices].sum()
            # A leaf whose documents all carry zero weight gets no update
            # instead of a NaN from 0/0.
            if weight_sum != 0:
                gamma_l_k = lambdas[one_leaf_docs_indices].sum() / weight_sum
            else:
                gamma_l_k = 0.0

            tree.tree_.value[leaf] = -gamma_l_k * self.learning_rate
            model_scores[one_leaf_docs_indices] -= (gamma_l_k
                                                    * self.learning_rate)

        model_scores_by_query = group_by_ids(model_scores, query_ids)

        self.trees.append(tree)

    return self
class DecisionTreeRegressionModel(RegressionModel):
    """ Wraps sklearn's DecisionTreeRegressor.

    After fitting, per-leaf statistics (mean, sample variance, variance of
    the mean, observation count) are computed from the training targets and
    used to build predictions.

    TODO: Beef up the RegressionModel base class and actually enforce a consistent interface.
    TODO: See how much boilerplate we can remove from model creation.
    """

    _PREDICTOR_OUTPUT_COLUMNS = [
        Prediction.LegalColumnNames.IS_VALID_INPUT,
        Prediction.LegalColumnNames.PREDICTED_VALUE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_SIZE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM
    ]

    def __init__(self,
                 model_config: Point,
                 input_space: Hypergrid,
                 output_space: Hypergrid,
                 logger=None):
        """Create the (unfitted) regressor from ``model_config``.

        :param model_config: must lie in
            ``decision_tree_config_store.parameter_space``
        :param input_space: space of the features
        :param output_space: space of the target; exactly one dimension
        :param logger: optional logger; one is created when omitted
        """
        if logger is None:
            logger = create_logger("DecisionTreeRegressionModel")
        self.logger = logger

        assert model_config in decision_tree_config_store.parameter_space
        RegressionModel.__init__(self,
                                 model_type=type(self),
                                 model_config=model_config,
                                 input_space=input_space,
                                 output_space=output_space)

        self._input_space_adapter = CategoricalToDiscreteHypergridAdapter(
            adaptee=self.input_space)

        self.input_dimension_names = [
            dimension.name
            for dimension in self._input_space_adapter.dimensions
        ]
        self.target_dimension_names = [
            dimension.name for dimension in self.output_space.dimensions
        ]
        self.logger.debug(
            f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
        )

        assert len(
            self.target_dimension_names
        ) == 1, "For now (and perhaps forever) we only support single target per tree."

        # A configured value of 0 for max_depth, or 0/1 for max_leaf_nodes,
        # is passed to sklearn as None.
        self._regressor = DecisionTreeRegressor(
            criterion=self.model_config.criterion,
            splitter=self.model_config.splitter,
            max_depth=self.model_config.max_depth
            if self.model_config.max_depth != 0 else None,
            min_samples_split=self.model_config.min_samples_split,
            min_samples_leaf=self.model_config.min_samples_leaf,
            min_weight_fraction_leaf=self.model_config.
            min_weight_fraction_leaf,
            max_features=self.model_config.max_features,
            random_state=self.model_config.get("random_state", None),
            max_leaf_nodes=self.model_config.max_leaf_nodes
            if self.model_config.max_leaf_nodes not in (0, 1) else None,
            min_impurity_decrease=self.model_config.min_impurity_decrease,
            ccp_alpha=self.model_config.ccp_alpha)

        # These are used to compute the variance in predictions
        self._observations_per_leaf = dict()
        self._mean_per_leaf = dict()
        self._mean_variance_per_leaf = dict()
        self._sample_variance_per_leaf = dict()
        self._count_per_leaf = dict()

        self._trained = False

    @property
    def trained(self):
        return self._trained

    @property
    def num_observations_used_to_fit(self):
        return self.last_refit_iteration_number

    def should_fit(self, num_samples):
        """ Returns true if the model should be fitted.

        This model should be fitted under the following conditions:
        1) It has not been fitted yet and num_samples is larger than min_samples_to_fit
        2) The model has been fitted and the number of new samples is larger than n_new_samples_before_refit

        :param num_samples:
        :return:
        """
        if not self.trained:
            return num_samples > self.model_config.min_samples_to_fit
        num_new_samples = num_samples - self.num_observations_used_to_fit
        return num_new_samples >= self.model_config.n_new_samples_before_refit

    @trace()
    def fit(self, feature_values_pandas_frame, target_values_pandas_frame,
            iteration_number):
        """Fit the tree and recompute the per-leaf statistics.

        :param feature_values_pandas_frame: features, projected through the
            input space adapter before use
        :param target_values_pandas_frame: frame containing the single
            target column
        :param iteration_number: stored as ``last_refit_iteration_number``
        """
        self.logger.debug(
            f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations."
        )

        # Let's get the numpy arrays out of the panda frames
        #
        feature_values_pandas_frame = self._input_space_adapter.project_dataframe(
            feature_values_pandas_frame, in_place=False)

        feature_values = feature_values_pandas_frame[
            self.input_dimension_names].to_numpy()
        target_values = target_values_pandas_frame[
            self.target_dimension_names].to_numpy()

        # Clean up state before fitting again. All per-leaf tables are
        # reset so that no entry of a previous tree survives a refit.
        self._observations_per_leaf = dict()
        self._mean_per_leaf = dict()
        self._mean_variance_per_leaf = dict()
        self._sample_variance_per_leaf = dict()
        self._count_per_leaf = dict()

        self._regressor.fit(feature_values, target_values)

        # Now that we have fit the model we can augment our tree by computing the variance
        # TODO: this code can be easily optimized, but premature optimization is the root of all evil.
        node_indices = self._regressor.apply(feature_values)

        for node_index, sample_target_value in zip(node_indices,
                                                   target_values):
            self._observations_per_leaf.setdefault(
                node_index, []).append(sample_target_value)

        # Count distinct leaves; len(node_indices) is the number of samples.
        self.logger.debug(
            f"The resulting tree has {len(self._observations_per_leaf)} leaf nodes."
        )

        # Now let's compute all predictions
        for node_index in self._observations_per_leaf:
            # First convert the observations to a numpy array.
            observations_at_leaf = np.array(
                self._observations_per_leaf[node_index])
            self._observations_per_leaf[node_index] = observations_at_leaf

            leaf_mean = np.mean(observations_at_leaf)
            leaf_sample_variance = np.var(
                observations_at_leaf, ddof=1
            )  # ddof = delta degrees of freedom. We want sample variance.
            leaf_mean_variance = leaf_sample_variance / len(
                observations_at_leaf)

            self._mean_per_leaf[node_index] = leaf_mean
            self._mean_variance_per_leaf[node_index] = leaf_mean_variance
            self._sample_variance_per_leaf[node_index] = leaf_sample_variance
            self._count_per_leaf[node_index] = len(observations_at_leaf)

        self._trained = True
        self.last_refit_iteration_number = iteration_number

    @trace()
    def predict(self,
                feature_values_pandas_frame,
                include_only_valid_rows=True):
        """Build a ``Prediction`` from the per-leaf statistics.

        Rows outside the input space are filtered out before the tree is
        queried. When the model is not trained, the prediction dataframe is
        built without any rows being filled in.

        :param feature_values_pandas_frame: samples to predict for
        :param include_only_valid_rows: when false, invalid rows are added
            back at their original index positions
        :return: the populated ``Prediction`` object
        """
        self.logger.debug(
            f"Creating predictions for {len(feature_values_pandas_frame.index)} samples."
        )

        # dataframe column shortcuts
        is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
        sample_var_col = Prediction.LegalColumnNames.SAMPLE_VARIANCE.value
        sample_size_col = Prediction.LegalColumnNames.SAMPLE_SIZE.value
        dof_col = Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM.value

        valid_rows_index = None
        features_df = None
        if self.trained:
            valid_features_df = self.input_space.filter_out_invalid_rows(
                original_dataframe=feature_values_pandas_frame,
                exclude_extra_columns=True)
            features_df = self._input_space_adapter.project_dataframe(
                valid_features_df, in_place=False)
            valid_rows_index = features_df.index

        predictions = Prediction(
            objective_name=self.target_dimension_names[0],
            predictor_outputs=self._PREDICTOR_OUTPUT_COLUMNS,
            dataframe_index=valid_rows_index)
        prediction_dataframe = predictions.get_dataframe()

        if valid_rows_index is not None and not valid_rows_index.empty:
            # Temporary column: the leaf each row lands in, used as the
            # lookup key into the per-leaf tables and dropped afterwards.
            prediction_dataframe['leaf_node_index'] = self._regressor.apply(
                features_df.loc[valid_rows_index].to_numpy())
            prediction_dataframe[predicted_value_col] = prediction_dataframe[
                'leaf_node_index'].map(self._mean_per_leaf)
            prediction_dataframe[
                predicted_value_var_col] = prediction_dataframe[
                    'leaf_node_index'].map(self._mean_variance_per_leaf)
            prediction_dataframe[sample_var_col] = prediction_dataframe[
                'leaf_node_index'].map(self._sample_variance_per_leaf)
            prediction_dataframe[sample_size_col] = prediction_dataframe[
                'leaf_node_index'].map(self._count_per_leaf)
            prediction_dataframe[
                dof_col] = prediction_dataframe[sample_size_col] - 1
            prediction_dataframe[is_valid_input_col] = True
            prediction_dataframe.drop(columns=['leaf_node_index'],
                                      inplace=True)

        predictions.validate_dataframe(prediction_dataframe)
        if not include_only_valid_rows:
            predictions.add_invalid_rows_at_missing_indices(
                desired_index=feature_values_pandas_frame.index)
        return predictions
def _train_honest_tree(self, df, y_var, w_var, index_cols, min_samples_leaf):
    """Train one tree of the forest and return its per-leaf outcome table.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the index, outcome and treatment columns.
    y_var : list
        Outcome column name(s).
    w_var : list
        Treatment column name(s).
    index_cols : list
        Columns moved into the index before fitting.
    min_samples_leaf : int
        Leaf-size setting forwarded to the tree and to ``self._prune_tree``.

    Returns
    -------
    tuple
        ``(leaves, this_preds, model)``: mean outcome per (leaf, treatment),
        the leaf assigned to every predicted row, and the fitted/pruned tree.

    Raises
    ------
    ValueError
        If ``self.algorithm`` is neither ``'double_sample'`` nor
        ``'propensity'`` (previously this surfaced as a ``NameError``).
    """
    if self.algorithm not in ('double_sample', 'propensity'):
        raise ValueError(
            "Unknown algorithm %r; expected 'double_sample' or 'propensity'"
            % (self.algorithm,))

    # Columns hidden from the tree: the outcome always, and the treatment
    # unless use_w_in_tree is set (list * bool keeps or empties the list).
    hidden_cols = y_var + w_var * (not self.use_w_in_tree)
    df_not_sample = None

    if self.algorithm == 'double_sample':
        # step 0 : subsample of df to populate I and J
        df_sample, df_not_sample = train_test_split(df, test_size=0.2)
        s = 0.5
    else:
        df_sample = df
        s = np.random.uniform(0.3, 0.5)

    # step 1 : splitting (J = train, I = predictions)
    indexed_sample = df_sample.set_index(index_cols)
    J, I, tau_J, tau_I, W_J, W_I = train_test_split(
        indexed_sample.drop(hidden_cols, axis=1),
        indexed_sample[y_var],
        indexed_sample[w_var],
        test_size=s)

    # step 2 : training the tree
    if self.algorithm == 'double_sample':
        if not self.true_honest_tree:
            model = DecisionTreeRegressor(
                criterion='mse', min_samples_leaf=2 * min_samples_leaf)
        else:
            model = decision_tree.DecisionTree(min_samples_leaf)
        # J is used for training
        model.fit(J, tau_J)
        # I is used for prediction and pruning
        model = self._prune_tree(model, I, W_I, w_var, min_samples_leaf)
        X_prediction, tau_prediction, W_prediction = I, tau_I, W_I
    else:
        model = ExtraTreeClassifier(criterion='gini',
                                    min_samples_leaf=2 * min_samples_leaf,
                                    splitter='random')
        # we use J for training, but this time the target is the treatment
        # class variable
        model.fit(J, W_J)
        # NOTE(review): the original comment said "pruning and prediction in
        # J", but pruning is called with I / W_I -- confirm which is intended.
        model = self._prune_tree(model, I, W_I, w_var, min_samples_leaf)
        X_prediction, tau_prediction, W_prediction = J, tau_J, W_J

    # creating a dataframe with the predictions by leaf
    leaves = X_prediction[[]].copy()
    leaves['leaf'] = model.apply(X_prediction)
    leaves['true'] = tau_prediction
    leaves[w_var] = W_prediction
    leaves = leaves.groupby(['leaf'] + w_var).true.mean().reset_index()

    # predicting
    if self.full_predictor and self.algorithm == 'double_sample':
        # if full, we predict for everyone, including the held-out 20 %
        df_out = df_not_sample.set_index(index_cols).drop(hidden_cols, axis=1)
        X_prediction = pd.concat([df_out, X_prediction])
    this_preds = X_prediction[[]].copy()
    this_preds['leaf'] = model.apply(X_prediction)
    this_preds.reset_index(inplace=True)
    return leaves, this_preds, model
class GroupPCADecisionTreeRegressor(BaseEstimator, RegressorMixin):
    """PCA on random groups of features followed by a decision tree.

    The estimator fits a ``GroupPCA`` transform and then a
    ``DecisionTreeRegressor`` on the transformed features. All tree
    hyper-parameters are forwarded unchanged; the ``pca_*`` parameters are
    forwarded to ``GroupPCA``.

    See : GroupPCA and DecisionTreeRegressor
    """

    def __init__(
        self,
        criterion="mse",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        pca_bootstrap=False,
        pca_max_nb_groups=0.25,
        pca_max_group_size=0.05,
    ):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.presort = presort

        self.pca_bootstrap = pca_bootstrap
        self.pca_max_nb_groups = pca_max_nb_groups
        self.pca_max_group_size = pca_max_group_size

        # Fitted components; both stay None until fit() is called.
        self._tree = None
        self._group_pca = None

    def _check_is_fitted(self):
        """Raise NotFittedError unless fit() has created the inner tree."""
        if self._tree is None:
            raise NotFittedError("You should fit the model first")

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        """Fit the group PCA on ``X`` and the tree on the projected data.

        ``X_idx_sorted`` is accepted for signature compatibility but is not
        forwarded: the tree is always fitted with ``X_idx_sorted=None``
        because it is trained on the PCA output, not on ``X`` itself.

        Returns
        -------
        self
        """
        self.n_features_ = X.shape[1]

        # 1) create GroupPCA
        self._group_pca = GroupPCA(
            random_state=self.random_state,
            bootstrap=self.pca_bootstrap,
            max_nb_groups=self.pca_max_nb_groups,
            max_group_size=self.pca_max_group_size,
        )

        # 2) Create Tree
        self._tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=self.random_state,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            presort=self.presort,
        )

        # 3) Apply group PCA
        Xpca = self._group_pca.fit_transform(X, y)

        # 4) fit Tree
        self._tree.fit(Xpca, y, sample_weight=sample_weight,
                       check_input=check_input, X_idx_sorted=None)

        return self

    def predict(self, X, check_input=True):
        """Project ``X`` with the fitted group PCA and predict with the tree."""
        self._check_is_fitted()
        Xpca = self._group_pca.transform(X)
        return self._tree.predict(Xpca, check_input=check_input)

    def apply(self, X, check_input=True):
        """Return the tree's ``apply`` result for the PCA projection of ``X``."""
        self._check_is_fitted()
        Xpca = self._group_pca.transform(X)
        return self._tree.apply(Xpca, check_input=check_input)

    def decision_path(self, X, check_input=True):
        """Return the tree's ``decision_path`` for the PCA projection of ``X``.

        Raises NotFittedError before fit(), consistent with predict/apply
        (previously this failed with an AttributeError on None).
        """
        self._check_is_fitted()
        Xpca = self._group_pca.transform(X)
        return self._tree.decision_path(Xpca, check_input=check_input)

    @property
    def tree_(self):
        """The ``tree_`` attribute of the fitted inner decision tree."""
        self._check_is_fitted()
        return self._tree.tree_

    def _validate_X_predict(self, X, check_input):
        """Validate X whenever one tries to predict, apply, predict_proba"""
        if check_input:
            X = check_array(X, dtype=DTYPE, accept_sparse="csr")
            if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError(
                "Number of features of the model must "
                "match the input. Model n_features is %s and "
                "input n_features is %s " % (self.n_features_, n_features)
            )

        return X
def train(joint_id, X, y, model_dir, samples_leaf, k_value, num_samples, xy_offset):
    """Train a regression tree on the unit directions towards one joint.

    @params:
        joint_id     : current joint id (index into JOINT_NAMES)
        X            : samples feature array (N x num_samples x num_feats)
        y            : samples unit direction vectors (N x num_samples x 3)
        model_dir    : directory under which the model folder is created
        samples_leaf : min_samples_leaf passed to the DecisionTreeRegressor
        k_value      : forwarded to `stochastic`; also part of the folder name
        num_samples  : only used in the output folder name
        xy_offset    : only used in the output folder name

    @returns:
        (regressor, L) : the fitted tree and the result of `stochastic(...)`;
                         both are also dumped with joblib under `model_dir`.
    """
    joint_name = JOINT_NAMES[joint_id]
    logger.debug('Start training %s model...', joint_name)

    X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2])  # (N x num_samples, num_feats)
    y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2])  # (N x num_samples, 3)

    # A sample is treated as invalid when all of its features are zero.
    valid_rows = np.logical_not(np.all(X_reshape == 0, axis=1))
    logger.debug('Model %s - Valid samples: %d / %d', joint_name,
                 int(np.count_nonzero(valid_rows)), X_reshape.shape[0])

    # Fit decision tree to the valid samples only
    regressor = DecisionTreeRegressor(min_samples_leaf=samples_leaf)
    regressor.fit(X_reshape[valid_rows], y_reshape[valid_rows])

    # NOTE(review): `stochastic` and the statistics below receive ALL rows,
    # including the all-zero ones excluded from fitting -- confirm intended.
    L = stochastic(regressor, X_reshape, y_reshape, k_value)

    # Print statistics on leafs
    leaf_ids = regressor.apply(X_reshape)
    leaf_counts = np.bincount(leaf_ids)
    unique_ids = np.unique(leaf_ids)
    total = np.sum(leaf_counts)
    # bincount also has zero entries for inner nodes; only look at node ids
    # that actually received samples so the reported id is a real leaf id
    # (the original logged an index into the filtered array instead).
    occupied = np.flatnonzero(leaf_counts)
    biggest = np.argmax(leaf_counts)
    smallest = occupied[np.argmin(leaf_counts[occupied])]

    logger.debug('Model %s - # Leaves: %d', joint_name, unique_ids.shape[0])
    logger.debug('Model %s - Smallest Leaf ID: %d, # Samples: %d/%d',
                 joint_name, smallest, leaf_counts[smallest], total)
    logger.debug('Model %s - Biggest Leaf ID: %d, # Samples: %d/%d',
                 joint_name, biggest, leaf_counts[biggest], total)
    logger.debug('Model %s - Average Leaf Size: %d', joint_name,
                 total / unique_ids.shape[0])

    # Save models to disk
    folder = 'dl_%s_%d_%d_%d_%d/' % (TRAIN_SET, k_value, samples_leaf, num_samples, xy_offset)
    out_dir = os.path.join(model_dir, folder)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    regressor_path = os.path.join(model_dir, folder, 'regressor' + str(joint_id) + '.pkl')
    L_path = os.path.join(model_dir, folder, 'L' + str(joint_id) + '.pkl')

    joblib.dump(regressor, regressor_path)
    joblib.dump(L, L_path)

    return regressor, L
def __clustering(self, X, y=None):
    """
    The clustering procedure of the Optimal Weighted Clustering Gaussian Process.
    This function should not be called externally.

    Dispatches on ``self.cluster_method`` ("k-mean", "tree", "random", "GMM",
    "fuzzy-c-mean", "flame") and stores the outcome on the instance:
    ``self.clusterer`` always, plus ``self.cluster_label`` and/or
    ``self.cluster_labels_proba`` depending on the branch. The "tree" and
    "flame" branches overwrite ``self.n_cluster`` with the number of clusters
    actually found. ``y`` is only used by the "tree" branch.

    NOTE: this is Python 2 code (print statements; ``/`` on ints and
    ``range(...) * m`` in the "random" branch rely on Python 2 semantics).
    """
    if self.cluster_method == "k-mean":
        clusterer = KMeans(n_clusters=self.n_cluster)
        clusterer.fit(X)
        self.cluster_label = clusterer.labels_
        self.clusterer = clusterer
    elif self.cluster_method == "tree":
        print "Warning: specified clustering count might be overwritten"
        # Leaf size chosen so that roughly n_cluster leaves can result.
        minsamples = int(len(X) / (self.n_cluster + 1))
        tree = DecisionTreeRegressor(random_state=0, min_samples_leaf=minsamples)
        tree.fit(X, y)
        # Each training point is labelled with the id of the leaf it falls in.
        labels = tree.apply(X)
        clusters = np.unique(labels)
        k = len(clusters)
        print "leafs:", k
        self.n_cluster = k
        self.leaf_labels = np.unique(labels)
        self.cluster_label = labels
        self.clusterer = tree
    elif self.cluster_method == "random":
        # Round-robin labels 0..n_cluster-1 repeated m times plus r leftovers,
        # then shuffled in place.
        r = self.n_sample % self.n_cluster
        m = (self.n_sample - r) / self.n_cluster
        self.cluster_label = array(range(self.n_cluster) * m + range(r))
        self.clusterer = None
        shuffle(self.cluster_label)
    elif self.cluster_method == "GMM":  # GMM from sklearn
        self.clusterer = GMM(n_components=self.n_cluster, n_iter=1000)
        self.clusterer.fit(X)
        self.cluster_labels_proba = self.clusterer.predict_proba(X)
        self.cluster_label = self.clusterer.predict(X)
    elif self.cluster_method == "fuzzy-c-mean":
        # Fuzzy C-means via fuzz.cluster.cmeans
        # NOTE(review): the original comment said "from sklearn"; `fuzz` is
        # presumably scikit-fuzzy -- confirm against the file's imports.
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
            X.T, self.n_cluster, 2, error=0.000005, maxiter=10000, init=None
        )
        self.clusterer = cntr  # save the centers for cmeans_predict
        self.cluster_labels_proba = u.T
        self.cluster_labels_proba = np.array(self.cluster_labels_proba)
        # Hard label = cluster with the largest membership value.
        self.cluster_label = np.argmax(u, axis=0)
        self.cluster_label = np.array(self.cluster_label)
    elif self.cluster_method == "flame":
        # Flame clustering, files are attached
        print "Warning: specified clustering count will be overwritten with Flame"
        flameobject = flame.Flame_New()
        tempdata = X.astype(np.float32)
        N = len(tempdata)
        # NOTE(review): second Flame_New() replaces the object created above
        # without Flame_Clear on the first one -- looks like a leftover.
        flameobject = flame.Flame_New()
        flame.Flame_SetDataMatrix(flameobject, tempdata, 0)
        flame.Flame_DefineSupports(flameobject, self.flame_knn, self.flame_threshold)  # knn is number of neighbours
        cso_count = flameobject.cso_count
        # print "done, found ", cso_count, " clusters"
        k = cso_count + 1  #!!! overwrite k here
        self.n_cluster = k
        print "clusters:", k
        flame.Flame_LocalApproximation(flameobject, 500, 1e-6)
        self.cluster_labels_proba = flame.Print_Clusters(flameobject, (cso_count + 1) * N)
        flame.Flame_Clear(flameobject)
        # One row per sample, one column per cluster.
        self.cluster_labels_proba = self.cluster_labels_proba.reshape((N, cso_count + 1))
        # NOTE(review): self.cluster_label is not assigned in this branch in
        # the visible code -- verify it is set elsewhere.
        self.clusterer = None  # we need to assign something
# scaling is not necessary for decision trees small_dTree = DecisionTreeRegressor(max_depth=2) small_dTree.fit(train_X[["median_income"]], train_y) small_dTree_pred = small_dTree.predict(train_X[["median_income"]]) trees.tree_to_code(small_dTree, ['median_income']) print(f" feature: {small_dTree.tree_.feature}") print(f" child left: {small_dTree.tree_.children_left}") print(f" child left idx 4: {small_dTree.tree_.children_left[4]}") print(f" child right: {small_dTree.tree_.children_right}") print(f" tree value idx 2: {small_dTree.tree_.value[2]}") print(f" ") rss = np.sum((train_y - small_dTree_pred)**2) print(f" rss: {rss}") thresh_node = small_dTree.tree_.threshold[1] samples_in_leaves = small_dTree.apply(train_X[[ "median_income" ]]) # gibt pro Zeile die Zugehörigkeit zum terminal node aus print(samples_in_leaves) print(pd.value_counts((samples_in_leaves))) #use impurity measure? print(small_dTree.tree_.children_left[2]) print(f" child left idx 2: {small_dTree.tree_.children_left[2]}") print(f" tree leaf: {_tree.TREE_LEAF}") trees.depth_first(small_dTree.tree_, 0) print(f" node count: {small_dTree.tree_.node_count}") # total nr of nodes print(f" node impurity idx 0: {small_dTree.tree_.impurity[0]}") print( f" impurity * n samples at idx 0: {small_dTree.tree_.impurity[0]*small_dTree.tree_.n_node_samples[0]}" ) rss_mean = np.sum((train_y - np.mean(train_y))**2) print(f" pred error of mean: {rss_mean}")
meanSquare = np.sqrt(meanSquare[0]) if(linearReg < linearRegBest): linearRegBest = linearReg meanSquareBest = meanSquare print(leftModel) model = [leftModel, rightModel] print(model) return linearReg, meanSquareBest modelTree, meanSquare = ModelTree(); print(modelTree," ",meanSquare) print ("Time taken to build the model: ",datetime.now() - startTime) node_indicator = regressionTree.decision_path(X_test) leave_id = regressionTree.apply(X_test) sample_id = 0 node_index = node_indicator.indices[node_indicator.indptr[sample_id]: node_indicator.indptr[sample_id + 1]] for i in range(n_nodes): if is_leaves[i]: print("%snode=%s leaf node." % (node_depth[i] * "\t", i)) else: print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to " "node %s." % (node_depth[i] * "\t", i, children_left[i], feature[i],
class Breiman_Tree:
    '''
    Main class for Breiman Tree version of active learning algorithm.

    The object holds every point (labelled or not), fits a regression tree on
    the labelled ones, and samples new points to label with a probability
    proportional to sqrt(leaf variance * leaf marginal), spread evenly over
    the unlabelled points of each leaf.
    '''

    def __init__(self, min_samples_leaf=None, seed=None):
        '''
        min_samples_leaf : forwarded to DecisionTreeRegressor (None -> 1).
        seed : random_state of the tree (None -> 0).
        '''
        self.points = None
        self.labels = None
        self.labelled_indices = None
        self._num_points = 0
        self._num_labelled = 0

        self.seed = 0 if seed is None else seed
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

        self.tree = DecisionTreeRegressor(random_state=self.seed,
                                          min_samples_leaf=self.min_samples_leaf)
        self._leaf_indices = []
        self._leaf_marginal = []
        self._leaf_var = []
        self._al_proportions = []

        # Cache flags: cleared whenever the tree or the labels change.
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

        self._verbose = False

    def _invalidate_caches(self):
        '''Mark the per-leaf statistics and proportions as stale.'''
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

    def input_data(self, all_data, labelled_indices, labels, copy_data=True):
        '''
        Store the data set.

        all_data : sequence/array of points.
        labelled_indices : positions in all_data that have a label.
        labels : labels, aligned with labelled_indices.
        copy_data : deep-copy the three inputs before storing them.

        Raises ValueError when there are more labelled indices than points or
        when labelled_indices and labels differ in length.
        '''
        if copy_data:
            all_data = copy.deepcopy(all_data)
            labelled_indices = copy.deepcopy(labelled_indices)
            labels = copy.deepcopy(labels)

        if len(all_data) < len(labelled_indices):
            raise ValueError('Cannot have more labelled indicies than points')

        if len(labelled_indices) != len(labels):
            raise ValueError('Labelled indicies list and labels list must be same length')

        # Everything is kept as plain lists internally.
        if isinstance(all_data, np.ndarray):
            if self._verbose:
                print('Converting all_data to list of lists internally')
            all_data = all_data.tolist()

        if isinstance(labelled_indices, np.ndarray):
            if self._verbose:
                print('Converting labelled_indices to list internally')
            labelled_indices = labelled_indices.tolist()

        if isinstance(labels, np.ndarray):
            if self._verbose:
                print('Converting labels to list internally')
            labels = labels.tolist()

        self.points = all_data
        self._num_points = len(self.points)
        self._num_labelled = len(labels)

        # Making a label list, with None in places where we don't have the label
        aligned = [None] * self._num_points
        for ind, label in zip(labelled_indices, labels):
            aligned[ind] = label
        self.labels = aligned
        self.labelled_indices = list(labelled_indices)
        self._invalidate_caches()

    def fit_tree(self):
        '''Fit the tree on the labelled points and assign every point to a leaf.'''
        points = np.array(self.points)
        self.tree.fit(points[self.labelled_indices, :],
                      np.array(self.labels)[self.labelled_indices])
        self._leaf_indices = self.tree.apply(points)
        # A new tree means new leaves: both caches are stale (the original
        # only reset the statistics flag, leaving old proportions in use).
        self._invalidate_caches()

    def label_point(self, index, value):
        '''
        Record the label `value` for the point at `index`.

        Raises RuntimeError if no data was loaded and ValueError if `index`
        is beyond the stored data.
        '''
        if self.labels is None:
            raise RuntimeError('No data in the tree')

        if len(self.labels) <= index:
            raise ValueError('Index {} larger than size of data in tree'.format(index))

        value = copy.copy(value)
        index = copy.copy(index)

        newly_labelled = self.labels[index] is None
        self.labels[index] = value
        # Relabelling an already labelled point must not duplicate its index
        # (that would duplicate the training row) nor inflate the count.
        if newly_labelled:
            self.labelled_indices.append(index)
            self._num_labelled += 1
        self._invalidate_caches()

    def predict(self, new_points):
        '''Return the fitted tree's predictions for `new_points`.'''
        return self.tree.predict(new_points)

    def calculate_leaf_statistics(self):
        '''
        For every leaf compute its marginal (share of all points) and the
        variance of the labels of the labelled points that fall in it.
        '''
        leaf_ids = np.asarray(self._leaf_indices)
        counts = Counter(self._leaf_indices)
        self._leaf_marginal = []
        self._leaf_var = []
        for key in np.unique(leaf_ids):
            self._leaf_marginal.append(counts[key] / self._num_points)
            members = np.flatnonzero(leaf_ids == key)
            # Only this leaf's labels (the original used all labels, so every
            # leaf got the same variance).
            # NOTE(review): utils.unbiased_var is defined elsewhere; its
            # behaviour for fewer than two labels is not visible here.
            leaf_labels = [self.labels[i] for i in members
                           if self.labels[i] is not None]
            self._leaf_var.append(utils.unbiased_var(leaf_labels))
        self._leaf_statistics_up_to_date = True

    def al_calculate_leaf_proportions(self):
        '''Compute normalised sampling proportions sqrt(var * marginal) per leaf.'''
        if not self._leaf_statistics_up_to_date:
            self.calculate_leaf_statistics()
        raw = [np.sqrt(var * marginal)
               for var, marginal in zip(self._leaf_var, self._leaf_marginal)]
        self._al_proportions = np.array(raw) / sum(raw)
        self._leaf_proportions_up_to_date = True

    def pick_new_points(self, num_samples=1):
        '''
        Sample `num_samples` distinct unlabelled point indices.

        Raises ValueError when no unlabelled point can be sampled.
        '''
        if not self._leaf_proportions_up_to_date:
            self.al_calculate_leaf_proportions()

        leaf_ids = np.array(self._leaf_indices)
        unlabelled = [i for i in range(self._num_points) if self.labels[i] is None]
        unlabelled_per_leaf = Counter(leaf_ids[unlabelled])

        # Leaf proportion shared equally among the leaf's unlabelled points.
        point_proportions = {}
        for i, key in enumerate(np.unique(self._leaf_indices)):
            point_proportions[key] = self._al_proportions[i] / max(1, unlabelled_per_leaf[key])

        probs = np.array([point_proportions[key] for key in self._leaf_indices])
        probs[self.labelled_indices] = 0
        total = sum(probs)
        if not total > 0:
            raise ValueError('No unlabelled points left to sample from')
        probs = probs / total

        leaves_to_sample = np.random.choice(self._leaf_indices, num_samples,
                                            p=probs, replace=False)

        points_to_label = []
        already_picked = set()
        for leaf in leaves_to_sample:
            # Skip points picked earlier in this call so no index is returned
            # twice; a leaf is drawn at most once per unlabelled point it
            # holds, so the candidate list cannot run empty.
            possible_points = [i for i in range(self._num_points)
                               if self._leaf_indices[i] == leaf
                               and self.labels[i] is None
                               and i not in already_picked]
            choice = np.random.choice(possible_points)
            already_picked.add(choice)
            points_to_label.append(choice)

        return points_to_label
class DecisionTreeSVRRegressor():
    """Decision tree whose leaves each hold their own SVR model.

    A ``DecisionTreeRegressor`` partitions the training data; one ``SVR`` is
    then fitted per leaf on the samples that fall into it. Prediction routes
    each sample to its leaf and uses that leaf's SVR. The class mirrors the
    decision-tree methods it wraps (apply, decision_path, ...).
    """

    def __init__(self,
                 params_svr = {'kernel': 'rbf', 'epsilon': 0.2},
                 criterion="mse",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_split=1e-7,
                 presort=False):
        """
        params_svr : dict of keyword arguments for every per-leaf SVR.
        All other parameters are forwarded to DecisionTreeRegressor, except
        ``min_impurity_split`` which is accepted but (as before) not used.
        """
        self.base_tree = DecisionTreeRegressor(
            criterion=criterion,
            splitter=splitter,  # was hard-coded to 'best', ignoring the argument
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state,
            presort=presort)
        # Copy so instances never share (or mutate) the default dict.
        self.params_svr = dict(params_svr)
        self.x_leaves = {}
        self.y_leaves = {}
        self.svrs_leaves = {}
        self.tree_ = None

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        """Fit the tree, then one SVR per leaf; returns self.

        NOTE(review): samples are read with ``X[i]`` / ``y[i]``, so this
        assumes positionally indexable inputs (arrays or lists) -- confirm.
        """
        self.base_tree.fit(X, y, sample_weight=sample_weight,
                           check_input=check_input, X_idx_sorted=X_idx_sorted)
        self.tree_ = self.base_tree.tree_

        # Start from scratch so a refit does not mix in data of earlier fits.
        self.x_leaves = {}
        self.y_leaves = {}
        self.svrs_leaves = {}

        leaves = self.base_tree.apply(X)
        for x_id in range(0, X.shape[0]):
            leaf_id = int(leaves[x_id])
            self.x_leaves.setdefault(leaf_id, []).append(X[x_id])
            self.y_leaves.setdefault(leaf_id, []).append(y[x_id])

        for leaf_id in self.x_leaves:
            svr = SVR(**self.params_svr)
            svr.fit(self.x_leaves[leaf_id], self.y_leaves[leaf_id])
            self.svrs_leaves[leaf_id] = svr
        return self

    def predict(self, X, check_input=True):
        """Return a list with one prediction per row of X.

        Each row is sent to the SVR of the leaf it falls into. (The original
        looked the leaf up with the row itself instead of its position.)
        """
        leaf_ids = self.base_tree.apply(X)
        predictions = []
        for x, leaf in zip(X, leaf_ids):
            svr = self.svrs_leaves[int(leaf)]
            # SVR.predict expects a 2-D batch; wrap the single sample.
            predictions.append(svr.predict([x])[0])
        return predictions

    def apply(self, X, check_input=True):
        """Return the leaf id of every row of X."""
        return self.base_tree.apply(X)

    def decision_path(self, X, check_input=True):
        """Return the base tree's decision path for X."""
        return self.base_tree.decision_path(X)

    def feature_importances_(self):
        """Return the base tree's feature importances.

        Kept as a method for backward compatibility; on the wrapped tree it
        is an attribute, not a callable.
        """
        return self.base_tree.feature_importances_

    def _validate_X_predict(self, X, check_input=True):
        """Delegate input validation to the base tree."""
        return self.base_tree._validate_X_predict(X, check_input)
class DTALE(object):
    """
    A decision tree the learns to make same predictions as the nucleus model
    classifier, but using interpretable features. Note that the tree is
    learning using the mask-rcnn model predictions, NOT the gtruth. The idea
    is to find an interpretable approximation of what the classification
    component of the model seems to be relying on. We rely on a REGRESSION
    tree for a more refined approximation of the model behavior.

    References:
    -----------
    Amgad M, Atteya LA, Hussein H, Mohammed KH, Hafiz E, Elsebaie MA,
    Mobadersany P, Manthey D, Gutman DA, Elfandy H, Cooper LA. Explainable
    nucleus classification using Decision Tree Approximation of Learned
    Embeddings. Bioinformatics. 2021 Sep 29.
    """

    def __init__(
        self,
        feats: DataFrame,
        clusts: DataFrame,
        savedir: str,
        pcoln: str = 'pred_categ',
        ecoln0: str = 'embedding_0',
        ecoln1: str = 'embedding_1',
        classes_list: List = None,
        fitkwargs: Dict = None,
    ):
        """
        Parameters
        ----------
        feats: DataFrame
            A dataframe of interpretable features per nucleus. Rows
            correspond to nuclei and columns correspond to features. Must have
            the same index and no of rows as the `clusts` parameter.
        clusts: DataFrame
            A dataframe that is indexed by nucleus name or i.d., and has at
            least three columns, whose names are controlled by the `pcoln`,
            `ecoln0` and `ecoln1` parameters. The columns encode the nucleus
            classification labels, first embedding dimension value, and second
            embedding dimension value. Must have the same index and no of rows
            as `feats`.
        savedir: str
            Directory to save model, figures, and other results.
        pcoln: str
            Name of column encoding classification label of nuclei in `clusts`.
        ecoln0: str
            Name of column encoding first embedding value for nuclei in `clusts`.
        ecoln1: str
            Name of column encoding second embedding value for nuclei in `clusts`.
        classes_list: List
            Optional, set of unique classification classes. Extracted
            automatically if not provided.
        fitkwargs: Dict
            kwargs to pass to DecisionTreeRegressor. Default values used in
            the DTALE paper are used if this parameter is not provided.
        """
        # drop nans, keeping both frames aligned on the surviving rows
        clusts = clusts.dropna(axis=0)
        feats = feats.loc[clusts.index, :]
        feats = feats.dropna(axis=0)
        clusts = clusts.loc[feats.index, :]
        self.feats = feats
        self.clusts = clusts

        # some ground work
        self.pcoln = pcoln
        self.ecoln0 = ecoln0
        self.ecoln1 = ecoln1
        _, y = self._getxy()
        # embedding extent, reused as axis limits by the plotting methods
        self._e0min = y[:, 0].min()
        self._e0max = y[:, 0].max()
        self._e1min = y[:, 1].min()
        self._e1max = y[:, 1].max()

        # assign params or defaults
        self.classes_list = classes_list or list(
            set(clusts.loc[:, pcoln].tolist()))
        self.fitkwargs = fitkwargs or {
            'random_state': 0,
            'min_samples_leaf': 250,  # best: 250
            'max_depth': 7,  # best: 7
        }

        # init attribs
        self.savedir = savedir
        self.featnames = np.array(feats.columns)
        self.model = None
        self.tree = None
        self.n_nodes = None
        self.pred_y_leafs = None
        self.leafs = None
        self.nodes = None
        self.node_leafs = {}
        self.node_tally = {}

    def _getxy(self):
        """Return (features, two-column embedding) as numpy arrays."""
        X = self.feats.values
        y = self.clusts.loc[:, [self.ecoln0, self.ecoln1]].values
        return X, y

    def fit_model(self):
        """Fit a DTALE model."""
        # fit regressor to predict embeddings from NuCLS model
        self.model = DecisionTreeRegressor(**self.fitkwargs)
        X, y = self._getxy()
        self.model.fit(X, y)
        self.tree = self.model.tree_

        # save model for reproducibility
        with open(opj(self.savedir, 'dectree.pkl'), 'wb') as f:
            pickle.dump(self.model, f)

    def apply_model(self):
        """Record the node/leaf sets and the leaf of every training row."""
        self.n_nodes = self.tree.node_count
        # a node whose left child is -1 is a leaf
        self.leafs = np.argwhere(self.tree.children_left == -1)[:, 0].tolist()
        self.nodes = set(range(self.n_nodes)).difference(self.leafs)

        # Apply to training data
        X, _ = self._getxy()
        self.pred_y_leafs = self.model.apply(X)

    def _find_leaves_in_subtree(self, root, subtrees):
        """find all the leaves enclosed within a subtree.

        `subtrees` maps node -> list of leaves below it and serves as the
        memo. It is updated in place and returned. Every node visited gets
        its complete leaf list (the original stored empty lists for inner
        nodes reached during a top-down call).
        """
        def _collect(node):
            # dynamic programming
            if node in subtrees:
                return subtrees[node]
            left = self.tree.children_left[node]
            if left == -1:
                found = [node]
            else:
                right = self.tree.children_right[node]
                found = _collect(left) + _collect(right)
            subtrees[node] = found
            return found

        _collect(root)
        return subtrees

    def set_leafs_for_all_subtrees(self):
        """Get all subleafs enclosed within each node subtree."""
        # traverse from bottom up for dynamic programming speedup
        for nd in range(self.n_nodes - 1, -1, -1):
            self._find_leaves_in_subtree(root=nd, subtrees=self.node_leafs)

    def set_node_tally(self):
        """
        Get a tally of the number of points from each class (as determined
        by the NuCLS model final prediction) for each node.
        """
        self.node_tally = {
            leaf: Counter(self.clusts.loc[self.pred_y_leafs == leaf,
                                          self.pcoln].to_list())
            for leaf in self.leafs
        }
        for node, nlfs in self.node_leafs.items():
            if node in self.leafs:
                continue
            # Build a fresh Counter: the original aliased the first leaf's
            # Counter and `+=` then corrupted that leaf's own tally.
            self.node_tally[node] = sum(
                (self.node_tally[nlf] for nlf in nlfs), Counter())

    def _get_best_node_for_class(self, cls, metric):
        """
        For one class, find the cluster (node) which overlaps the most with
        the predictions from the NuCLS model.

        Returns (best_node, best_stats); best_node stays None when no node
        scores above the starting value (-1 for 'MCC', 0 otherwise).
        """
        best_node = None
        best_stats = {metric: -1. if metric == 'MCC' else 0.}
        # class membership does not depend on the node: compute it once
        incls = 0 + (self.clusts.loc[:, self.pcoln] == cls).values
        for node in self.nodes:
            innode = 0 + np.in1d(self.pred_y_leafs, self.node_leafs[node])
            stats = calc_stats_simple(
                TP=np.sum(innode + incls == 2),
                FP=np.sum(innode - incls == 1),
                TN=np.sum(innode + incls == 0),
                FN=np.sum(innode - incls == -1),
            )
            stats['MCC'] = matthews_corrcoef(y_true=incls, y_pred=innode)
            if stats[metric] > best_stats[metric]:
                best_node = node
                best_stats.update(stats)
        return best_node, best_stats

    def _get_best_node_for_each_class(self, metric='precision'):
        """
        For each class, find the cluster (node) which best fits/explains
        predictions from the NuCLS model

        IMPORTANT NOTE: The classes are INDEPENDENT of each other. So an
        early "tumor" node does NOT exclude the descendent "mitotic" node.
        This is EXPECTED and cannot be overcome because the nodes are not
        pure .. even the downstream "mitotic" node contains some tumor leafs,
        so excluding it would reduce recall of the "tumor" node. Best way is
        to think of these paths as being independent for different classes.
        """
        best_nodes = {}
        best_stats = {}
        for cls in self.classes_list:
            best_nodes[cls], best_stats[cls] = self._get_best_node_for_class(
                cls, metric=metric)
        return best_nodes, best_stats

    def _trace_from_node_to_root(self, node):
        """Walk from `node` up to the root.

        Returns (trace, direction): `trace` lists the nodes from `node` to
        the root; `direction[k]` (k >= 1) is -1 if the previous node is the
        left child of `trace[k]` and 1 if it is the right child.
        `direction[0]` is 0.
        """
        trace = [node]
        direction = [0]
        current_node = node
        while True:
            left = np.argwhere(self.tree.children_left == current_node)
            right = np.argwhere(self.tree.children_right == current_node)
            if len(left) > 0:
                current_node = left[0, 0]
                trace.append(current_node)
                direction.append(-1)
            elif len(right) > 0:
                current_node = right[0, 0]
                trace.append(current_node)
                direction.append(1)
            else:
                # no parent found: current_node is the root
                break
        return trace, direction

    def save_dectree_traces(self, best_nodes, best_stats, postfix=''):
        """Save decision tree traces for relevant classes."""
        node_trace = {}
        direction_trace = {}
        feat_trace = {}
        impurity_trace = {}
        nsize_trace = {}
        thresh_trace = {}
        nice_trace = {}

        for cls in self.classes_list:
            # track from node to root, then flip to read root -> node
            ntrace, dtrace = self._trace_from_node_to_root(best_nodes[cls])
            ntrace, dtrace = ntrace[::-1], dtrace[::-1]
            node_trace[cls], direction_trace[cls] = ntrace, dtrace

            # map nodes to feature names and thresholds
            feat_trace[cls] = self.featnames[
                self.tree.feature[ntrace]].tolist()
            impurity_trace[cls] = self.tree.impurity[ntrace].tolist()
            nsize_trace[cls] = self.tree.n_node_samples[ntrace].tolist()
            thresh_trace[cls] = self.tree.threshold[ntrace].tolist()

            # render into nice text
            descr = '\nDECISIONS:\n'
            descr += "--------------\n"
            for nix in range(len(ntrace) - 1):
                dhere = ' '.join([
                    feat_trace[cls][nix],
                    '<=' if dtrace[nix] == -1 else '>',
                    '%.1f' % thresh_trace[cls][nix]
                ])
                descr += dhere + '\n'
            descr += '\nSTATS:\n'
            descr += "--------------\n"
            descr += '\n'.join(
                [f'{st}: %.2f' % stv
                 for st, stv in best_stats[cls].items()]) + '\n'
            nice_trace[cls] = descr

        # parse into a dict and pickle
        with open(opj(self.savedir, f'dectree_traces{postfix}.pkl'), 'wb') as f:
            pickle.dump(
                {
                    'features': feat_trace,
                    'thresholds': thresh_trace,
                    'impurity': impurity_trace,
                    'nodes': node_trace,
                    'direction': direction_trace,
                    'node_n_samples': nsize_trace,
                    'nice': nice_trace,
                    # How well the "chosen" traces from our decision tree
                    # fit/explain the NuCLS model predictions. For example, a
                    # precision of 0.9 for 'tumor' means that 90% of the nuclei
                    # predicted as 'tumor' by our decision tree are also
                    # predicted as 'tumor' by the NuCLS model.
                    'fit_stats_to_NuCLS_model': best_stats,
                }, f)

        # save nice rendered text for relevant parts of tree
        with open(opj(self.savedir, f'dectree_nice{postfix}.txt'), 'w') as f:
            for cls in self.classes_list:
                f.write("***********************************\n"
                        f"{cls}\n"
                        "***********************************\n")
                f.write(nice_trace[cls] + '\n')

    def visualize_decision_tree_nodes(self, best_nodes, postfix=''):
        """Visualize the learned decision tree nodes."""
        # NOTE(review): the figure is left open after savefig; repeated calls
        # accumulate open figures.
        plt.figure(figsize=(7, 7))

        # scatter actual points from NuCLS model in background
        _, y = self._getxy()
        plt.scatter(y[:, 0], y[:, 1], c='beige', alpha=0.6, s=4,
                    edgecolors='none')

        # trace the learned decision tree: connect each inner node's value
        # to the values of its two children
        for node in range(self.tree.node_count):
            if self.tree.children_left[node] == -1:
                continue
            me = self.tree.value[node, :, 0]
            clt = self.tree.value[self.tree.children_left[node], :, 0]
            crt = self.tree.value[self.tree.children_right[node], :, 0]
            plt.plot(
                [clt[0], me[0], crt[0]],
                [clt[1], me[1], crt[1]],
                color='gray', marker='.', linestyle='-',
                linewidth=0.5, markersize=3, alpha=0.5,
            )

        # highlight root node
        me = self.tree.value[0, :, 0]
        plt.scatter([me[0]], [me[1]], color='k', s=30, alpha=1.,
                    edgecolors='k')

        # color best (class-representative) nodes by class
        for cls, node in best_nodes.items():
            me = self.tree.value[node, :, 0]

            # color the trace along the decision tree till best node
            trace, _ = self._trace_from_node_to_root(node)
            for ndi in range(len(trace) - 1):
                clt = self.tree.value[trace[ndi], :, 0]
                crt = self.tree.value[trace[ndi + 1], :, 0]
                plt.plot(
                    [clt[0], crt[0]],
                    [clt[1], crt[1]],
                    color='k', alpha=1., marker='o', markersize=2.5,
                    linestyle='-', linewidth=1.3,
                )

            # highlight actual chosen best node
            color = np.array(VisConfigs.CATEG_COLORS[cls])[None, :] / 255.
            plt.scatter([me[0]], [me[1]], color=color, s=150, alpha=1.,
                        edgecolors='none')

        plt.xlim(self._e0min, self._e0max)
        plt.ylim(self._e1min, self._e1max)
        plt.title(f'DTALE nodes ({postfix})', fontsize=14, fontweight='bold')
        plt.savefig(opj(self.savedir, f'dectree{postfix}.png'))

    def visualize_decision_tree_classes(self, best_nodes, classes_list=None,
                                        restrict_to_pcateg=False,
                                        exclude_leafs=None, savedir=None,
                                        postfix=''):
        """Visualize embeddings, colors by class predicted by decision tree.

        `exclude_leafs`, despite its name, is used as a list of row positions
        to leave out. Returns the row positions that were plotted.
        """
        classes_list = classes_list or self.classes_list
        savedir = savedir or self.savedir
        init_point_size = 10.
        point_size_ds = 1.
        alphas = [0.8, 0.5]

        _, y = self._getxy()

        plt.figure(figsize=(7, 7))
        point_size = init_point_size
        alphas = np.linspace(alphas[0], alphas[1], len(classes_list))

        # keep track of plotted indices to be able to exclude downstream
        # nodes when plotting upstream ones when relevant
        kept_idxs = []

        for clno, cls in enumerate(classes_list):

            # maybe restrict to leafs predicted as a particular class by NuCLS
            keep1 = None
            if restrict_to_pcateg:
                # use the configured label column (was hard-coded to
                # 'pred_categ', breaking any non-default `pcoln`)
                keep1 = (self.clusts.loc[:, self.pcoln] == cls).values

            # restrict to downstream leafs to node of interest
            keep2 = np.in1d(self.pred_y_leafs,
                            self.node_leafs[best_nodes[cls]])

            keep = keep2 if keep1 is None else keep1 & keep2

            # maybe exclude certain leafs
            if exclude_leafs is not None:
                keep[exclude_leafs] = False

            # keep track of kept idxes
            kept_idxs.extend(np.argwhere(keep)[:, 0].tolist())

            # now restrict to leaves of interest
            y_subset = y[keep, :]

            # plot
            plt.scatter(y_subset[:, 0], y_subset[:, 1],
                        c=np.array(VisConfigs.CATEG_COLORS[cls])[None, :] / 255.,
                        alpha=alphas[clno], s=point_size, edgecolors='none')
            point_size = point_size_ds * point_size

        plt.xlim(self._e0min, self._e0max)
        plt.ylim(self._e1min, self._e1max)
        plt.title(f'DTALE decisions ({postfix})', fontsize=14,
                  fontweight='bold')
        plt.savefig(opj(savedir, f'dectreeCol{postfix}.png'))

        return kept_idxs

    def save_and_plot_optimized_decision_paths(self):
        """
        Use different metrics to emphasize different things learned:
         - F1 score: typical case (most tumor nuclei in the dataset) VERSUS ...
         - precision: most discriminative case (textbook examples).
        Using F-1 helps us find nodes in our decision tree that correlate to
        the process used by the NuCLS model when making its "average"
        decision, whereas using the precision score allows us to understand
        when does the model decide that it's "sure" something is, say, a
        tumor nucleus.
        """
        for metric in ['F1', 'precision']:

            print(f' Optimized for {metric}')

            # for each class, find the cluster (node) which best fits/explains
            # predictions from the NuCLS model (determined by metric of choice)
            best_nodes, best_stats = \
                self._get_best_node_for_each_class(metric=metric)

            # save decision tree traces for relevant classes
            kwargs = {
                'best_nodes': best_nodes,
                'postfix': f'_OptimizedFor{metric}',
            }
            self.save_dectree_traces(best_stats=best_stats, **kwargs)

            # visualize tree
            self.visualize_decision_tree_nodes(**kwargs)

            # color points associated with the best node for each class
            _ = self.visualize_decision_tree_classes(**kwargs)

    def plot_step_by_step_paths(self):
        """Plot, per class, one node of the precision trace at a time."""
        # read precision traces (written by save_dectree_traces)
        with open(
                opj(self.savedir, 'dectree_traces_OptimizedForprecision.pkl'),
                'rb') as f:
            traces = pickle.load(f)

        # for each class, plot one node at a time, excluding downstream nodes
        for cls in self.classes_list:
            savedir = opj(self.savedir, cls)
            maybe_mkdir(savedir)
            classes_list = [cls]
            exclude_idxs = []
            for nix, node in enumerate(traces['nodes'][cls][::-1]):
                excl = self.visualize_decision_tree_classes(
                    best_nodes={cls: node}, classes_list=classes_list,
                    restrict_to_pcateg=True, exclude_leafs=exclude_idxs,
                    savedir=savedir, postfix=f'_{cls}_nodeidx-{nix}({node})',
                )
                exclude_idxs.extend(excl)

    def run_sequence(self):
        """Main workflow."""
        print('DTALE: Fitting model ...')
        self.fit_model()
        self.apply_model()

        print('DTALE: Parsing tree ...')
        self.set_leafs_for_all_subtrees()
        # self.set_node_tally()

        print('DTALE: Saving and plotting optimized decision paths ...')
        self.save_and_plot_optimized_decision_paths()
        self.plot_step_by_step_paths()
class CARTMethod(Method):
    """Synthesis method that fits a CART and samples observed targets per leaf.

    A classification tree is used when ``dtype`` is in ``CAT_COLS_DTYPES``
    and a regression tree when it is in ``NUM_COLS_DTYPES``. Predictions are
    not the tree's own outputs: every test row is routed to a leaf and its
    value is drawn at random (with replacement) from the training targets
    that landed in that same leaf.
    """

    def __init__(self, dtype, smoothing=False, proper=False, minibucket=5,
                 random_state=None, *args, **kwargs):
        """Store the configuration and build the underlying tree.

        Parameters
        ----------
        dtype : object
            Data type of the target column; selects classifier vs regressor.
        smoothing : bool
            Whether numeric predictions are passed through ``smooth``.
        proper : bool
            Whether ``fit`` first passes the data through ``proper``.
        minibucket : int
            Forwarded to the tree as ``min_samples_leaf``.
        random_state : object
            Forwarded to the tree and to ``proper``.
        *args, **kwargs
            Accepted and ignored.
        """
        self.dtype = dtype
        self.smoothing = smoothing
        self.proper = proper
        self.minibucket = minibucket
        self.random_state = random_state

        tree_options = {
            'min_samples_leaf': self.minibucket,
            'random_state': self.random_state,
        }
        # two independent checks, exactly as before: a dtype found in
        # neither collection leaves ``self.cart`` unset
        if self.dtype in CAT_COLS_DTYPES:
            self.cart = DecisionTreeClassifier(**tree_options)
        if self.dtype in NUM_COLS_DTYPES:
            self.cart = DecisionTreeRegressor(**tree_options)

    def fit(self, X_df, y_df):
        """Fit the tree and remember the training targets of every leaf."""
        if self.proper:
            # ``proper`` here is the module-level helper, not the flag
            X_df, y_df = proper(X_df=X_df, y_df=y_df,
                                random_state=self.random_state)

        X_df, y_df = self.prepare_dfs(X_df=X_df, y_df=y_df,
                                      normalise_num_cols=False,
                                      one_hot_cat_cols=True)

        if self.dtype in NUM_COLS_DTYPES:
            # kept for the optional smoothing step in ``predict``
            self.y_real_min, self.y_real_max = np.min(y_df), np.max(y_df)

        features = X_df.to_numpy()
        targets = y_df.to_numpy()
        self.cart.fit(features, targets)

        def last_column(group):
            # the last column of each group frame holds the targets
            return group.to_numpy()[:, -1]

        # save the y distribution wrt trained tree nodes
        leaf_ids = self.cart.apply(features)
        per_row = pd.DataFrame({'leaves': leaf_ids, 'y': targets})
        targets_by_leaf = per_row.groupby('leaves').apply(last_column)
        self.leaves_y_dict = targets_by_leaf.to_dict()

    def predict(self, X_test_df):
        """Draw, for each test row, a training target from the row's leaf."""
        X_test_df, _ = self.prepare_dfs(X_df=X_test_df,
                                        normalise_num_cols=False,
                                        one_hot_cat_cols=True,
                                        fit=False)

        # predict the leaves and for each leaf randomly sample from the
        # observed values
        features = X_test_df.to_numpy()
        leaf_ids = self.cart.apply(features)
        n_rows = len(leaf_ids)
        y_pred = np.zeros(n_rows, dtype=object)

        def last_column(group):
            # the last column of each group frame holds the row positions
            return group.to_numpy()[:, -1]

        positions = pd.DataFrame({
            'leaves_pred': leaf_ids,
            'index': range(n_rows),
        })
        rows_by_leaf = positions.groupby('leaves_pred').apply(
            last_column).to_dict()

        for leaf, rows in rows_by_leaf.items():
            draws = np.random.choice(self.leaves_y_dict[leaf],
                                     size=len(rows),
                                     replace=True)
            y_pred[rows] = draws

        if self.smoothing and self.dtype in NUM_COLS_DTYPES:
            y_pred = smooth(self.dtype, y_pred,
                            self.y_real_min, self.y_real_max)

        return y_pred
class DecisionTreeCounterfactual:
    """
    Counterfactual estimation using a decision tree.

    Given explanatory variables X, target variable y and treatment variable W,
    this class implements an individual counterfactual estimation model.
    We can break down the process in three steps:

    1 - model step) Fit a decision tree to X and y
    2 - comparison step) at each of the tree's leaves, compare W and y to
        determine the counterfactuals for the leaf
    3 - prediction step) assign new samples to a leaf, and predict
        counterfactuals

    Parameters
    ----------

    model : object, optional (default=None)
        Tree-based model which implements sklearn's API, particularly
        the .apply() method. Must be already configured.
        If None, model will be DecisionTreeRegressor(min_samples_leaf=100).

    min_sample_effect : int, optional (default=10)
        The minimum number of samples in a neighborhood to deem a
        counterfactual estimate valid, for a given W. If there's less
        treated/untreated elements than min_sample_effect, the
        counterfactual will be NaN.

    save_explanatory : bool, optional (default=False)
        Save explanatory variables for explaining predictions.
        May cause large memory overhead.

    random_state : int, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`.

    """

    # initializing
    def __init__(self,
                 model=None,
                 min_sample_effect=10,
                 save_explanatory=False,
                 random_state=None):

        # storing model
        # identity check: `== None` would defer to the model's own __eq__
        if model is None:
            self.model = DecisionTreeRegressor(min_samples_leaf=100)
        else:
            self.model = model

        # storing variables
        self.min_sample_effect = int(min_sample_effect)
        self.random_state = random_state
        self.save_explanatory = save_explanatory

    def _test_treatment_linear_discriminative_power(self, leaf_df):
        """
        Using data from elements on leaf, test if treatments are randomly
        assigned by using a linear model to predict it.

        Parameters
        ----------

        leaf_df : pd.DataFrame
            Training dataframe with features (X), treatment assignments (W)
            and target (y)

        Returns
        -------

        return : float
            Average AUC (if multiclass) of treatment assignment predictive
            model for leaf

        """

        # organizing and standardizing data for model
        W_leaf = leaf_df['W']
        X_leaf = leaf_df.drop(['W', 'y'], axis=1)
        X_leaf = StandardScaler().fit_transform(X_leaf)

        # fitting model
        lr = LogisticRegression(solver='lbfgs')
        lr.fit(X_leaf, W_leaf)

        # predicting (scored on the same data the model was fitted on)
        W_predicted = lr.predict_proba(X_leaf)

        # if we have a single treatment treat as binary
        # classification problem, if not do nothing and
        # roc_auc_score function will take care of it
        if W_predicted.shape[1] == 2:
            W_predicted = W_predicted[:, 1]

        # computing score (avg. AUC)
        score = roc_auc_score(W_leaf,
                              W_predicted,
                              multi_class='ovr',
                              average='weighted')

        return score

    def _compute_treatment_confounding(self, filtered_train_df):
        """
        Apply tests to determine if treatments are randomly assigned
        for all leaves

        Parameters
        ----------

        filtered_train_df : pd.DataFrame
            Subset of training dataframe for elements on leaves that effects
            are valid (given min_sample_effect parameter)

        Returns
        -------

        confounding_df: pd.DataFrame
            Dataframe with confounding scores for each leaf

        """

        # just apply _test_treatment_linear_discriminative_power
        # for all leaves
        confounding_df = (filtered_train_df.groupby('leaf').apply(
            self._test_treatment_linear_discriminative_power).to_frame(
                name='confounding_score'))

        # using multi index to work in final dataframe
        confounding_df.columns = pd.MultiIndex.from_tuples([
            ('confounding_score', '')
        ])

        return confounding_df

    def _compute_leaf_counterfactuals(self, filtered_train_df):
        """
        Compute counterfactuals for each valid leaf

        Parameters
        ----------

        filtered_train_df : pd.DataFrame
            Subset of training dataframe for elements on leaves that effects
            are valid (given min_sample_effect parameter)

        Returns
        -------

        leaf_counterfactual_df : pd.DataFrame
            Dataframe with expected outcomes for each treatment

        """

        # computing avg outcomes for each treatment
        leaf_counterfactual_df = (filtered_train_df.pivot_table(
            values='y', columns='W',
            index='leaf').reset_index().set_index('leaf'))

        # formatting column names
        leaf_counterfactual_df.columns = (pd.MultiIndex.from_product(
            [['avg_outcome'], leaf_counterfactual_df.columns],
            names=[None, 'W']))

        return leaf_counterfactual_df

    def _compute_feature_dispersion(self, train_df):
        """
        Computes feature dispersion between treatments in leaves, to help
        diagnosing if effects are valid

        Parameters
        ----------

        train_df : pd.DataFrame
            Training dataframe, as stored using the "save_explanatory=True"
            parameter

        Returns
        -------

        feat_dispersion : pd.DataFrame
            Average feature percentile per leaf and treatment, so that
            elements with different treatments can be compared in each leaf.

        """

        # computing rank (percentiles) for each feature
        # and pivot by treatment to show user
        feat_percentiles_pivot = (train_df.set_index(['leaf', 'W']).drop(
            ['y'], axis=1).rank(pct=True).pivot_table(index='leaf',
                                                      columns='W').dropna())

        # putting levels to same column to match final output #

        # add prefix to first level
        level_0 = ('percentile_' +
                   feat_percentiles_pivot.columns.get_level_values(0))

        # second level stays the same
        level_1 = (feat_percentiles_pivot.columns.get_level_values(1))

        # applying to df
        feat_percentiles_pivot.columns = (pd.MultiIndex.from_arrays(
            [level_0, level_1]))

        return feat_percentiles_pivot

    # fit model
    def fit(self, X, W, y, verbose=0):
        """
        Get counterfactual estimates given explanatory variables X,
        treatment variable W and target y.
        This method will fit a decision tree from X to y and store
        outcomes given distinct W values at each of its leaves

        Parameters
        ----------

        X : pd.DataFrame of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect. Its ``columns`` are stored.

        W : array-like, shape = [n_samples]
            Treatment variable. The model will try to estimate a
            counterfactual outcome for each unique value in this variable.
            Should not exceed 10 unique values.

        y: array-like, shape = [n_samples]
            Target variable.

        verbose : int, optional (default=0)
            Verbosity level. Currently unused.

        Returns
        -------

        self: object

        Raises
        ------

        ValueError
            If W has more than 10 unique values.

        """

        # checking if W has too many unique values
        if len(np.unique(W)) > 10:
            raise ValueError(
                'More than 10 unique values for W. Too many unique values will make the process very expensive.'
            )

        # fitting the model
        self.model.fit(X, y)

        # storing column names
        self.col_names = X.columns

        # leaf assignment of every training sample, computed once and
        # shared by the two tables below
        train_leaves = self.model.apply(X)

        # saving explanatory variables, if applicable
        if self.save_explanatory:
            self.train_df = X.assign(leaf=train_leaves, W=W, y=y)

        # initializing a df with counterfactuals for each leaf:
        # sum of y and number of samples per (leaf, W) pair
        leaf_stats = (pd.DataFrame({
            'leaf': train_leaves,
            'y': y,
            'W': W
        }).assign(count=1).groupby(['leaf', 'W']).sum())

        # making estimates based on small samples invalid
        invalid_estimate_mask = (leaf_stats['count'] <
                                 self.min_sample_effect)

        # correcting y by taking average; dividing first yields a float
        # column, so blanking the invalid cells never has to write NaN
        # into an integer column
        leaf_stats['y'] = (leaf_stats['y'] /
                           leaf_stats['count']).mask(invalid_estimate_mask)

        self.leaf_counterfactual_df = leaf_stats

        # return self
        return self

    # method for predicting counterfactuals
    def predict(self, X, verbose=0):
        """
        Predict counterfactual outcomes for X.
        This method runs new samples through the tree, and predicts
        counterfactuals given which leaf new samples ended up into

        Parameters
        ----------

        X : pd.DataFrame of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect. Its index identifies the rows
            of the output.

        verbose : int, optional (default=0)
            Verbosity level. Currently unused.

        Returns
        -------

        counterfactual_df : pd.DataFrame
            Counterfactual outcomes per sample, one ('y_hat', W) column
            per treatment value.

        """

        # getting decision tree cluster assignments
        leaves_score = pd.DataFrame({
            'leaf': self.model.apply(X),
            'id': X.index
        })

        # to get counterfactual df we just need to join leaves_score
        # with leaf_counterfactual_df ('leaf' is the only shared column)
        counterfactual_df = (leaves_score.merge(
            self.leaf_counterfactual_df.reset_index(),
            how='left').pivot(values='y', columns='W', index='id'))

        # correcting columns
        counterfactual_df.columns = (pd.MultiIndex.from_product(
            [
                ['y_hat'],
                counterfactual_df.columns,
            ], names=[None, 'W']))

        # returning counterfactual df
        return counterfactual_df

    # running CV for model parameters
    def get_cross_val_scores(self, X, y, scoring=None, verbose=0):
        """
        Estimate model generalization power with 5-fold CV.

        Parameters
        ----------

        X : array-like or sparse matrix of shape = [n_samples, n_features]
            Data with explanatory variables, with possible confounders of
            treatment assignment and effect.

        y: array-like, shape = [n_samples]
            Target variable.

        scoring : string, callable or None, optional, default: None
            Scoring method for sklearn's cross_val_score function:
            A string (see model evaluation documentation) or
            a scorer callable object / function with signature
            ``scorer(estimator, X, y)`` which should return only
            a single value.
            Similar to :func:`cross_validate`
            but only a single metric is permitted.
            If None, the estimator's default scorer (if available) is used.

        verbose : int, optional (default=0)
            Verbosity level for sklearn's function cross_val_score.

        Returns
        -------

        scores : array of float, shape=(len(list(cv)),)
            Array of scores of the estimator for each run of the
            cross validation.

        """

        # CV method
        kf = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

        # generating validation predictions
        scores = cross_val_score(self.model,
                                 X,
                                 y,
                                 cv=kf,
                                 scoring=scoring,
                                 verbose=verbose)

        # calculating result
        return scores

    def run_leaf_diagnostics(self):
        """
        Run leaf diagnostics, showing confounding score, feature
        distributions and counterfactuals for each leaf.

        Returns
        -------

        leaf_diagnostics_df : pd.DataFrame
            Dataframe with leaf diagnostics

        Raises
        ------

        ValueError
            If no training samples were stored, i.e. the model was not
            fitted with save_explanatory=True.

        """

        # diagnostics are computed from the stored training samples; fail
        # with an explicit message instead of a bare AttributeError
        if getattr(self, 'train_df', None) is None:
            raise ValueError(
                'Model has no stored training samples to run diagnostics on. Fitting with save_explanatory=True will solve the issue'
            )

        # first, we calculate only where effects are valid #

        # effect is invalid on leaves marked with nan
        # or leaves that only have one kind of assignment
        mask_nan = self.leaf_counterfactual_df['y'].isnull()
        mask_single_assignment = self.leaf_counterfactual_df.groupby(
            'leaf').size() == 1

        # joining masks and getting invalid leaves
        # (the per-leaf mask is aligned to the (leaf, W) index on 'leaf')
        mask_invalid_effect = mask_nan | mask_single_assignment
        invalid_leaves = self.leaf_counterfactual_df.loc[
            mask_invalid_effect].index.get_level_values('leaf').values

        # filtering train df out of invalid leaves
        mask_invalid_leaves = self.train_df['leaf'].isin(invalid_leaves)
        filtered_train_df = self.train_df.loc[~mask_invalid_leaves]

        # then, we calculate quantities like #
        # confounding, dispersion and counterfactuals #
        # for each leaf, so we can perform criticism #

        # computing discriminative power
        confounding_df = self._compute_treatment_confounding(filtered_train_df)

        # computing leaf effects
        leaf_counterfactual_df = self._compute_leaf_counterfactuals(
            filtered_train_df)

        # computing feature dispersion (on the unfiltered data; the inner
        # join below drops the leaves missing from the other tables)
        feat_percentiles_df = self._compute_feature_dispersion(self.train_df)

        # leaf diagnostics df
        dfs = [leaf_counterfactual_df, feat_percentiles_df, confounding_df]
        leaf_diagnostics_df = pd.concat(dfs,
                                        axis=1,
                                        join='inner',
                                        levels=[0, 1])

        return leaf_diagnostics_df

    # method for explaining predictions
    def explain(self, sample):
        """
        Explain predictions of counterfactual outcomes for one sample.
        This method shows diagnostics and comparables so you can trust
        and explain counterfactual predictions to others

        Parameters
        ----------

        sample : pd.DataFrame of shape = [1, n_features]
            Sample that you want to get explanations for. If several rows
            are given, the comparables of all their leaves are returned.

        Returns
        -------

        comparables_table : pd.DataFrame
            Table of comparable elements: the stored training samples that
            fall on the same leaf as the sample.

        Raises
        ------

        ValueError
            If the model was configured with save_explanatory=False.

        """

        # nothing to explain from if training samples were not stored
        if not self.save_explanatory:
            raise ValueError(
                'Model did not store training samples to get explanations from. Setting save_explanatory=True will solve the issue'
            )

        # checking which leaf sample is assigned to
        sample_leaves = np.unique(np.ravel(self.model.apply(sample)))

        # querying comparables with a membership mask, which does not
        # depend on how the array of leaf ids prints
        in_sample_leaf = self.train_df['leaf'].isin(sample_leaves)
        comparables_table = (self.train_df.loc[in_sample_leaf].drop('leaf',
                                                                    axis=1))

        # returning comparables table
        return comparables_table