def test_regress_forest():
    """Compare node-mean predictions with scikit-learn tree/forest predictions.

    First a single ``DecisionTreeRegressor`` is fitted and its predictions on
    the held-out split are compared with the per-node means returned by
    ``get_node_means``. Then the same comparison is made for every tree of a
    random forest and for the averaged forest prediction. All differences are
    printed as L2 norms; nothing is asserted or returned.
    """
    n_trees = 4
    boston = load_boston()
    X = boston.data
    y = boston.target

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)

    print('Single regression tree test : ')
    estimator = DecisionTreeRegressor()
    estimator.fit(X_train, y_train)
    y_pred_dt = estimator.predict(X_test)

    node_indicator = estimator.decision_path(X_train)
    mean_vals, _ = get_node_means(node_indicator, y_train)

    test_leaves_id = estimator.apply(X_test)
    y_pred_mine_dt = mean_vals[test_leaves_id]
    diff = np.linalg.norm(y_pred_dt - y_pred_mine_dt)
    print('Tree predictions diff :' + repr(diff))

    print('Regression Forest Test : ')
    forest = get_models('RandomForest', 'regress')
    forest.set_params(n_estimators=n_trees)
    forest.fit(X_train, y_train)
    y_pred_all = np.zeros(shape=(len(y_test)))
    n_samples = X_train.shape[0]
    indicator, n_nodes_ptr = forest.decision_path(X_train)
    for t, estimator in enumerate(forest):
        # Row indices (with repetition) of the bootstrap sample of tree t.
        t_idx = _generate_sample_indices(estimator.random_state, n_samples)
        y_tree_predict = estimator.predict(X_test)
        print('Num nodes = ' + repr(estimator.tree_.node_count))
        # Columns of tree t inside the forest-wide indicator matrix.
        tree_indicator = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
        # The indicator rows follow X_train order while the targets below are
        # taken in bootstrap order, so the rows must be selected with the
        # same indices; otherwise each target is paired with the decision
        # path of a different sample.
        # NOTE(review): assumes get_node_means pairs row i of the indicator
        # with element i of the targets -- confirm against its definition.
        node_indicator = tree_indicator[t_idx]
        mean_vals, _ = get_node_means(node_indicator, y_train[t_idx])
        leaves_id = estimator.apply(X_test)
        y_tree_mine = mean_vals[leaves_id]
        diff = np.linalg.norm(y_tree_predict - y_tree_mine)
        print('Tree#' + repr(t) + ': Diff = ' + repr(diff))
        y_pred_all += y_tree_mine
    y_pred_rf = forest.predict(X_test)
    y_pred_mine_rf = y_pred_all / n_trees
    diff = np.linalg.norm(y_pred_rf - y_pred_mine_rf)
    print('Forest predictions difference :' + repr(diff))
    # Only report the known-bug banner when a mismatch is actually observed.
    if diff > 1e-8:
        print('#BUG#-->Trees in the forest dont match my tree predictions')
    return
Exemple #2
0
    def fit(self, X, rels, qids):
        """Fit the boosted ensemble, one regression tree per round.

        Every round computes per-query ``Lambda``/``Omega`` values through
        ``self._calc_results``, fits a ``DecisionTreeRegressor`` on the
        lambdas, stores one step per leaf in ``self.gamma[m, leaf]``, adds
        ``self.lr * gamma`` to the running scores and finally calls
        ``self.evaluate``.

        :param X: feature matrix, one row per document
        :param rels: relevance labels, indexable with a boolean mask
        :param qids: query id of every row of ``X``; the rows of one query do
            not have to be contiguous or sorted
        """
        n_rows = np.shape(X)[0]
        F = np.zeros(n_rows)  # running model scores, starting from zero
        unique_qids = np.unique(qids)
        for m in range(self.num_trees):
            print(f'building {m}-th tree...')
            # Results are written back through each query's own row mask so
            # they stay aligned with the rows of X. Concatenating them in
            # sorted-query order only lines up when qids are already sorted
            # and contiguous.
            # NOTE(review): assumes _calc_results returns one value per
            # document, in the order of the rels_q it was given.
            Lambda = np.zeros(n_rows)
            Omega = np.zeros(n_rows)
            for q in unique_qids:
                in_query = np.asarray(q == qids)
                Lambda_q, Omega_q = self._calc_results(
                    q, rels[in_query], F[in_query])
                Lambda[in_query] = Lambda_q
                Omega[in_query] = Omega_q

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, Lambda)
            self.trees.append(tree)
            leaves = tree.apply(X)  # leaf id that every row of X falls into
            for leaf in np.unique(leaves):
                in_leaf = (leaves == leaf)
                # scalar step for this leaf; eps guards a zero denominator
                gamma = np.sum(Lambda[in_leaf]) / (np.sum(Omega[in_leaf]) +
                                                   self.eps)
                self.gamma[m, leaf] = gamma
                # update the scores of the documents in this leaf only
                F += self.lr * in_leaf * gamma
            # evaluate the current model on the training data
            self.evaluate(X, rels, qids)
def decision_tree_regressor(X, y, labels):
    """Fit a depth-3 regression tree, print diagnostics and return its fit.

    A per-node counter is printed together with the tree's split features,
    thresholds and feature importances, and the tree is passed to
    ``visualize_tree``.

    :param X: training features
    :param y: training targets, iterated in the same order as the rows of X
    :param labels: forwarded unchanged to ``visualize_tree``
    :returns: predictions of the fitted tree on ``X``
    """
    regressor = DecisionTreeRegressor(max_depth=3)
    regressor.fit(X, y)

    estimates_z = regressor.predict(X)
    leaves = regressor.apply(X)

    # One counter per node id plus a dedicated trailing slot for samples that
    # fail the over-estimation test. Without the extra slot, index -1 is the
    # same element as the largest leaf id and the two counts get mixed.
    leaves_hash = np.zeros(np.max(leaves) + 2)
    for estimate, target, leaf in zip(estimates_z, y, leaves):
        if (estimate - target) > 0.05 and estimate > 0.6 and target > 0:
            leaves_hash[leaf] += 1
        else:
            leaves_hash[-1] += 1

    # Function-call form prints the same text on Python 2 and Python 3.
    print(regressor.tree_.feature)
    print(regressor.tree_.threshold)
    print(leaves_hash)
    print(regressor.feature_importances_)

    visualize_tree(regressor.tree_, labels)
    return estimates_z
Exemple #4
0
 def __clustering(self, X, y=None):
     """
     The clustering procedure of the Optimal Weighted Clustering Gaussian
     Process. This function should not be called externally.

     The branch taken depends on ``self.cluster_method``. Every branch sets
     ``self.clusterer``; the branches differ in which of
     ``self.cluster_label`` (hard labels) and ``self.cluster_labels_proba``
     (soft memberships) they fill in. The 'tree' and 'flame' branches also
     overwrite ``self.n_cluster`` with the number of clusters found.

     :param X: sample matrix, one row per sample
     :param y: targets, only read by the 'tree' method
     """

     if self.cluster_method == 'k-mean':
         clusterer = KMeans(n_clusters=self.n_cluster)
         clusterer.fit(X)
         self.cluster_label = clusterer.labels_
         self.clusterer = clusterer
     elif self.cluster_method == 'tree':
         print("Warning: specified clustering count might be overwritten")
         # leaf size chosen so that roughly n_cluster leaves can be formed
         minsamples = int(len(X)/(self.n_cluster+1))
         tree = DecisionTreeRegressor(random_state=0,min_samples_leaf=minsamples)
         tree.fit(X,y)
         labels = tree.apply(X)
         clusters = np.unique(labels)
         k = len(clusters)
         print("leafs:",k)
         self.n_cluster = k
         self.leaf_labels = clusters
         self.cluster_label = labels
         self.clusterer = tree
     elif self.cluster_method == 'random':
         r = self.n_sample % self.n_cluster
         # Floor division: the repeat count of a list must be an int, and
         # true division yields a float on Python 3.
         m = (self.n_sample - r) // self.n_cluster
         self.cluster_label = array(list(range(self.n_cluster)) * m + list(range(r)))
         self.clusterer = None
         shuffle(self.cluster_label)
     elif self.cluster_method == 'GMM':    #GMM from sklearn
         self.clusterer = GMM(n_components=self.n_cluster, n_iter=1000)
         self.clusterer.fit(X)
         self.cluster_labels_proba = self.clusterer.predict_proba(X)
         self.cluster_label = self.clusterer.predict(X)
     elif self.cluster_method == 'fuzzy-c-mean': #Fuzzy C-means via fuzz.cluster.cmeans
         cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X.T, self.n_cluster, 2, error=0.000005, maxiter=10000, init=None)
         self.clusterer = cntr #save the centers for cmeans_predict
         self.cluster_labels_proba = np.array(u.T)
         self.cluster_label = np.array(np.argmax(u, axis=0))
     elif self.cluster_method == 'flame':  #Flame clustering, files are attached
         print("Warning: specified clustering count will be overwritten with Flame")
         tempdata  = X.astype(np.float32)
         N = len(tempdata)
         # A single Flame object is created; a second Flame_New() call would
         # only orphan the first one.
         flameobject = flame.Flame_New()
         flame.Flame_SetDataMatrix( flameobject, tempdata,  0 )
         flame.Flame_DefineSupports( flameobject, self.flame_knn, self.flame_threshold ) #knn is number of neighbours
         cso_count = flameobject.cso_count
         k = cso_count+1 #!!! overwrite k here
         self.n_cluster = k
         print("clusters:",k)
         flame.Flame_LocalApproximation( flameobject, 500, 1e-6 )
         self.cluster_labels_proba = flame.Print_Clusters(flameobject, (cso_count+1)*N )
         flame.Flame_Clear(flameobject)
         self.cluster_labels_proba = self.cluster_labels_proba.reshape(( N,cso_count+1 ))
         self.clusterer = None #we need to assign something
Exemple #5
0
    def time_zone(self, var, correct):
        """Automatic detection of which files might have had their time zone incorrectly set. Not guaranteed to work, and certainly not with only a few input files. Based on computing the phase of the daily cycle by projection onto a single daily complex exponential. The correction amount is fixed for now to the 5 hour difference between LA and Greenland.

        :param var: the input :class:`~pandas.DataFrame` of the type constructed for the :attr:`.var` attribute
        :param correct: either ``True`` or ``False`` to indicate whether correction should be attempted, or a list of integer column indexes corresponding to the columns in :attr:`.var` which should be corrected
        :type correct: :obj:`bool` or :obj:`list`
        :returns: a corrected DataFrame with added :class:`~pandas.MultiIndex` level containing the time correction in hours
        :rtype: :class:`~pandas.DataFrame`

        """
        names = ['time_adj'] + list(var.columns.names)
        if correct is False:
            # ``axis`` is passed by keyword: pandas >= 2 rejects it as a
            # positional argument.
            return pd.concat((var, ), axis=1, keys=[0], names=names)

        if isinstance(correct, list):
            # ``correct`` holds positional *column* indexes, so it belongs in
            # the second slot of ``iloc``.
            a = var.iloc[:, correct]
            b = var.drop(a.columns, axis=1)
        else:
            # Split the columns into two groups with a two-leaf regression
            # tree fitted on the per-column phase; leaf ids are 1 and 2.
            phase = var.apply(self.phase, 0)
            i = np.arange(var.shape[1]).reshape((-1, 1))
            tr = DecisionTreeRegressor(max_leaf_nodes=2).fit(i, phase)
            cl = tr.apply(i)
            a = var.iloc[:, cl == 1]
            b = var.iloc[:, cl == 2]

        # Shift the timestamps of the columns selected for correction.
        a.index = a.index + pd.Timedelta(5, 'h')
        print(
            "\nThe following files' timestamps have been changed by 5 hours:\n"
        )
        for f in a.columns.get_level_values('file'):
            print(f)
        return pd.concat((a, b), axis=1, keys=[5, 0], names=names)
Exemple #6
0
def get_linjie_matrix(x_train,y_train):
    """Build one weighted co-leaf (adjacency) matrix per target column.

    For every column of ``y_train`` a sequence of regression trees is fitted,
    each on the residual left by the previous one. Every tree contributes a
    0/1 matrix whose entry (i, j) is 1 when training samples i and j fall
    into the same leaf. The matrices are combined with weights
    ``(total - w_k) / total``, where ``w_k`` is the absolute residual sum
    left after tree k and ``total`` is the sum of all ``w_k``.

    :param x_train: training features, one row per sample
    :param y_train: 2-D target array; columns are processed independently
    :returns: ``(all_all_num, cos_result)``: the list of combined matrices
        (one per target column) and the value of ``target_cos(y_train)``
    """
    settings = config.all_config[text_file]
    num_epch = settings.get('adjacency_num')
    cart_depth = settings.get('adjacency_cart_depth')

    all_all_num = []
    for rr in range(y_train.shape[1]):
        delta_loss = y_train[:,rr].copy()
        all_result = []
        all_weight = []
        for qq in range(num_epch):
            estimator = DecisionTreeRegressor(max_depth=cart_depth).fit(x_train,delta_loss)
            leave_id = estimator.apply(x_train)
            # Broadcasted comparison replaces the Python double loop:
            # entry (i, j) is 1 exactly when samples i and j share a leaf.
            result = (leave_id[:, None] == leave_id[None, :]).astype(int)

            # the next tree is fitted on what this one failed to explain
            delta_loss = delta_loss - estimator.predict(x_train)
            all_weight.append(sum(abs(delta_loss)))
            all_result.append(result)

        total_weight = np.sum(all_weight)
        all_num = (total_weight-all_weight[0])/total_weight*all_result[0]
        for qq in range(1,num_epch):
            all_num += (total_weight-all_weight[qq])/total_weight*all_result[qq]
        all_all_num.append(all_num)
    cos_result = target_cos(y_train)
    return all_all_num,cos_result
Exemple #7
0
def train(joint_id, X, y, model_dir, min_samples_leaf=400, load_models=args.load_model):
    """Trains a regressor tree on the unit directions towards the joint.

    @params:
        joint_id : current joint id
        X : samples feature array (N x num_samples x num_feats)
        y : samples unit direction vectors (N x num_samples x 3)
        model_dir : directory the pickled regressor and L are read from / written to
        min_samples_leaf : minimum number of samples required at a leaf node
        load_models : load trained models from disk (if exist)
    @returns:
        (regressor, L) : the fitted tree and the result of ``stochastic``
    """
    logger.debug('Start training %s model...', JOINT_NAMES[joint_id])

    regressor_path = os.path.join(model_dir, 'regressor' + str(joint_id) + '.pkl')
    L_path = os.path.join(model_dir, 'L' + str(joint_id) + '.pkl')

    # Load saved model from disk
    if load_models and (os.path.isfile(regressor_path) and os.path.isfile(L_path)):
        logger.debug('Loading model %s from files...', JOINT_NAMES[joint_id])

        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        with open(regressor_path, 'rb') as regressor_file:
            regressor = pickle.load(regressor_file)
        with open(L_path, 'rb') as L_file:
            L = pickle.load(L_file)
        return regressor, L

    X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2]) # (N x num_samples, num_feats)
    y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2]) # (N x num_samples, 3)

    # Count the number of valid (non-zero) samples
    valid_rows = np.logical_not(np.all(X_reshape == 0, axis=1)) # inverse of invalid samples
    logger.debug('Model %s - Valid samples: %d / %d', JOINT_NAMES[joint_id], X_reshape[valid_rows].shape[0], X_reshape.shape[0])

    # Fit decision tree to samples
    regressor = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
    regressor.fit(X_reshape[valid_rows], y_reshape[valid_rows])

    L = stochastic(regressor, X_reshape, y_reshape)

    # Print statistics on leafs
    leaf_ids = regressor.apply(X_reshape)
    leaf_sizes = np.bincount(leaf_ids)
    unique_ids = np.unique(leaf_ids)  # node ids that received at least one sample
    biggest = np.argmax(leaf_sizes)
    # Search only the occupied ids and map the position back to a real leaf
    # id; an argmin over the filtered counts alone is not a node id.
    smallest = unique_ids[np.argmin(leaf_sizes[unique_ids])]

    logger.debug('Model %s - # Leaves: %d', JOINT_NAMES[joint_id], unique_ids.shape[0])
    logger.debug('Model %s - Smallest Leaf ID: %d, # Samples: %d/%d', JOINT_NAMES[joint_id], smallest, leaf_sizes[smallest], np.sum(leaf_sizes))
    logger.debug('Model %s - Biggest Leaf ID: %d, # Samples: %d/%d', JOINT_NAMES[joint_id], biggest, leaf_sizes[biggest], np.sum(leaf_sizes))
    logger.debug('Model %s - Average Leaf Size: %d', JOINT_NAMES[joint_id], np.sum(leaf_sizes) / unique_ids.shape[0])

    # Save models to disk
    with open(regressor_path, 'wb') as regressor_file:
        pickle.dump(regressor, regressor_file)
    with open(L_path, 'wb') as L_file:
        pickle.dump(L, L_file)

    return regressor, L
Exemple #8
0
    def honestTree(self, treePredTrain, treeRespTrain, predTest):
        """Predict ``predTest`` with a tree grown on one random half of the
        training rows and leaf values estimated on the other half.

        A random half of the rows fits the tree structure. The remaining rows
        are dropped into the fitted tree and the mean response per reached
        leaf becomes that leaf's prediction. Leaves reached by the fitting
        half but by none of the remaining rows get the value 0.5.

        :param treePredTrain: training predictors, one row per sample
        :param treeRespTrain: training responses
        :param predTest: predictors to produce predictions for
        :returns: one predicted value per row of ``predTest``
        """
        n_train = treePredTrain.shape[0]
        grow_rows = random.sample(range(n_train), n_train // 2)

        # rows that shape the tree vs. rows that only estimate leaf values
        grow_X = treePredTrain[grow_rows, ...]
        grow_y = treeRespTrain[grow_rows]
        est_X = np.delete(treePredTrain, grow_rows, 0)
        est_y = np.delete(treeRespTrain, grow_rows, 0)

        honest_tree = DecisionTreeRegressor(min_samples_split=2,
                                            min_samples_leaf=1,
                                            min_impurity_decrease=0.0001,
                                            random_state=self.randomState)
        honest_tree.fit(grow_X, grow_y)

        test_leaves = honest_tree.apply(predTest)
        augmented = np.column_stack((predTest, test_leaves))

        # (leaf id, response) pairs of the estimation half
        est_leaves = honest_tree.apply(est_X)
        leaf_and_resp = np.column_stack((est_leaves, est_y))

        populated = np.unique(est_leaves)
        leaf_means = np.array([
            np.mean(leaf_and_resp[leaf_and_resp[..., 0] == leaf, 1])
            for leaf in populated
        ])
        table = np.column_stack((populated, leaf_means))

        # leaves the estimation half never reached fall back to 0.5
        grown = np.unique(honest_tree.apply(grow_X))
        if grown.size != populated.size:
            unreached = np.setdiff1d(grown, populated)
            filler = np.column_stack(
                (unreached, np.array([0.5] * unreached.size)))
            table = np.vstack((table, filler))

        # row of the lookup table for every test sample's leaf
        rows = np.array(
            [np.where(table[..., 0] == leaf)[0][0] for leaf in test_leaves])
        augmented = np.column_stack((augmented, table[..., 1][rows]))
        return augmented[..., -1]
def test_leaf_node_kernel_matches_decision_tree():
    """A normalised leaf-node kernel applied to the training targets must
    reproduce the predictions of the single regression tree it came from."""
    dataset = load_boston()
    features = dataset.data
    targets = dataset.target

    regressor = DecisionTreeRegressor(max_depth=3, random_state=123)
    regressor = regressor.fit(features, targets)
    leaf_ids = regressor.apply(features).reshape(-1, 1)

    # kernel-weighted prediction
    kernel = leaf_node_kernel(leaf_ids)
    kernel /= kernel.sum(axis=1)
    kernel_pred = np.dot(kernel, targets)

    # reference prediction straight from the tree
    tree_pred = regressor.predict(features)
    np.testing.assert_allclose(kernel_pred, tree_pred)
Exemple #10
0
def trainModel(X, y, jointID, modelsDir, outDir, loadModels=False):
    """Train the regression tree for one joint, or load it from disk.

    :param X: 3-D feature array; the first two axes are flattened into rows
    :param y: 3-D target array, flattened the same way as ``X``
    :param jointID: index of the joint, used for log messages and file names
    :param modelsDir: sub-directory (appended to ``outDir``) holding the pickles
    :param outDir: output directory prefix
    :param loadModels: when true and both pickle files exist, load them
        instead of training
    :returns: ``(regressor, L)``: the tree and the result of ``stochastic``
    """
    regressor, L = None, None

    mkdir(outDir + modelsDir)

    regressorPath = outDir + modelsDir + '/regressor' + str(jointID) + '.pkl'
    LPath = outDir + modelsDir + '/L' + str(jointID) + '.pkl'

    if loadModels and os.path.isfile(regressorPath) and os.path.isfile(LPath):
        logger.debug('loading model %s from files...', jointName[jointID])
        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        with open(regressorPath, 'rb') as regressorFile:
            regressor = pickle.load(regressorFile)
        with open(LPath, 'rb') as LFile:
            L = pickle.load(LFile)
    else:
        logger.debug('start training model %s...', jointName[jointID])
        regressor = DecisionTreeRegressor(min_samples_leaf=minSamplesLeaf)

        X_reshape = X.reshape(X.shape[0] * X.shape[1], X.shape[2])
        y_reshape = y.reshape(y.shape[0] * y.shape[1], y.shape[2])

        # rows that are entirely zero are excluded from fitting
        rows = np.logical_not(np.all(X_reshape == 0, axis=1))
        regressor.fit(X_reshape[rows], y_reshape[rows])
        logger.debug('model %s - valid samples: %d/%d', jointName[jointID],
                     X_reshape[rows].shape[0], X_reshape.shape[0])

        leafIDs = regressor.apply(X_reshape)
        leafSizes = np.bincount(leafIDs)
        uniqueIDs = np.unique(leafIDs)  # node ids that received samples
        biggest = np.argmax(leafSizes)
        # Search only the occupied ids and map the position back to a real
        # leaf id; an argmin over the filtered counts alone is not a node id.
        smallest = uniqueIDs[np.argmin(leafSizes[uniqueIDs])]

        logger.debug('model %s - #leaves: %d', jointName[jointID],
                     uniqueIDs.shape[0])
        logger.debug('model %s - biggest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], biggest, leafSizes[biggest],
                     np.sum(leafSizes))
        logger.debug('model %s - smallest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], smallest, leafSizes[smallest],
                     np.sum(leafSizes))
        logger.debug('model %s - average leaf size: %d', jointName[jointID],
                     np.sum(leafSizes) / uniqueIDs.shape[0])

        L = stochastic(regressor, X_reshape, y_reshape)

        with open(regressorPath, 'wb') as regressorFile:
            pickle.dump(regressor, regressorFile)
        with open(LPath, 'wb') as LFile:
            pickle.dump(L, LFile)

    return (regressor, L)
Exemple #11
0
    def _get_fitted_model(self, X, y):
        """Fit a ``DecisionTreeRegressor`` configured from this object's
        hyper-parameters.

        The fitted tree is stored in ``self.model_`` and the leaf index of
        every training row in ``self.train_leaf_indices_``. Nothing is
        returned.
        """
        hyperparameter_names = (
            "criterion",
            "splitter",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "min_weight_fraction_leaf",
            "max_features",
            "max_leaf_nodes",
            "min_impurity_decrease",
            "random_state",
            "ccp_alpha",
        )
        # every tree option is mirrored by an attribute of the same name
        tree_options = {
            name: getattr(self, name) for name in hyperparameter_names
        }
        model = DecisionTreeRegressor(**tree_options)

        self.model_ = model.fit(X, y)
        self.train_leaf_indices_ = model.apply(X)
    def fit(self, X, y, verbose=False):
        """Fit the boosted ensemble for a two-class target.

        Starting from ``self.initial_guess(y)``, each of the
        ``self.n_estimators`` rounds fits a regression tree to the current
        residuals (observed minus predicted probability), derives one output
        value per leaf and refreshes the predictions. Trees and leaf outputs
        are appended to ``self.trees`` and ``self.tree_leaf_outputs``.

        :param X: training features
        :param y: class labels; ``self.target_classes[0]`` is encoded as 1.0
        :param verbose: print the loss before every round when true
        """
        log_odds, initial_probability = self.initial_guess(y)
        n_samples = len(y)
        self.initial_prediction = log_odds

        predictions = np.full((n_samples, ), log_odds)
        predicted_probabilities = np.full((n_samples, ), initial_probability)

        # 1.0 for the first target class, 0.0 for everything else
        positive_class = self.target_classes[0]
        observed_probabilities = np.array(
            [float(label == positive_class) for label in y])

        for round_idx in range(self.n_estimators):
            if verbose:
                log_likelihood = np.sum(
                    observed_probabilities * np.log(predicted_probabilities) +
                    (1 - observed_probabilities) *
                    np.log(1 - predicted_probabilities))
                print("Building tree " + str(round_idx + 1) + ", Loss: " +
                      str(-log_likelihood))

            # what the current model still gets wrong
            residuals = observed_probabilities - predicted_probabilities

            tree = DecisionTreeRegressor(max_leaf_nodes=self.max_leaf_nodes,
                                         max_features=self.max_features)

            # stochastic gradient boosting: optionally fit on a subsample
            if self.subsample < 1.0:
                train_set, target_set = self.sub_sample(
                    X, residuals, n_samples)
            else:
                train_set, target_set = X, residuals
            tree.fit(train_set, target_set)

            # leaves are looked up for the full training set
            leaf_indices = tree.apply(X)
            leaf_outputs = self.calculate_leaf_outputs(
                residuals, leaf_indices, predicted_probabilities)

            self.trees.append(tree)
            self.tree_leaf_outputs.append(leaf_outputs)

            # refresh predictions with the new tree's contribution
            predicted_probabilities, predictions = self.calculate_new_predicitions(
                predictions, leaf_indices, leaf_outputs)
Exemple #13
0
def trainModel(X, y, jointID, modelsDir, outDir, loadModels=False):
    """Train the regression tree for one joint, or load it from disk.

    :param X: 3-D feature array; the first two axes are flattened into rows
    :param y: 3-D target array, flattened the same way as ``X``
    :param jointID: index of the joint, used for log messages and file names
    :param modelsDir: sub-directory (appended to ``outDir``) holding the pickles
    :param outDir: output directory prefix
    :param loadModels: when true and both pickle files exist, load them
        instead of training
    :returns: ``(regressor, L)``: the tree and the result of ``stochastic``
    """
    regressor, L = None, None

    mkdir(outDir+modelsDir)

    regressorPath = outDir + modelsDir + '/regressor' + str(jointID) + '.pkl'
    LPath = outDir + modelsDir + '/L' + str(jointID) + '.pkl'

    if loadModels and os.path.isfile(regressorPath) and os.path.isfile(LPath):
        logger.debug('loading model %s from files...', jointName[jointID])
        # NOTE: unpickling runs arbitrary code; only load trusted model files.
        with open(regressorPath, 'rb') as regressorFile:
            regressor = pickle.load(regressorFile)
        with open(LPath, 'rb') as LFile:
            L = pickle.load(LFile)
    else:
        logger.debug('start training model %s...', jointName[jointID])
        regressor = DecisionTreeRegressor(min_samples_leaf=minSamplesLeaf)

        X_reshape = X.reshape(X.shape[0]*X.shape[1], X.shape[2])
        y_reshape = y.reshape(y.shape[0]*y.shape[1], y.shape[2])

        # rows that are entirely zero are excluded from fitting
        rows = np.logical_not(np.all(X_reshape == 0, axis=1))
        regressor.fit(X_reshape[rows], y_reshape[rows])
        logger.debug('model %s - valid samples: %d/%d', jointName[jointID],
                     X_reshape[rows].shape[0], X_reshape.shape[0])

        leafIDs = regressor.apply(X_reshape)
        leafSizes = np.bincount(leafIDs)
        uniqueIDs = np.unique(leafIDs)  # node ids that received samples
        biggest = np.argmax(leafSizes)
        # Search only the occupied ids and map the position back to a real
        # leaf id; an argmin over the filtered counts alone is not a node id.
        smallest = uniqueIDs[np.argmin(leafSizes[uniqueIDs])]

        logger.debug('model %s - #leaves: %d', jointName[jointID],
                     uniqueIDs.shape[0])
        logger.debug('model %s - biggest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], biggest, leafSizes[biggest],
                     np.sum(leafSizes))
        logger.debug('model %s - smallest leaf id: %d, #samples: %d/%d',
                     jointName[jointID], smallest, leafSizes[smallest],
                     np.sum(leafSizes))
        logger.debug('model %s - average leaf size: %d', jointName[jointID],
                     np.sum(leafSizes)/uniqueIDs.shape[0])

        L = stochastic(regressor, X_reshape, y_reshape)

        with open(regressorPath, 'wb') as regressorFile:
            pickle.dump(regressor, regressorFile)
        with open(LPath, 'wb') as LFile:
            pickle.dump(L, LFile)

    return (regressor, L)
class TreeRegressionTransformer(BaseTransformer):
    """Transformer that maps samples to the leaf of a fitted regression tree."""

    def __init__(self, kwargs=None):
        """
        Store the keyword arguments for the underlying tree.

        Parameters
        ----------
        kwargs : dict, optional
            Keyword arguments forwarded to ``DecisionTreeRegressor`` in
            ``fit``. ``None`` (the default) means no arguments.
        """

        # A fresh dict per instance: a ``{}`` default in the signature would
        # be one shared object mutated across all instances.
        self.kwargs = {} if kwargs is None else kwargs

        self._is_fitted = False

    def fit(self, X, y):
        """
        Fit a ``DecisionTreeRegressor`` on the validated data.

        Parameters
        ----------
        X : array-like
            Training samples, validated with ``check_X_y``.
        y : array-like
            Training targets, validated with ``check_X_y``.

        Returns
        -------
        self : TreeRegressionTransformer
            The fitted transformer.
        """

        X, y = check_X_y(X, y)

        # the fitted tree is the whole transformer
        self.transformer = DecisionTreeRegressor(**self.kwargs).fit(X, y)

        self._is_fitted = True

        return self

    def transform(self, X):
        """
        Return the result of the fitted tree's ``apply`` for ``X``.

        Parameters
        ----------
        X : array-like
            Samples to transform, validated with ``check_array``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """

        if not self.is_fitted():
            msg = (
                "This %(name)s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this transformer."
            )
            raise NotFittedError(msg % {"name": type(self).__name__})

        X = check_array(X)
        return self.transformer.apply(X)

    def is_fitted(self):
        """
        Return ``True`` once ``fit`` has completed, ``False`` before.
        """

        return self._is_fitted
def test_tree_regress():
    """Print how far node-mean predictions are from a single tree's own
    predictions on a held-out split."""
    dataset = load_boston()
    features, targets = dataset.data, dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.1, random_state=0)

    regressor = DecisionTreeRegressor()
    regressor.fit(X_train, y_train)

    # per-node means derived from the training decision paths
    train_paths = regressor.decision_path(X_train)
    node_means, _ = get_node_means(train_paths, y_train)

    # reference predictions vs. a look-up of the mean of each test leaf
    reference = regressor.predict(X_test)
    looked_up = node_means[regressor.apply(X_test)]

    gap = np.linalg.norm(reference - looked_up)
    print('Tree predictions diff :' + repr(gap))
    return
Exemple #16
0
 def fit(self, X, relevence, qid):
     """Fit the boosted ensemble, one regression tree per round.

     Every round computes per-query lambda/omega values through
     ``self._calculate_lambda``, fits a ``DecisionTreeRegressor`` on the
     lambdas, stores one step per leaf in ``self.gamma[k, leaf]`` and adds
     ``self.lr`` times that step to the running scores.

     :param X: feature matrix, one row per document
     :param relevence: relevance labels, indexable with a boolean mask
     :param qid: query id of every row of ``X``; the rows of one query do not
         have to be contiguous or sorted
     """
     n_rows = np.shape(X)[0]
     F = np.zeros(n_rows)
     eps = 0.000001  # guards a zero denominator in the leaf step
     unique_qids = np.unique(qid)
     for k in range(self.num_trees):
         # Results are written back through each query's own row mask so
         # they stay aligned with the rows of X. Concatenating them in
         # sorted-query order only lines up when the ids are already sorted
         # and contiguous.
         # NOTE(review): assumes _calculate_lambda returns one value per
         # document, in the order of the relevances it was given.
         lambda_arr = np.zeros(n_rows)
         omega_arr = np.zeros(n_rows)
         for unique_qid in unique_qids:
             in_query = np.asarray(qid == unique_qid)
             qid_lambda, qid_omega = self._calculate_lambda(
                 relevence[in_query], F[in_query], unique_qid)
             lambda_arr[in_query] = qid_lambda
             omega_arr[in_query] = qid_omega
         tree = DecisionTreeRegressor(max_depth=self.max_depth)
         tree.fit(X, lambda_arr)
         self.trees.append(tree)
         leaves = tree.apply(X)  # leaf id that every row of X falls into
         for leaf in np.unique(leaves):
             leaf_idx = (leaves == leaf)
             self.gamma[k, leaf] = np.sum(
                 lambda_arr[leaf_idx]) / (np.sum(omega_arr[leaf_idx]) + eps)
             # update the scores of the documents in this leaf only
             F += self.lr * leaf_idx * self.gamma[k, leaf]
Exemple #17
0
def train(X, y, ntrees = 10, alpha = 0.1, mode='gbdt', epoches=20):
    """
    Train the model.
    :param X: features
    :param y: labels
    :param ntrees: number of trees
    :param alpha: learning rate
    :param mode: learning mode, 'gbdt' (first order) or 'xgboost' (second
        order); not read by this function
    :param epoches: not read by this function
    :return: params, a list of (regression tree, leaf values) tuples
    """

    # initialise f0 from the log ratio of the label sums
    f = np.log(1e-5 + (np.sum(y)/np.sum(1.0-y)))*np.ones((y.shape[0],))
    # the residual starts out as the labels themselves
    r = y
    # params keeps every built tree together with its leaf values
    params = []
    for i in range(0, ntrees):
        # NOTE(review): the tree is fitted on the residual of the previous
        # round, before it is refreshed below -- confirm this is intended.
        tmp_tree = DecisionTreeRegressor(max_depth=1)
        tmp_tree.fit(X, r)

        # compute the residual
        r = cal_residual(f, y)
        leaf_indexes = tmp_tree.apply(X)

        # compute the value of every leaf
        leaf_val = cal_leaf_val(r, leaf_indexes)
        params.append((tmp_tree, leaf_val))

        # A list is required here: on Python 3 ``map`` returns an iterator,
        # which np.array would wrap as a single 0-d object.
        tmp_val = [leaf_val[leaf] for leaf in leaf_indexes]

        # update f
        f += alpha * np.array(tmp_val)

        # compute and print the loss
        loss=np.log(1+np.exp(-y*f))
        print ('print res:',r,'print loss:',np.mean(np.sum(loss)))

    # return the trees and their leaf values
    return params
    def fit(self, X_train, Y_train):
        """Fit the boosted trees on a two-class target held in a Series.

        Labels given as -1/1 are re-coded to 0/1 (recorded in
        ``self.transform_y``). The model starts from the log odds of the
        classes, stored in ``self.init_log_odd``. Each round fits a regression
        tree to the current gradient, overwrites the value of every leaf in
        place, and adds the shrunken leaf values to the running log odds.
        Fitted trees are collected in ``self.regressors``.
        """
        self.regressors = []
        self.init_log_odd = 0
        self.transform_y = False

        observed_labels = Y_train.unique()
        if 1 in observed_labels and -1 in Y_train.unique():
            Y_train = Y_train.apply(lambda label: 0 if label == -1 else 1)
            self.transform_y = True

        # initial model: log odds of class 1 against class 0
        n_positive = np.sum(Y_train == 1)
        n_negative = np.sum(Y_train == 0)
        f0 = np.log(n_positive / n_negative)
        self.init_log_odd = f0

        current_log_odds = pd.Series(f0, index=Y_train.index)
        current_gradient = Y_train - expit(current_log_odds.ravel())

        for _ in range(self.n_estimators):
            rt = DecisionTreeRegressor(max_depth=self.max_depth)
            rt.fit(X_train, current_gradient)
            terminal_regions = rt.apply(X_train).copy()

            leaf_nodes = np.where(rt.tree_.children_left == TREE_LEAF)[0]
            for leaf in leaf_nodes:
                members = np.where(terminal_regions == leaf)[0]
                residual = current_gradient.take(members, axis=0)
                y = Y_train.take(members, axis=0)

                numerator = np.sum(residual)
                denominator = np.sum((y - residual) * (1 - y + residual))

                # a vanishing denominator would blow the step up; use 0
                if abs(denominator) < 1e-150:
                    leaf_value = 0.0
                else:
                    leaf_value = numerator / denominator
                rt.tree_.value[leaf, 0, 0] = leaf_value

            self.regressors.append(rt)

            step = rt.tree_.value[:, 0, 0].take(terminal_regions, axis=0)
            current_log_odds += (self.shrinkage_parameter * step)
            current_gradient = Y_train - expit(current_log_odds.ravel())
Exemple #19
0
  def fit(self, X, y, query_ids):
    """
    Fits the model on the training data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
      Feature representation of each document.

    y : array-like of shape (n_samples,)
      Relevance scores for each document in query.
      Must be numeric. Preferably {0, 1, 2, 3, 4}

    query_ids : array-like of shape (n_samples,)
      Query ids for given documents.
      Single query ids must go successively.
    
    Returns
    -------
    self : LambdaMART
      Fitted model.
    """
    assert X.shape[0] == len(y)
    n_samples = X.shape[0]

    y_by_query = group_by_ids(y, query_ids)
    model_scores_by_query = [np.zeros(len(scores)) for scores in y_by_query]
    max_dcg_by_query = [max_dcg_score(scores) for scores in y_by_query]

    for k in tqdm(range(self.n_trees)):
      lambdas, w = np.zeros(n_samples), np.zeros(n_samples)
      doc_idx = 0

      # ``rels`` (not ``y``) so the function argument is not shadowed
      for rels, query_scores, max_DCG in zip(y_by_query, model_scores_by_query, max_dcg_by_query):
        n_docs = len(rels)

        if max_DCG == 0:
          # The ideal DCG is zero, so the normalisation below would divide
          # by zero; keep the zero lambdas/weights of this query's documents.
          doc_idx += n_docs
          continue

        # rank of every document under the current model scores (0 = best)
        doc_ranks_predicted = np.zeros(n_docs, dtype=np.int64)
        doc_ranks_predicted[(-query_scores).argsort()] = np.arange(n_docs)

        for y_i, s_i, rank_i in zip(rels, query_scores, doc_ranks_predicted):
          # only documents with a different relevance form a pair with i
          indices_j = (rels != y_i)
          y_j, s_j, rank_j = rels[indices_j], query_scores[indices_j], doc_ranks_predicted[indices_j]

          delta_DCG = np.abs(
              (np.power(2, y_i) - np.power(2, y_j)) *
              (1. / np.log2(rank_i + 2.) - 1. / np.log2(rank_j + 2.))
          )
          rho_i_j = 1. / (1. + np.exp(np.abs(s_i - s_j)))
          lambda_i_j = -rho_i_j * delta_DCG

          lambda_i = (np.sign(y_i - y_j) * lambda_i_j).sum() / max_DCG
          w_i = (rho_i_j * (1 - rho_i_j) * delta_DCG).sum() / max_DCG

          lambdas[doc_idx], w[doc_idx] = lambda_i, w_i
          doc_idx += 1

      tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_leaf=10)
      tree.fit(X, lambdas)

      all_scores = np.concatenate(model_scores_by_query)
      leaf_by_doc_index = tree.apply(X)

      for leaf in set(leaf_by_doc_index):
        one_leaf_docs_indices = np.where(leaf_by_doc_index == leaf)[0]
        leaf_weight = w[one_leaf_docs_indices].sum()
        # A leaf whose documents all carry zero weight gets no update; the
        # plain ratio would put NaN/inf into the scores.
        if leaf_weight != 0:
          gamma_l_k = lambdas[one_leaf_docs_indices].sum() / leaf_weight
        else:
          gamma_l_k = 0.0
        # store the scaled step in the tree itself so that the tree's own
        # value for this leaf is the score update
        tree.tree_.value[leaf] = -gamma_l_k * self.learning_rate
        all_scores[one_leaf_docs_indices] -= gamma_l_k * self.learning_rate

      model_scores_by_query = group_by_ids(all_scores, query_ids)

      self.trees.append(tree)

    return self
Exemple #20
0
class DecisionTreeRegressionModel(RegressionModel):
    """ Wraps sklearn's DecisionTreeRegressor.

    Besides the fitted tree, the model keeps per-leaf statistics (mean, sample variance, variance of
    the mean and observation count) computed from the training targets that ended up in each leaf.
    predict() looks those statistics up by leaf index instead of calling the regressor's predict().

    TODO: Beef up the RegressionModel base class and actually enforce a consistent interface.
    TODO: See how much boilerplate we can remove from model creation.
    """

    _PREDICTOR_OUTPUT_COLUMNS = [
        Prediction.LegalColumnNames.IS_VALID_INPUT,
        Prediction.LegalColumnNames.PREDICTED_VALUE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_VARIANCE,
        Prediction.LegalColumnNames.SAMPLE_SIZE,
        Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM
    ]

    def __init__(self,
                 model_config: Point,
                 input_space: Hypergrid,
                 output_space: Hypergrid,
                 logger=None):
        """ Builds the (unfitted) regressor from model_config.

        :param model_config: must belong to decision_tree_config_store.parameter_space.
        :param input_space: space of the features; categorical dimensions are projected to discrete ones.
        :param output_space: space of the targets; exactly one dimension is supported.
        :param logger: optional logger; a default one is created when omitted.
        """
        if logger is None:
            logger = create_logger("DecisionTreeRegressionModel")
        self.logger = logger

        assert model_config in decision_tree_config_store.parameter_space
        RegressionModel.__init__(self,
                                 model_type=type(self),
                                 model_config=model_config,
                                 input_space=input_space,
                                 output_space=output_space)

        self._input_space_adapter = CategoricalToDiscreteHypergridAdapter(
            adaptee=self.input_space)

        self.input_dimension_names = [
            dimension.name
            for dimension in self._input_space_adapter.dimensions
        ]
        self.target_dimension_names = [
            dimension.name for dimension in self.output_space.dimensions
        ]
        self.logger.debug(
            f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}."
        )

        assert len(
            self.target_dimension_names
        ) == 1, "For now (and perhaps forever) we only support single target per tree."

        # In the config, max_depth == 0 and max_leaf_nodes in (0, 1) stand for "unlimited",
        # which sklearn spells as None.
        self._regressor = DecisionTreeRegressor(
            criterion=self.model_config.criterion,
            splitter=self.model_config.splitter,
            max_depth=self.model_config.max_depth
            if self.model_config.max_depth != 0 else None,
            min_samples_split=self.model_config.min_samples_split,
            min_samples_leaf=self.model_config.min_samples_leaf,
            min_weight_fraction_leaf=self.model_config.
            min_weight_fraction_leaf,
            max_features=self.model_config.max_features,
            random_state=self.model_config.get("random_state", None),
            max_leaf_nodes=self.model_config.max_leaf_nodes
            if self.model_config.max_leaf_nodes not in (0, 1) else None,
            min_impurity_decrease=self.model_config.min_impurity_decrease,
            ccp_alpha=self.model_config.ccp_alpha)

        # These are used to compute the variance in predictions. All are keyed by leaf node index.
        self._observations_per_leaf = dict()
        self._mean_per_leaf = dict()
        self._mean_variance_per_leaf = dict()
        self._sample_variance_per_leaf = dict()
        self._count_per_leaf = dict()

        self._trained = False

    @property
    def trained(self):
        """ True once fit() has completed at least once. """
        return self._trained

    @property
    def num_observations_used_to_fit(self):
        """ Iteration number passed to the most recent fit().

        NOTE(review): last_refit_iteration_number is only assigned in fit() here; presumably the
        base class initializes it - confirm.
        """
        return self.last_refit_iteration_number

    def should_fit(self, num_samples):
        """ Returns true if the model should be fitted.

        This model should be fitted under the following conditions:
        1) It has not been fitted yet and num_samples is larger than min_samples_to_fit
        2) The model has been fitted and the number of new samples is at least n_new_samples_before_refit

        :param num_samples: total number of observations currently available.
        :return: bool
        """
        if not self.trained:
            return num_samples > self.model_config.min_samples_to_fit
        num_new_samples = num_samples - self.num_observations_used_to_fit
        return num_new_samples >= self.model_config.n_new_samples_before_refit

    @trace()
    def fit(self, feature_values_pandas_frame, target_values_pandas_frame,
            iteration_number):
        """ Fits the tree and recomputes the per-leaf statistics used by predict().

        :param feature_values_pandas_frame: dataframe with (at least) the input dimensions as columns.
        :param target_values_pandas_frame: dataframe with the single target dimension as a column.
        :param iteration_number: recorded as last_refit_iteration_number.
        """
        self.logger.debug(
            f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations."
        )

        # Let's get the numpy arrays out of the panda frames
        #
        feature_values_pandas_frame = self._input_space_adapter.project_dataframe(
            feature_values_pandas_frame, in_place=False)

        feature_values = feature_values_pandas_frame[
            self.input_dimension_names].to_numpy()
        target_values = target_values_pandas_frame[
            self.target_dimension_names].to_numpy()

        self._regressor.fit(feature_values, target_values)

        # Now that we have fit the model we can augment our tree by computing the variance
        # TODO: this code can be easily optimized, but premature optimization is the root of all evil.
        node_indices = self._regressor.apply(feature_values)

        # Group the training targets by the leaf they landed in.
        observations_per_leaf = dict()
        for node_index, sample_target_value in zip(node_indices,
                                                   target_values):
            observations_per_leaf.setdefault(node_index,
                                             []).append(sample_target_value)

        # node_indices has one entry per sample, so the number of leaves is the number of groups.
        self.logger.debug(
            f"The resulting tree has {len(observations_per_leaf)} leaf nodes.")

        # Build the statistics in fresh dictionaries: node indices of the previous tree mean
        # nothing for the new one, so no stale entry may survive a refit.
        mean_per_leaf = dict()
        mean_variance_per_leaf = dict()
        sample_variance_per_leaf = dict()
        count_per_leaf = dict()

        for node_index in observations_per_leaf:
            # First convert the observations to a numpy array.
            observations_at_leaf = np.array(observations_per_leaf[node_index])
            observations_per_leaf[node_index] = observations_at_leaf

            leaf_mean = np.mean(observations_at_leaf)
            # ddof = delta degrees of freedom. We want sample variance.
            # A leaf holding a single observation yields NaN here.
            leaf_sample_variance = np.var(observations_at_leaf, ddof=1)
            leaf_mean_variance = leaf_sample_variance / len(
                observations_at_leaf)

            mean_per_leaf[node_index] = leaf_mean
            mean_variance_per_leaf[node_index] = leaf_mean_variance
            sample_variance_per_leaf[node_index] = leaf_sample_variance
            count_per_leaf[node_index] = len(observations_at_leaf)

        self._observations_per_leaf = observations_per_leaf
        self._mean_per_leaf = mean_per_leaf
        self._mean_variance_per_leaf = mean_variance_per_leaf
        self._sample_variance_per_leaf = sample_variance_per_leaf
        self._count_per_leaf = count_per_leaf

        self._trained = True
        self.last_refit_iteration_number = iteration_number

    @trace()
    def predict(self,
                feature_values_pandas_frame,
                include_only_valid_rows=True):
        """ Produces a Prediction for the rows of feature_values_pandas_frame.

        Each valid row is routed to a leaf and receives that leaf's stored mean, variance of the
        mean, sample variance, sample size and degrees of freedom (sample size - 1). If the model
        has not been trained, the prediction dataframe is built without an index and left unfilled.

        :param feature_values_pandas_frame: dataframe of feature values.
        :param include_only_valid_rows: when False, rows missing from the prediction are added back
            as invalid rows using the index of the input frame.
        :return: Prediction
        """
        self.logger.debug(
            f"Creating predictions for {len(feature_values_pandas_frame.index)} samples."
        )

        # dataframe column shortcuts
        is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
        predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
        predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
        sample_var_col = Prediction.LegalColumnNames.SAMPLE_VARIANCE.value
        sample_size_col = Prediction.LegalColumnNames.SAMPLE_SIZE.value
        dof_col = Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM.value

        valid_rows_index = None
        features_df = None
        if self.trained:
            valid_features_df = self.input_space.filter_out_invalid_rows(
                original_dataframe=feature_values_pandas_frame,
                exclude_extra_columns=True)
            features_df = self._input_space_adapter.project_dataframe(
                valid_features_df, in_place=False)
            valid_rows_index = features_df.index

        predictions = Prediction(
            objective_name=self.target_dimension_names[0],
            predictor_outputs=self._PREDICTOR_OUTPUT_COLUMNS,
            dataframe_index=valid_rows_index)
        prediction_dataframe = predictions.get_dataframe()

        if valid_rows_index is not None and not valid_rows_index.empty:
            # Temporary column holding the leaf each row falls into; dropped again below.
            prediction_dataframe['leaf_node_index'] = self._regressor.apply(
                features_df.loc[valid_rows_index].to_numpy())
            prediction_dataframe[predicted_value_col] = prediction_dataframe[
                'leaf_node_index'].map(self._mean_per_leaf)
            prediction_dataframe[
                predicted_value_var_col] = prediction_dataframe[
                    'leaf_node_index'].map(self._mean_variance_per_leaf)
            prediction_dataframe[sample_var_col] = prediction_dataframe[
                'leaf_node_index'].map(self._sample_variance_per_leaf)
            prediction_dataframe[sample_size_col] = prediction_dataframe[
                'leaf_node_index'].map(self._count_per_leaf)
            prediction_dataframe[
                dof_col] = prediction_dataframe[sample_size_col] - 1
            prediction_dataframe[is_valid_input_col] = True
            prediction_dataframe.drop(columns=['leaf_node_index'],
                                      inplace=True)

        predictions.validate_dataframe(prediction_dataframe)
        if not include_only_valid_rows:
            predictions.add_invalid_rows_at_missing_indices(
                desired_index=feature_values_pandas_frame.index)
        return predictions
Exemple #21
0
    def _train_honest_tree(self, df, y_var, w_var, index_cols,
                           min_samples_leaf):
        """
        function that effectively trains each tree in the forest

        The sample is split in two parts, J (used to grow the tree) and I; the tree is then
        pruned with I and leaf-level means of the outcome are computed per treatment group.

        :param df: dataframe holding features, outcome and treatment columns
        :param y_var: list of outcome column name(s)
        :param w_var: list of treatment column name(s)
        :param index_cols: column(s) used as the index of every derived frame
        :param min_samples_leaf: base leaf size; the sklearn trees use twice this value
        :return: (leaves, this_preds, model) where ``leaves`` has the mean outcome per
            (leaf, treatment), ``this_preds`` maps each predicted row to its leaf, and
            ``model`` is the pruned tree
        :raises ValueError: if ``self.algorithm`` is neither 'double_sample' nor 'propensity'
        """

        if self.algorithm not in ('double_sample', 'propensity'):
            # Fail early with a clear message instead of an unbound-local error further down.
            raise ValueError(
                "Unknown algorithm {!r}; expected 'double_sample' or "
                "'propensity'".format(self.algorithm))

        # Columns removed from the feature frame: always the outcome, and the
        # treatment too unless the tree is allowed to split on it
        # (list * False == []).
        non_feature_cols = y_var + w_var * (not self.use_w_in_tree)

        if self.algorithm == 'double_sample':
            # step 0 : subsample of df to populate I and J
            df_sample, df_not_sample = train_test_split(df, test_size=0.2)

            # held-out rows, only used when self.full_predictor is set
            df_out = df_not_sample.set_index(index_cols).drop(
                non_feature_cols, axis=1)

            s = 0.5
        else:  # 'propensity'
            df_sample = df
            s = np.random.uniform(0.3, 0.5)

        # step 1 : splitting (J = train, I = predictions)
        indexed_sample = df_sample.set_index(index_cols)
        # axis is passed by keyword: the positional form was removed in pandas 2.0
        J, I, tau_J, tau_I, W_J, W_I = train_test_split(
            indexed_sample.drop(non_feature_cols, axis=1),
            indexed_sample[y_var],
            indexed_sample[w_var],
            test_size=s)

        # step 2 : training the tree
        if self.algorithm == 'double_sample':
            if not self.true_honest_tree:
                # NOTE(review): recent scikit-learn releases renamed 'mse' to
                # 'squared_error' - confirm against the pinned version.
                model = DecisionTreeRegressor(criterion='mse',
                                              min_samples_leaf=2 *
                                              min_samples_leaf)
            else:
                model = decision_tree.DecisionTree(min_samples_leaf)

            # J is used for training
            model.fit(J, tau_J)

            # I is used for prediction and pruning
            model = self._prune_tree(model, I, W_I, w_var, min_samples_leaf)
            X_prediction, tau_prediction, W_prediction = I, tau_I, W_I

        else:  # 'propensity'
            model = ExtraTreeClassifier(criterion='gini',
                                        min_samples_leaf=2 * min_samples_leaf,
                                        splitter='random')

            # we use J for training, but this time the target is the treament class variable
            model.fit(J, W_J)

            # pruning uses I while prediction uses J
            # NOTE(review): the original comment said "pruning and prediction in J" - confirm
            # which sample pruning is meant to use.
            model = self._prune_tree(model, I, W_I, w_var, min_samples_leaf)
            X_prediction, tau_prediction, W_prediction = J, tau_J, W_J

        # creating a dataframe with the predictions by leaf
        leaves = X_prediction[[]].copy()  # keeps the index, drops every column
        leaves['leaf'] = model.apply(X_prediction)
        leaves['true'] = tau_prediction
        leaves[w_var] = W_prediction
        leaves = leaves.groupby(['leaf'] + w_var).true.mean().reset_index()

        # predicting
        if self.full_predictor and self.algorithm == 'double_sample':
            # if full, we predict for everyone
            X_prediction = pd.concat([df_out, X_prediction])

        this_preds = X_prediction[[]].copy()
        this_preds['leaf'] = model.apply(X_prediction)
        this_preds.reset_index(inplace=True)

        return leaves, this_preds, model
Exemple #22
0
class GroupPCADecisionTreeRegressor(BaseEstimator, RegressorMixin):
    """ PCA on random group of features followed by a Decision Tree

    The input is first transformed by a GroupPCA and the DecisionTreeRegressor is fitted on,
    and queried with, the transformed features.

    See : GroupPCA and DecisionTreeRegressor
    """

    def __init__(
        self,
        criterion="mse",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        pca_bootstrap=False,
        pca_max_nb_groups=0.25,
        pca_max_group_size=0.05,
    ):
        """Store the hyper-parameters; the tree and the PCA are only created in fit().

        All arguments up to ``presort`` are forwarded to DecisionTreeRegressor; the ``pca_*``
        arguments are forwarded to GroupPCA (as bootstrap, max_nb_groups, max_group_size).
        """

        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.presort = presort

        self.pca_bootstrap = pca_bootstrap
        self.pca_max_nb_groups = pca_max_nb_groups
        self.pca_max_group_size = pca_max_group_size

        self._tree = None
        self._group_pca = None

    def _ensure_fitted(self):
        """Raise NotFittedError unless fit() has created the underlying tree."""
        if self._tree is None:
            raise NotFittedError("You should fit the model first")

    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        """Fit the group PCA on X, then the tree on the PCA-transformed features.

        ``X_idx_sorted`` is accepted for signature compatibility but never forwarded: the tree
        is fitted on the transformed matrix, not on X, and always receives ``None``.

        Returns self.
        """

        self.n_features_ = X.shape[1]

        # 1) create GroupPCA
        self._group_pca = GroupPCA(
            random_state=self.random_state,
            bootstrap=self.pca_bootstrap,
            max_nb_groups=self.pca_max_nb_groups,
            max_group_size=self.pca_max_group_size,
        )
        # 2) Create Tree
        self._tree = DecisionTreeRegressor(
            criterion=self.criterion,
            splitter=self.splitter,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            random_state=self.random_state,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            presort=self.presort,
        )

        # 3) Apply group PCA
        Xpca = self._group_pca.fit_transform(X, y)

        # 4) fit Tree
        self._tree.fit(Xpca, y, sample_weight=sample_weight, check_input=check_input, X_idx_sorted=None)

        return self

    def predict(self, X, check_input=True):
        """Return the tree's predictions for the PCA-transformed X.

        Raises NotFittedError if the model has not been fitted.
        """
        self._ensure_fitted()

        Xpca = self._group_pca.transform(X)
        return self._tree.predict(Xpca, check_input=check_input)

    def apply(self, X, check_input=True):
        """Return the result of the tree's apply() on the PCA-transformed X.

        Raises NotFittedError if the model has not been fitted.
        """
        self._ensure_fitted()

        Xpca = self._group_pca.transform(X)

        return self._tree.apply(Xpca, check_input=check_input)

    def decision_path(self, X, check_input=True):
        """Return the result of the tree's decision_path() on the PCA-transformed X.

        Raises NotFittedError if the model has not been fitted, like predict() and apply().
        """
        self._ensure_fitted()

        Xpca = self._group_pca.transform(X)

        return self._tree.decision_path(Xpca, check_input=check_input)

    @property
    def tree_(self):
        """The ``tree_`` attribute of the fitted DecisionTreeRegressor.

        Before fit() this raises AttributeError (``_tree`` is None), which keeps
        ``hasattr(model, "tree_")`` usable as a fitted check.
        """
        return self._tree.tree_

    def _validate_X_predict(self, X, check_input):
        """Validate X whenever one tries to predict, apply, predict_proba"""
        if check_input:
            X = check_array(X, dtype=DTYPE, accept_sparse="csr")
            if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
                raise ValueError("No support for np.int64 index based " "sparse matrices")

        # The comparison is against the number of raw features seen in fit().
        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError(
                "Number of features of the model must "
                "match the input. Model n_features is %s and "
                "input n_features is %s " % (self.n_features_, n_features)
            )

        return X
def train(joint_id, X, y, model_dir, samples_leaf, k_value, num_samples,
          xy_offset):
    """Trains a regressor tree on the unit directions towards the joint.

    The fitted regressor and the result of ``stochastic`` are written with joblib to
    ``<model_dir>/dl_<TRAIN_SET>_<k_value>_<samples_leaf>_<num_samples>_<xy_offset>/``.

    @params:
        joint_id : current joint id (index into JOINT_NAMES)
        X : samples feature array (N x num_samples x num_feats)
        y : samples unit direction vectors (N x num_samples x 3)
        model_dir : base directory for the saved models
        samples_leaf : minimum number of samples per leaf of the tree
        k_value : forwarded to stochastic(); also part of the output folder name
        num_samples : only used in the output folder name
        xy_offset : only used in the output folder name
    @returns:
        (regressor, L) : the fitted DecisionTreeRegressor and the value returned by stochastic()
    """
    joint_name = JOINT_NAMES[joint_id]
    logger.debug('Start training %s model...', joint_name)

    X_reshape = X.reshape(X.shape[0] * X.shape[1],
                          X.shape[2])  # (N x num_samples, num_feats)
    y_reshape = y.reshape(y.shape[0] * y.shape[1],
                          y.shape[2])  # (N x num_samples, 3)

    # Count the number of valid (non-zero) samples
    valid_rows = np.logical_not(np.all(X_reshape == 0,
                                       axis=1))  # inverse of invalid samples
    logger.debug('Model %s - Valid samples: %d / %d', joint_name,
                 X_reshape[valid_rows].shape[0], X_reshape.shape[0])

    # Fit decision tree to the valid samples only
    regressor = DecisionTreeRegressor(min_samples_leaf=samples_leaf)
    regressor.fit(X_reshape[valid_rows], y_reshape[valid_rows])
    # NOTE(review): stochastic() and the statistics below receive ALL rows,
    # including the all-zero ones excluded from fitting - confirm this is intended.
    L = stochastic(regressor, X_reshape, y_reshape, k_value)

    # Print statistics on leafs. unique_ids holds the ids of the populated
    # leaves and leaf_counts the number of samples in each, in the same order,
    # so a position found in leaf_counts maps back to a real leaf id.
    leaf_ids = regressor.apply(X_reshape)
    unique_ids, leaf_counts = np.unique(leaf_ids, return_counts=True)
    total_samples = leaf_counts.sum()
    biggest = np.argmax(leaf_counts)
    smallest = np.argmin(leaf_counts)

    logger.debug('Model %s - # Leaves: %d', joint_name, unique_ids.shape[0])
    logger.debug('Model %s - Smallest Leaf ID: %d, # Samples: %d/%d',
                 joint_name, unique_ids[smallest], leaf_counts[smallest],
                 total_samples)
    logger.debug('Model %s - Biggest Leaf ID: %d, # Samples: %d/%d',
                 joint_name, unique_ids[biggest], leaf_counts[biggest],
                 total_samples)
    logger.debug('Model %s - Average Leaf Size: %d', joint_name,
                 total_samples / unique_ids.shape[0])

    # Save models to disk
    folder = 'dl_%s_%d_%d_%d_%d/' % (TRAIN_SET, k_value, samples_leaf,
                                     num_samples, xy_offset)
    output_dir = os.path.join(model_dir, folder)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    regressor_path = os.path.join(model_dir, folder,
                                  'regressor' + str(joint_id) + '.pkl')
    L_path = os.path.join(model_dir, folder, 'L' + str(joint_id) + '.pkl')
    joblib.dump(regressor, regressor_path)
    joblib.dump(L, L_path)

    return regressor, L
Exemple #24
0
    def __clustering(self, X, y=None):
        """
        The clustering procedure of the Optimal Weighted Clustering Gaussian 
        Process. This function should not be called externally

        Depending on ``self.cluster_method`` ("k-mean", "tree", "random", "GMM",
        "fuzzy-c-mean" or "flame") this sets ``self.clusterer`` and, for most
        methods, ``self.cluster_label`` and/or ``self.cluster_labels_proba``.
        The "tree" and "flame" methods overwrite ``self.n_cluster`` with the
        number of clusters actually found. Any other method name leaves the
        object untouched.

        ``y`` is only used by the "tree" method (regression target of the tree).

        The print calls use a single pre-formatted string so that the output is
        the same under Python 2 and Python 3.
        """

        if self.cluster_method == "k-mean":
            clusterer = KMeans(n_clusters=self.n_cluster)
            clusterer.fit(X)
            self.cluster_label = clusterer.labels_
            self.clusterer = clusterer
        elif self.cluster_method == "tree":
            print("Warning: specified clustering count might be overwritten")
            # leaf size chosen so that roughly n_cluster leaves come out
            minsamples = int(len(X) / (self.n_cluster + 1))
            tree = DecisionTreeRegressor(random_state=0, min_samples_leaf=minsamples)
            tree.fit(X, y)
            labels = tree.apply(X)
            clusters = np.unique(labels)
            k = len(clusters)
            print("leafs: %s" % (k,))
            self.n_cluster = k
            self.leaf_labels = clusters
            self.cluster_label = labels
            self.clusterer = tree
        elif self.cluster_method == "random":
            # m full rounds of 0..n_cluster-1 plus r leftover labels, then shuffled
            r = self.n_sample % self.n_cluster
            m = (self.n_sample - r) // self.n_cluster
            self.cluster_label = array(list(range(self.n_cluster)) * m + list(range(r)))
            self.clusterer = None
            shuffle(self.cluster_label)
        elif self.cluster_method == "GMM":  # GMM from sklearn
            self.clusterer = GMM(n_components=self.n_cluster, n_iter=1000)
            self.clusterer.fit(X)
            self.cluster_labels_proba = self.clusterer.predict_proba(X)
            self.cluster_label = self.clusterer.predict(X)
        elif self.cluster_method == "fuzzy-c-mean":  # Fuzzy C-means (``fuzz`` module)
            cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
                X.T, self.n_cluster, 2, error=0.000005, maxiter=10000, init=None
            )
            self.clusterer = cntr  # save the centers for cmeans_predict
            self.cluster_labels_proba = np.array(u.T)
            self.cluster_label = np.array(np.argmax(u, axis=0))
        elif self.cluster_method == "flame":  # Flame clustering, files are attached
            print("Warning: specified clustering count will be overwritten with Flame")
            tempdata = X.astype(np.float32)
            N = len(tempdata)
            # A single Flame object is created; it is released by Flame_Clear below.
            flameobject = flame.Flame_New()
            flame.Flame_SetDataMatrix(flameobject, tempdata, 0)
            flame.Flame_DefineSupports(flameobject, self.flame_knn, self.flame_threshold)  # knn is number of neighbours
            cso_count = flameobject.cso_count
            k = cso_count + 1  #!!! overwrite k here
            self.n_cluster = k
            print("clusters: %s" % (k,))
            flame.Flame_LocalApproximation(flameobject, 500, 1e-6)
            self.cluster_labels_proba = flame.Print_Clusters(flameobject, (cso_count + 1) * N)
            flame.Flame_Clear(flameobject)
            self.cluster_labels_proba = self.cluster_labels_proba.reshape((N, cso_count + 1))
            self.clusterer = None  # we need to assign something
            # NOTE(review): unlike the other branches, cluster_label is not set here.
Exemple #25
0
 # Exploration of a depth-2 regression tree fitted on the single feature
 # "median_income": fit, predict, then print the internals of the fitted tree_.
 # scaling is not necessary for decision trees
 small_dTree = DecisionTreeRegressor(max_depth=2)
 small_dTree.fit(train_X[["median_income"]], train_y)
 small_dTree_pred = small_dTree.predict(train_X[["median_income"]])
 # project helper; presumably prints the tree as code - TODO confirm
 trees.tree_to_code(small_dTree, ['median_income'])
 # Raw arrays of the fitted tree, indexed by node id
 print(f" feature: {small_dTree.tree_.feature}")
 print(f" child left: {small_dTree.tree_.children_left}")
 print(f" child left idx 4: {small_dTree.tree_.children_left[4]}")
 print(f" child right: {small_dTree.tree_.children_right}")
 print(f" tree value idx 2: {small_dTree.tree_.value[2]}")
 print(f" ")
 # residual sum of squares of the tree on the training data
 rss = np.sum((train_y - small_dTree_pred)**2)
 print(f" rss: {rss}")
 # split threshold stored for node 1 (not used further in this fragment)
 thresh_node = small_dTree.tree_.threshold[1]
 samples_in_leaves = small_dTree.apply(train_X[[
     "median_income"
 ]])  # returns, for each row, the terminal node it belongs to
 print(samples_in_leaves)
 # number of training rows per terminal node
 print(pd.value_counts((samples_in_leaves)))
 #use impurity measure?
 print(small_dTree.tree_.children_left[2])
 print(f" child left idx 2: {small_dTree.tree_.children_left[2]}")
 print(f" tree leaf: {_tree.TREE_LEAF}")
 # project helper, called on the tree starting from node 0
 trees.depth_first(small_dTree.tree_, 0)
 print(f" node count: {small_dTree.tree_.node_count}")  # total nr of nodes
 print(f" node impurity idx 0: {small_dTree.tree_.impurity[0]}")
 print(
     f" impurity * n samples at idx 0: {small_dTree.tree_.impurity[0]*small_dTree.tree_.n_node_samples[0]}"
 )
 # baseline: sum of squared errors when predicting the mean of train_y
 rss_mean = np.sum((train_y - np.mean(train_y))**2)
 print(f" pred error of mean: {rss_mean}")
Exemple #26
0
            meanSquare = np.sqrt(meanSquare[0])
            if(linearReg < linearRegBest):
                linearRegBest = linearReg
                meanSquareBest = meanSquare
                print(leftModel)
                model = [leftModel, rightModel]
                print(model)
    return linearReg, meanSquareBest

# Build the model tree and report the two values it returns plus the elapsed time
# (startTime is set earlier in the script).
modelTree, meanSquare = ModelTree();
print(modelTree," ",meanSquare)
print ("Time taken to build the model: ",datetime.now() - startTime)

# Node indicator of the test samples through regressionTree; it is read below
# through its .indices / .indptr arrays.
node_indicator = regressionTree.decision_path(X_test)

# Result of apply() for every test sample
leave_id = regressionTree.apply(X_test)

# Slice out the node ids stored for one sample (the first one)
sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
class Breiman_Tree:

    '''
    Main class for Breiman Tree version of active learning algorithm

    Holds a pool of points of which only some are labelled, fits a regression
    tree on the labelled ones and uses per-leaf statistics (share of the pool
    falling in the leaf and variance of the labels in the leaf) to decide
    which unlabelled points should be labelled next.
    '''

    def __init__(self, min_samples_leaf=None, seed=None):
        '''
        min_samples_leaf : forwarded to DecisionTreeRegressor (defaults to 1)
        seed : random_state of the tree (defaults to 0)
        '''

        self.points = None
        self.labels = None
        self.labelled_indices = None
        self._num_points = 0
        self._num_labelled = 0

        self.seed = 0 if seed is None else seed
        self.min_samples_leaf = 1 if min_samples_leaf is None else min_samples_leaf

        self.tree = DecisionTreeRegressor(random_state=self.seed,min_samples_leaf=self.min_samples_leaf)
        self._leaf_indices = []
        self._leaf_marginal = []
        self._leaf_var = []
        self._al_proportions =[]

        # Cached statistics are recomputed lazily when these flags are False
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

        self._verbose = False

    def input_data(self, all_data, labelled_indices, labels, copy_data=True):
        '''
        Stores the pool of points together with the labels known so far.

        all_data : all points (list of lists or numpy array)
        labelled_indices : positions in all_data that have a label
        labels : the labels, in the same order as labelled_indices
        copy_data : deep-copy the three inputs before storing them

        Raises ValueError when there are more labelled indices than points, or
        when labelled_indices and labels differ in length.
        '''

        if copy_data:
            all_data = copy.deepcopy(all_data)
            labelled_indices = copy.deepcopy(labelled_indices)
            labels = copy.deepcopy(labels)

        if len(all_data) < len(labelled_indices):
            raise ValueError('Cannot have more labelled indicies than points')

        if len(labelled_indices) != len(labels):
            raise ValueError('Labelled indicies list and labels list must be same length')

        if isinstance(all_data, np.ndarray):
            if self._verbose:
                print('Converting all_data to list of lists internally')
            all_data = all_data.tolist()

        if isinstance(labelled_indices, np.ndarray):
            if self._verbose:
                print('Converting labelled_indices to list internally')
            labelled_indices = labelled_indices.tolist()

        if isinstance(labels, np.ndarray):
            if self._verbose:
                print('Converting labels to list internally')
            labels = labels.tolist()

        self.points = all_data
        self._num_points = len(self.points)
        self._num_labelled = len(labels)

        # Making a label list, with None in places where we don't have the label

        temp = [None] * self._num_points
        for ind, label in zip(labelled_indices, labels):
            temp[ind] = label
        self.labels = temp
        self.labelled_indices = list(labelled_indices)

        # Anything cached was computed for the previous data
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

    def fit_tree(self):
        '''
        Fits the tree on the labelled points and records the leaf of every point.
        '''
        self.tree.fit(np.array(self.points)[self.labelled_indices,:], 
            np.array(self.labels)[self.labelled_indices])
        self._leaf_indices = self.tree.apply(np.array(self.points))
        # Both caches depend on the leaves of the tree that was just replaced
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

    def label_point(self, index, value):
        '''
        Records `value` as the label of the point at position `index`.

        Relabelling an already labelled point only overwrites its label; the
        point is not added to labelled_indices a second time.

        Raises RuntimeError when no data was loaded and ValueError when index
        is beyond the end of the data.
        '''

        if self.labels is None:
            raise RuntimeError('No data in the tree')

        if len(self.labels) <= index:
            raise ValueError('Index {} larger than size of data in tree'.format(index))

        value = copy.copy(value)
        index = copy.copy(index)

        newly_labelled = self.labels[index] is None
        self.labels[index] = value
        if newly_labelled:
            self.labelled_indices.append(index)
            self._num_labelled += 1

        # Label variances (and hence the proportions) are now stale
        self._leaf_statistics_up_to_date = False
        self._leaf_proportions_up_to_date = False

    def predict(self, new_points):
        '''Returns the predictions of the fitted tree for new_points.'''
        return self.tree.predict(new_points)

    def calculate_leaf_statistics(self):
        '''
        For every leaf (in the order of np.unique over the leaf ids) computes
        the share of all points falling into it and the variance of the labels
        known inside that leaf.
        '''
        leaf_sizes = Counter(self._leaf_indices)
        self._leaf_marginal = []
        self._leaf_var = []
        for key in np.unique(self._leaf_indices):
            self._leaf_marginal.append(leaf_sizes[key]/self._num_points)
            # Only the labels of points that sit in THIS leaf enter its variance
            leaf_labels = [self.labels[i]
                for i, leaf in enumerate(self._leaf_indices)
                if leaf == key and self.labels[i] is not None]
            # NOTE(review): with min_samples_leaf=1 a leaf may hold a single
            # label - confirm utils.unbiased_var copes with one value.
            self._leaf_var.append(utils.unbiased_var(leaf_labels))
        self._leaf_statistics_up_to_date = True

    def al_calculate_leaf_proportions(self):
        '''
        Sets the sampling proportion of each leaf to sqrt(variance * marginal),
        normalised to sum to one.
        '''
        if not self._leaf_statistics_up_to_date:
            self.calculate_leaf_statistics()
        al_proportions = [np.sqrt(var * marginal)
            for var, marginal in zip(self._leaf_var, self._leaf_marginal)]
        al_proportions = np.array(al_proportions)/sum(al_proportions)
        self._al_proportions = al_proportions
        self._leaf_proportions_up_to_date = True

    def pick_new_points(self, num_samples = 1):
        '''
        Draws num_samples leaves according to the leaf proportions and returns,
        for each drawn leaf, one random unlabelled point of that leaf.

        NOTE(review): points are drawn independently per leaf, so the same
        point may be returned more than once when num_samples > 1.
        '''
        if not self._leaf_proportions_up_to_date:
            self.al_calculate_leaf_proportions()

        # Number of unlabelled points per leaf
        unlabelled = [x for x in range(self._num_points) if self.labels[x] is None]
        unlabelled_per_leaf = Counter(np.array(self._leaf_indices)[unlabelled])

        # Spread each leaf's proportion evenly over its unlabelled points
        point_proportions = {}
        for i,key in enumerate(np.unique(self._leaf_indices)):
            point_proportions[key] = self._al_proportions[i] / max(1,unlabelled_per_leaf[key]) 
        temp_probs = np.array([point_proportions[key] for key in self._leaf_indices])
        temp_probs[self.labelled_indices] = 0
        temp_probs = temp_probs / sum(temp_probs)

        leaves_to_sample = np.random.choice(self._leaf_indices,num_samples, 
            p=temp_probs, replace = False)
        points_to_label = []
        for leaf in leaves_to_sample:
            possible_points = [i for i in range(self._num_points)
                if self._leaf_indices[i] == leaf and self.labels[i] is None]
            points_to_label.append(np.random.choice(possible_points))

        return points_to_label
class DecisionTreeSVRRegressor():
    """Regression tree that holds one SVR model per leaf.

    A ``DecisionTreeRegressor`` partitions the feature space; a separate
    ``SVR`` is then fitted on the training samples falling into each leaf
    and used to predict for new samples routed to that leaf. The class
    mirrors the subset of the decision-tree API defined below by
    delegating to ``self.base_tree``.

    Parameters
    ----------
    params_svr : dict, optional
        Keyword arguments for every per-leaf ``SVR``. Defaults to
        ``{'kernel': 'rbf', 'epsilon': 0.2}``.
    criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, random_state, max_leaf_nodes,
    presort
        Forwarded unchanged to ``DecisionTreeRegressor``.
    min_impurity_split : float
        Accepted for signature compatibility; it is not forwarded to the
        underlying tree.

    Attributes
    ----------
    base_tree : DecisionTreeRegressor
        The partitioning tree.
    tree_ : object or None
        ``base_tree.tree_`` after ``fit``; ``None`` before.
    x_leaves, y_leaves : dict
        Training rows / targets grouped by integer leaf id.
    svrs_leaves : dict
        Fitted ``SVR`` per integer leaf id.
    """

    def __init__(self,
                 params_svr=None,
                 criterion="mse",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_split=1e-7,
                 presort=False):
        self.base_tree = DecisionTreeRegressor(
            criterion=criterion,
            # honour the caller's choice (was hard-coded to 'best')
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            random_state=random_state,
            presort=presort)

        # Build the default per instance and copy caller input, so no dict
        # is shared between instances or with the caller.
        if params_svr is None:
            params_svr = {'kernel': 'rbf', 'epsilon': 0.2}
        self.params_svr = dict(params_svr)
        self.x_leaves = {}
        self.y_leaves = {}
        self.svrs_leaves = {}
        self.tree_ = None

    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        """Fit the partitioning tree, then one SVR per leaf.

        ``sample_weight`` is used by the tree only; the per-leaf SVRs are
        fitted unweighted. Returns ``self``.
        """
        # Only forward X_idx_sorted when the caller actually supplied it.
        extra = {}
        if X_idx_sorted is not None:
            extra['X_idx_sorted'] = X_idx_sorted
        self.base_tree.fit(X, y, sample_weight=sample_weight,
                           check_input=check_input, **extra)
        self.tree_ = self.base_tree.tree_

        # Start from a clean slate so a refit does not mix in old samples.
        self.x_leaves = {}
        self.y_leaves = {}
        self.svrs_leaves = {}

        rows = np.asarray(X)
        targets = np.asarray(y)
        leaves = self.base_tree.apply(X)
        for row, target, leaf in zip(rows, targets, leaves):
            leaf_id = int(leaf)
            self.x_leaves.setdefault(leaf_id, []).append(row)
            self.y_leaves.setdefault(leaf_id, []).append(target)

        for leaf_id, leaf_rows in self.x_leaves.items():
            svr = SVR(**self.params_svr)
            svr.fit(leaf_rows, self.y_leaves[leaf_id])
            self.svrs_leaves[leaf_id] = svr

        return self

    def predict(self, X, check_input=True):
        """Predict each sample with the SVR of the leaf it falls into.

        Returns a list with one prediction per row of ``X``, in row order.
        """
        rows = np.asarray(X)
        leaf_ids = self.base_tree.apply(X, check_input=check_input)
        predictions = np.empty(len(leaf_ids), dtype=float)
        # One batched SVR call per leaf instead of one call per sample.
        for leaf in np.unique(leaf_ids):
            members = leaf_ids == leaf
            svr = self.svrs_leaves[int(leaf)]
            predictions[members] = svr.predict(rows[members])
        return list(predictions)

    def apply(self, X, check_input=True):
        """Return the leaf id each sample of ``X`` ends up in."""
        return self.base_tree.apply(X, check_input=check_input)

    def decision_path(self, X, check_input=True):
        """Return the underlying tree's decision path for ``X``."""
        return self.base_tree.decision_path(X, check_input=check_input)

    def feature_importances_(self):
        """Return the underlying tree's feature importances.

        Kept as a method (not a property) to preserve the existing call
        form ``obj.feature_importances_()``.
        """
        return self.base_tree.feature_importances_

    def _validate_X_predict(self, X, check_input=True):
        """Delegate input validation to the underlying tree."""
        return self.base_tree._validate_X_predict(X, check_input)
Exemple #29
0
class DTALE(object):
    """
    A decision tree the learns to make same predictions as the nucleus model
    classifier, but using interpretable features. Note that the tree is
    learning using the mask-rcnn model predictions, NOT the gtruth. The idea is
    to find an interpretable approximation of what the classification component
    of the model seems to be relying on. We rely on a REGRESSION tree for a
    more refined approximation of the model behavior.

    References:
    -----------
        Amgad M, Atteya LA, Hussein H, Mohammed KH, Hafiz E, Elsebaie MA,
        Mobadersany P, Manthey D, Gutman DA, Elfandy H, Cooper LA. Explainable
        nucleus classification using Decision Tree Approximation of Learned
        Embeddings. Bioinformatics. 2021 Sep 29.

    """
    def __init__(
        self,
        feats: DataFrame,
        clusts: DataFrame,
        savedir: str,
        pcoln: str = 'pred_categ',
        ecoln0: str = 'embedding_0',
        ecoln1: str = 'embedding_1',
        classes_list: List = None,
        fitkwargs: Dict = None,
    ):
        """

        Parameters
        ----------
        feats: DataFrame
            A dataframe of interpretable features per nucleus. Rows are correspond
            to nuclei and columns correspond to features. Must have the same
            index and no of rows as the `clusts` parameter.
        clusts: DataFrame
            A dataframe that is indexed by nucleus name or i.d., and has at least three
            columns, whose names are controlled by the `pcoln`, `ecoln0` and `ecoln1`
            parameters. The columns encode the nucleus classification labels, first
            embedding dimension value, and second embedding dimension value. Must have
            the same index and no of rows as `feats`.
        savedir: str
            Directory to save model, figures, and other results.
        pcoln: str
            Name of column encoding classification label of nuclei in `clusts`.
        ecoln0: str
            Name of column encoding first embedding value for nuclei in `clusts`.
        ecoln1: str
            Name of column encoding second embedding value for nuclei in `clusts`.
        classes_list: List
            Optional, set of unique classification classes. Extracted automatically
            if not provided.
        fitkwargs: Dict
            kwargs to pass to DecisionTreeRegressor. Default values used in the
            DTALE paper are used if this parameter is not provided.

        """

        # drop nans
        clusts = clusts.dropna(axis=0)
        feats = feats.loc[clusts.index, :]
        feats = feats.dropna(axis=0)
        clusts = clusts.loc[feats.index, :]
        self.feats = feats
        self.clusts = clusts

        # some ground work
        self.pcoln = pcoln
        self.ecoln0 = ecoln0
        self.ecoln1 = ecoln1
        _, y = self._getxy()
        self._e0min = y[:, 0].min()
        self._e0max = y[:, 0].max()
        self._e1min = y[:, 1].min()
        self._e1max = y[:, 1].max()

        # assign params or defaults
        self.classes_list = classes_list or list(
            set(clusts.loc[:, pcoln].tolist()))
        self.fitkwargs = fitkwargs or {
            'random_state': 0,
            'min_samples_leaf': 250,  # best: 250
            'max_depth': 7,  # best: 7
        }

        # init attribs
        self.savedir = savedir
        self.featnames = np.array(feats.columns)
        self.model = None
        self.tree = None
        self.n_nodes = None
        self.pred_y_leafs = None
        self.leafs = None
        self.nodes = None
        self.node_leafs = {}
        self.node_tally = {}

    def _getxy(self):
        X = self.feats.values
        y = self.clusts.loc[:, [self.ecoln0, self.ecoln1]].values
        return X, y

    def fit_model(self):
        """Fit the regression tree and pickle it into ``savedir``.

        The tree is trained to predict the two embedding coordinates from
        the interpretable features. The fitted estimator is kept in
        ``self.model`` and its low-level tree structure in ``self.tree``.
        """
        regressor = self.model = DecisionTreeRegressor(**self.fitkwargs)
        regressor.fit(*self._getxy())
        self.tree = regressor.tree_

        # Persist the model for reproducibility; it can be restored later
        # with pickle.load on the same path.
        model_path = opj(self.savedir, 'dectree.pkl')
        with open(model_path, 'wb') as handle:
            pickle.dump(self.model, handle)

    def apply_model(self):

        self.n_nodes = self.tree.node_count
        self.leafs = np.argwhere(self.tree.children_left == -1)[:, 0].tolist()
        self.nodes = {i for i in range(self.n_nodes)}.difference(self.leafs)

        # Apply to training data
        X, _ = self._getxy()
        self.pred_y_leafs = self.model.apply(X)
        # self.pred_y_vals = self.tree.value[self.pred_y_leafs, :, 0]

    def _find_leaves_in_subtree(self, root, subtrees):
        """find all the leaves enclosed within a subtree."""

        leafs = []

        def _traverse(node):

            # dynamic programming
            if node in subtrees:
                leafs.extend(subtrees[node])
                return
            subtrees[node] = []

            children = (
                self.tree.children_left[node],
                self.tree.children_right[node],
            )
            if children[0] == -1:
                leafs.append(node)
                subtrees[node].append(node)
                return
            for child in children:
                _traverse(child)

        _traverse(node=root)
        subtrees[root] = leafs

        return subtrees

    def set_leafs_for_all_subtrees(self):
        """Get all subleafs enclosed within each node subtree."""
        # traverse from bottom up for dynamic programming speedup
        for nd in range(self.n_nodes - 1, -1, -1):
            self._find_leaves_in_subtree(root=nd, subtrees=self.node_leafs)

    def set_node_tally(self):
        """
        Get a tally of the number of points from each class (as determined
        by the NuCLS model final prediction) for each node.
        """
        self.node_tally = {
            leaf: Counter(self.clusts.loc[self.pred_y_leafs == leaf,
                                          self.pcoln].to_list())
            for leaf in self.leafs
        }
        for node, nlfs in self.node_leafs.items():
            if node in self.leafs:
                continue
            self.node_tally[node] = self.node_tally[nlfs[0]]
            for nlf in nlfs[1:]:
                self.node_tally[node] += self.node_tally[nlf]

    def _get_best_node_for_class(self, cls, metric):
        """
        For one class, find the cluster (node) which overlaps the most
        with the predictions from the NuCLS model.

        Every node in ``self.nodes`` is treated as a binary classifier
        ("point falls in a leaf under this node") and scored against
        "point was predicted as `cls`".

        Parameters
        ----------
        cls
            Class label, compared against the ``self.pcoln`` column.
        metric : str
            Key of the statistic to maximise: ``'MCC'`` or a key of the
            dict returned by ``calc_stats_simple``.

        Returns
        -------
        best_node
            Id of the winning node, or ``None`` when no node scores
            strictly above the starting value (-1 for MCC, 0 otherwise).
        best_stats : dict
            Statistics of the winning node (only the initial `metric`
            entry when there is no winner).
        """
        best_node = None
        best_stats = {metric: -1. if metric == 'MCC' else 0.}

        # Class membership does not depend on the node: compute it once.
        # ``0 + bool_array`` turns the mask into 0/1 integers.
        incls = 0 + (self.clusts.loc[:, self.pcoln] == cls).values

        for node in self.nodes:
            # np.isin replaces the deprecated np.in1d (same result for the
            # 1-D leaf-id array used here)
            innode = 0 + np.isin(self.pred_y_leafs, self.node_leafs[node])
            stats = calc_stats_simple(
                TP=np.sum(innode + incls == 2),
                FP=np.sum(innode - incls == 1),
                TN=np.sum(innode + incls == 0),
                FN=np.sum(innode - incls == -1),
            )
            stats['MCC'] = matthews_corrcoef(y_true=incls, y_pred=innode)
            if stats[metric] > best_stats[metric]:
                best_node = node
                best_stats.update(stats)
        return best_node, best_stats

    def _get_best_node_for_each_class(self, metric='precision'):
        """
        Pick, for every class in ``self.classes_list``, the node that best
        fits/explains the NuCLS model predictions under `metric`.

        Returns two dicts keyed by class: the chosen node and its stats.

        IMPORTANT NOTE:
          Classes are handled INDEPENDENTLY. An early "tumor" node does
          NOT exclude a descendent "mitotic" node. This is EXPECTED and
          unavoidable because nodes are not pure: even the downstream
          "mitotic" node contains some tumor leafs, so excluding it would
          lower the recall of the "tumor" node. Think of the per-class
          paths as independent of one another.
        """
        picks = {
            cls: self._get_best_node_for_class(cls, metric=metric)
            for cls in self.classes_list
        }
        best_nodes = {cls: node for cls, (node, _) in picks.items()}
        best_stats = {cls: stats for cls, (_, stats) in picks.items()}
        return best_nodes, best_stats

    def _trace_from_node_to_root(self, node):
        trace = [node]
        direction = [0]
        current_node = node
        keep_going = True
        while keep_going:
            left = np.argwhere(self.tree.children_left == current_node)
            right = np.argwhere(self.tree.children_right == current_node)
            if len(left) > 0:
                current_node = left[0, 0]
                trace.append(current_node)
                direction.append(-1)
            elif len(right) > 0:
                current_node = right[0, 0]
                trace.append(current_node)
                direction.append(1)
            else:
                keep_going = False
        return trace, direction

    def save_dectree_traces(self, best_nodes, best_stats, postfix=''):
        """Save decision tree traces for relevant classes."""

        node_trace = {}
        direction_trace = {}
        feat_trace = {}
        impurity_trace = {}
        nsize_trace = {}
        thresh_trace = {}
        nice_trace = {}

        for cls in self.classes_list:

            # track from node to root
            ntrace, dtrace = self._trace_from_node_to_root(best_nodes[cls])
            ntrace, dtrace = ntrace[::-1], dtrace[::-1]
            node_trace[cls], direction_trace[cls] = ntrace, dtrace

            # map nodes to feature names and thresholds
            feat_trace[cls] = self.featnames[
                self.tree.feature[ntrace]].tolist()
            impurity_trace[cls] = self.tree.impurity[ntrace].tolist()
            nsize_trace[cls] = self.tree.n_node_samples[ntrace].tolist()
            thresh_trace[cls] = self.tree.threshold[ntrace].tolist()

            # render into nice text
            descr = '\nDECISIONS:\n'
            descr += "--------------\n"
            for nix in range(len(ntrace) - 1):
                dhere = ' '.join([
                    feat_trace[cls][nix], '<=' if dtrace[nix] == -1 else '>',
                    '%.1f' % thresh_trace[cls][nix]
                ])
                descr += dhere + '\n'
            descr += f'\nSTATS:\n'
            descr += "--------------\n"
            descr += '\n'.join(
                [f'{st}: %.2f' % stv
                 for st, stv in best_stats[cls].items()]) + '\n'
            nice_trace[cls] = descr

        # parse into a dict and pickle
        with open(opj(self.savedir, f'dectree_traces{postfix}.pkl'),
                  'wb') as f:
            pickle.dump(
                {
                    'features': feat_trace,
                    'thresholds': thresh_trace,
                    'impurity': impurity_trace,
                    'nodes': node_trace,
                    'direction': direction_trace,
                    'node_n_samples': nsize_trace,
                    'nice': nice_trace,

                    # How well the "chosen" traces from our decision tree
                    # fit/explain the NuCLS model predictions. For example, a
                    # precision of 0.9 for 'tumor' means that 90% of the nuclei
                    # predicted as 'tumor' by our decision tree are also predicted
                    # as 'tumor' by the NuCLS model.
                    'fit_stats_to_NuCLS_model': best_stats,
                },
                f)

        # save nice rendered text for relevant parts of tree
        with open(opj(self.savedir, f'dectree_nice{postfix}.txt'), 'w') as f:
            for cls in self.classes_list:
                f.write("***********************************\n"
                        f"{cls}\n"
                        "***********************************\n")
                f.write(nice_trace[cls] + '\n')

    def visualize_decision_tree_nodes(self, best_nodes, postfix=''):
        """Visualize the learned decision tree nodes in embedding space.

        Draws, in this order: the actual embedding points as a faint
        background, every parent/child link of the tree (each node placed
        at its predicted embedding value), the root node, and finally for
        each class the path from its best node up to the root plus the
        best node itself in the class colour. The figure is written to
        ``dectree{postfix}.png`` in ``self.savedir``.

        The figure is closed once saved (or if drawing fails) so repeated
        calls do not accumulate open matplotlib figures.

        Parameters
        ----------
        best_nodes : dict
            Class -> id of the node chosen to represent that class.
        postfix : str
            Appended to the output file name and shown in the title.
        """
        fig = plt.figure(figsize=(7, 7))
        try:
            # scatter actual points from NuCLS model in background
            _, y = self._getxy()
            plt.scatter(y[:, 0],
                        y[:, 1],
                        c='beige',
                        alpha=0.6,
                        s=4,
                        edgecolors='none')

            # trace the learned decision tree
            for node in range(self.tree.node_count):
                if self.tree.children_left[node] == -1:
                    continue  # leaves have no outgoing links
                me = self.tree.value[node, :, 0]
                clt = self.tree.value[self.tree.children_left[node], :, 0]
                crt = self.tree.value[self.tree.children_right[node], :, 0]
                plt.plot(
                    [clt[0], me[0], crt[0]],
                    [clt[1], me[1], crt[1]],
                    color='gray',
                    marker='.',
                    linestyle='-',
                    linewidth=0.5,
                    markersize=3,
                    alpha=0.5,
                )

            # highlight root node
            me = self.tree.value[0, :, 0]
            plt.scatter([me[0]], [me[1]],
                        color='k',
                        s=30,
                        alpha=1.,
                        edgecolors='k')

            # color best (class-representative) nodes by class
            for cls, node in best_nodes.items():

                me = self.tree.value[node, :, 0]

                # color the trace along the decision tree till best node
                trace, _ = self._trace_from_node_to_root(node)
                for ndi in range(len(trace) - 1):
                    clt = self.tree.value[trace[ndi], :, 0]
                    crt = self.tree.value[trace[ndi + 1], :, 0]
                    plt.plot(
                        [clt[0], crt[0]],
                        [clt[1], crt[1]],
                        color='k',
                        alpha=1.,
                        marker='o',
                        markersize=2.5,
                        linestyle='-',
                        linewidth=1.3,
                    )

                # highlight actual chosen best node
                color = np.array(
                    VisConfigs.CATEG_COLORS[cls])[None, :] / 255.
                plt.scatter([me[0]], [me[1]],
                            color=color,
                            s=150,
                            alpha=1.,
                            edgecolors='none')

            # fixed limits keep all DTALE figures directly comparable
            plt.xlim(self._e0min, self._e0max)
            plt.ylim(self._e1min, self._e1max)
            plt.title(f'DTALE nodes ({postfix})',
                      fontsize=14,
                      fontweight='bold')
            plt.savefig(opj(self.savedir, f'dectree{postfix}.png'))
        finally:
            plt.close(fig)

    def visualize_decision_tree_classes(self,
                                        best_nodes,
                                        classes_list=None,
                                        restrict_to_pcateg=False,
                                        exclude_leafs=None,
                                        savedir=None,
                                        postfix=''):
        """Visualize embeddings, colors by class predicted by decision tree.

        For every class, the points routed to a leaf under the class's
        best node are scattered in the class colour. The figure is saved
        as ``dectreeCol{postfix}.png`` and then closed, so repeated calls
        do not accumulate open matplotlib figures.

        Parameters
        ----------
        best_nodes : dict
            Class -> id of the node representing that class.
        classes_list : list, optional
            Classes to plot; defaults to ``self.classes_list``.
        restrict_to_pcateg : bool
            When True, only keep points whose NuCLS prediction (column
            ``self.pcoln``) equals the class being plotted.
        exclude_leafs : optional
            Index into the training points (despite the name, these are
            point positions such as the ones this method returns, not
            leaf ids) to leave out of the plot.
        savedir : str, optional
            Output directory; defaults to ``self.savedir``.
        postfix : str
            Appended to the output file name and shown in the title.

        Returns
        -------
        list
            Positions of all points that were plotted.
        """
        classes_list = classes_list or self.classes_list
        savedir = savedir or self.savedir

        init_point_size = 10.
        point_size_ds = 1.
        alphas = [0.8, 0.5]
        _, y = self._getxy()

        fig = plt.figure(figsize=(7, 7))
        try:
            point_size = init_point_size
            # classes plotted later get progressively more transparent
            alphas = np.linspace(alphas[0], alphas[1], len(classes_list))

            # keep track of plotted indices to be able to exclude
            # downstream nodes when plotting upstream ones when relevant
            kept_idxs = []

            for clno, cls in enumerate(classes_list):

                # maybe restrict to points predicted as this class by
                # NuCLS; use the configured label column rather than a
                # hard-coded 'pred_categ'
                keep1 = None
                if restrict_to_pcateg:
                    keep1 = (self.clusts.loc[:, self.pcoln] == cls).values

                # restrict to downstream leafs to node of interest
                # (np.isin replaces the deprecated np.in1d)
                keep2 = np.isin(self.pred_y_leafs,
                                self.node_leafs[best_nodes[cls]])
                if keep1 is None:
                    keep = keep2
                else:
                    keep = keep1 & keep2

                # maybe exclude certain points
                if exclude_leafs is not None:
                    keep[exclude_leafs] = False

                # keep track of kept idxes
                kept_idxs.extend(np.argwhere(keep)[:, 0].tolist())

                # now restrict to points of interest
                y_subset = y[keep, :]

                # plot
                plt.scatter(y_subset[:, 0],
                            y_subset[:, 1],
                            c=np.array(
                                VisConfigs.CATEG_COLORS[cls])[None, :] /
                            255.,
                            alpha=alphas[clno],
                            s=point_size,
                            edgecolors='none')

                point_size = point_size_ds * point_size

            plt.xlim(self._e0min, self._e0max)
            plt.ylim(self._e1min, self._e1max)
            plt.title(f'DTALE decisions ({postfix})',
                      fontsize=14,
                      fontweight='bold')
            plt.savefig(opj(savedir, f'dectreeCol{postfix}.png'))
        finally:
            plt.close(fig)

        return kept_idxs

    def save_and_plot_optimized_decision_paths(self):
        """
        Select, save and plot the per-class best nodes under two metrics.

        Different metrics emphasize different things learned:
        - F1 score: the typical case (most tumor nuclei in the dataset)
          VERSUS ...
        - precision: the most discriminative case (textbook examples).
        F-1 surfaces the nodes that correlate with how the NuCLS model
        makes its "average" decision, whereas precision shows when the
        model decides it is "sure" that something is, say, a tumor
        nucleus.
        """
        for metric in ('F1', 'precision'):

            print(f'  Optimized for {metric}')

            # per class: the node that best fits/explains the NuCLS model
            # predictions according to the metric of choice
            best_nodes, best_stats = self._get_best_node_for_each_class(
                metric=metric)
            postfix = f'_OptimizedFor{metric}'

            # traces to disk, then the tree itself, then the points
            # belonging to each class's best node
            self.save_dectree_traces(best_nodes=best_nodes,
                                     best_stats=best_stats,
                                     postfix=postfix)
            self.visualize_decision_tree_nodes(best_nodes=best_nodes,
                                               postfix=postfix)
            self.visualize_decision_tree_classes(best_nodes=best_nodes,
                                                 postfix=postfix)

    def plot_step_by_step_paths(self):
        """Plot each class's precision-optimized path one node at a time.

        Reads the traces pickle written for the precision metric, then for
        every class walks its path from the best node back towards the
        root, saving one figure per node into a per-class sub-directory of
        ``self.savedir``. Points already drawn for an earlier (more
        downstream) node are excluded from the later figures.
        """
        # The pickle is the one this class writes itself in
        # save_dectree_traces; do not point this at untrusted files.
        traces_path = opj(self.savedir,
                          'dectree_traces_OptimizedForprecision.pkl')
        with open(traces_path, 'rb') as handle:
            traces = pickle.load(handle)

        for cls in self.classes_list:
            class_dir = opj(self.savedir, cls)
            maybe_mkdir(class_dir)

            already_plotted = []
            # stored root -> node; reversed so the best node comes first
            node_to_root = traces['nodes'][cls][::-1]
            for step, node in enumerate(node_to_root):
                newly_plotted = self.visualize_decision_tree_classes(
                    best_nodes={cls: node},
                    classes_list=[cls],
                    restrict_to_pcateg=True,
                    exclude_leafs=already_plotted,
                    savedir=class_dir,
                    postfix=f'_{cls}_nodeidx-{step}({node})',
                )
                already_plotted.extend(newly_plotted)

    def run_sequence(self):
        """Main workflow: fit, parse the tree, then save and plot paths."""
        # Each stage: the progress message followed by the steps it
        # announces. set_node_tally is intentionally not part of the run.
        stages = (
            ('DTALE: Fitting model ...',
             (self.fit_model, self.apply_model)),
            ('DTALE: Parsing tree ...',
             (self.set_leafs_for_all_subtrees,)),
            ('DTALE: Saving and plotting optimized decision paths ...',
             (self.save_and_plot_optimized_decision_paths,
              self.plot_step_by_step_paths)),
        )
        for banner, steps in stages:
            print(banner)
            for step in steps:
                step()
Exemple #30
0
class CARTMethod(Method):
    """Tree-based synthesiser for a single column.

    A decision tree (classifier for dtypes in ``CAT_COLS_DTYPES``,
    regressor for dtypes in ``NUM_COLS_DTYPES``) is fitted on the
    predictors. Predictions are not the tree's own outputs: for every test
    row a value is drawn at random from the training targets observed in
    the leaf that row lands in.
    """

    def __init__(self,
                 dtype,
                 smoothing=False,
                 proper=False,
                 minibucket=5,
                 random_state=None,
                 *args,
                 **kwargs):
        """Store the settings and create the underlying tree.

        ``minibucket`` is used as the tree's ``min_samples_leaf``. Extra
        positional/keyword arguments are accepted and ignored. No tree is
        created when ``dtype`` is in neither dtype collection.
        """
        self.dtype = dtype
        self.smoothing = smoothing
        self.proper = proper
        self.minibucket = minibucket
        self.random_state = random_state

        tree_settings = {
            'min_samples_leaf': self.minibucket,
            'random_state': self.random_state,
        }
        if self.dtype in CAT_COLS_DTYPES:
            self.cart = DecisionTreeClassifier(**tree_settings)
        if self.dtype in NUM_COLS_DTYPES:
            self.cart = DecisionTreeRegressor(**tree_settings)

    def fit(self, X_df, y_df):
        """Fit the tree and remember the targets observed in each leaf."""
        if self.proper:
            # module-level helper of the same name as the flag
            # (presumably resamples the training data -- TODO confirm)
            X_df, y_df = proper(X_df=X_df,
                                y_df=y_df,
                                random_state=self.random_state)

        X_df, y_df = self.prepare_dfs(X_df=X_df,
                                      y_df=y_df,
                                      normalise_num_cols=False,
                                      one_hot_cat_cols=True)

        # observed range of numeric targets, used when smoothing
        if self.dtype in NUM_COLS_DTYPES:
            self.y_real_min, self.y_real_max = np.min(y_df), np.max(y_df)

        X, y = X_df.to_numpy(), y_df.to_numpy()
        self.cart.fit(X, y)

        # save the y distribution wrt trained tree nodes
        def _last_column(group):
            return group.to_numpy()[:, -1]

        leaf_of_row = self.cart.apply(X)
        leaves_y_df = pd.DataFrame({'leaves': leaf_of_row, 'y': y})
        per_leaf_targets = leaves_y_df.groupby('leaves').apply(_last_column)
        self.leaves_y_dict = per_leaf_targets.to_dict()

    def predict(self, X_test_df):
        """Sample one value per test row from its leaf's training targets.

        Sampling is done with replacement via ``np.random.choice``. For
        numeric dtypes with ``smoothing`` enabled the sampled values are
        passed through ``smooth`` before being returned.
        """
        X_test_df, _ = self.prepare_dfs(X_df=X_test_df,
                                        normalise_num_cols=False,
                                        one_hot_cat_cols=True,
                                        fit=False)

        # predict the leaves and for each leaf randomly sample from the
        # observed values
        leaves_pred = self.cart.apply(X_test_df.to_numpy())
        n_rows = len(leaves_pred)
        y_pred = np.zeros(n_rows, dtype=object)

        def _last_column(group):
            return group.to_numpy()[:, -1]

        # group the row positions by the leaf they were routed to
        leaves_pred_index_df = pd.DataFrame({
            'leaves_pred': leaves_pred,
            'index': range(n_rows)
        })
        rows_by_leaf = leaves_pred_index_df.groupby('leaves_pred').apply(
            _last_column).to_dict()

        for leaf, indices in rows_by_leaf.items():
            y_pred[indices] = np.random.choice(self.leaves_y_dict[leaf],
                                               size=len(indices),
                                               replace=True)

        if self.smoothing and self.dtype in NUM_COLS_DTYPES:
            y_pred = smooth(self.dtype, y_pred, self.y_real_min,
                            self.y_real_max)

        return y_pred
Exemple #31
0
class DecisionTreeCounterfactual:
    """
    Counterfactual estimation using a decision tree.

    Given explanatory variables X, target variable y and treatment variable W, 
    this class implements an individual counterfactual estimation model. 
    We can break down the process in three steps:

    1 - model step) Fit a decision tree to X and y
    2 - comparison step) at each of the tree's leaves, compare W and y to determine the counterfactuals for the leaf 
    3 - prediction step) assign new samples to a leaf, and predict counterfactuals

    Parameters
    ----------

    model : object, optinal (default=None)

    Tree-based model which implements sklearn's API, particularly the .apply() method.
    Must be already configured.

    If None, model will be DecisionTreeRegressor(min_samples_leaf=100).

    min_sample_effect : int, optional (default=10)

    The minimum number of samples in a neighborhood to deem a counterfactual estimate valid, for a given W. 
    If there's less treated/untreated elements than min_sample_effect, the counterfactual will be NaN.

    save_explanatory : bool, optional (default=False)

    Save explanatory variables for explaining predictions. May cause large memory overhead.

    random_state : int, optional (default=None)

    If int, random_state is the seed used by the random number generator;
    If RandomState instance, random_state is the random number generator;
    If None, the random number generator is the RandomState instance used
    by `np.random`.
    
    """

    # initializing
    def __init__(self,
                 model=None,
                 min_sample_effect=10,
                 save_explanatory=False,
                 random_state=None):

        # storing model
        if model == None:
            self.model = DecisionTreeRegressor(min_samples_leaf=100)
        else:
            self.model = model

        # storing variables
        self.min_sample_effect = int(min_sample_effect)
        self.random_state = random_state
        self.save_explanatory = save_explanatory

    def _test_treatment_linear_discriminative_power(self, leaf_df):
        """
        Check, for the elements of one leaf, how well a linear model can
        predict the treatment from the features. The better it does, the
        less the treatment looks randomly assigned within the leaf.

        Parameters
        ----------

        leaf_df : pd.DataFrame

        Leaf rows with features (X), treatment assignments (column 'W')
        and target (column 'y')

        Returns
        -------

        return : float

        AUC of the treatment-assignment model on the leaf (weighted
        one-vs-rest average when there are more than two treatments)

        """

        # standardized features; treatment and target columns are left out
        treatments = leaf_df['W']
        covariates = StandardScaler().fit_transform(
            leaf_df.drop(['W', 'y'], axis=1))

        # fit and score on the same leaf data
        classifier = LogisticRegression(solver='lbfgs')
        classifier.fit(covariates, treatments)
        probabilities = classifier.predict_proba(covariates)

        # Two classes: keep only the second probability column so this is
        # scored as a plain binary problem. With more classes the full
        # matrix is handed to roc_auc_score, which handles it itself.
        if probabilities.shape[1] == 2:
            probabilities = probabilities[:, 1]

        return roc_auc_score(treatments,
                             probabilities,
                             multi_class='ovr',
                             average='weighted')

    def _compute_treatment_confounding(self, filtered_train_df):
        """
        Score, leaf by leaf, how predictable the treatment assignment is.

        Parameters
        ----------

        filtered_train_df : pd.DataFrame

        Subset of the training dataframe holding the elements of leaves
        whose effects are valid (given the min_sample_effect parameter)

        Returns
        -------

        confounding_df: pd.DataFrame

        One row per leaf with its confounding score

        """

        # one score per leaf from the linear discriminative-power test
        per_leaf_score = filtered_train_df.groupby('leaf').apply(
            self._test_treatment_linear_discriminative_power)
        confounding_df = per_leaf_score.to_frame(name='confounding_score')

        # two-level column labels so the frame lines up with the other
        # (multi-indexed) pieces of the final dataframe
        confounding_df.columns = pd.MultiIndex.from_tuples(
            [('confounding_score', '')])

        return confounding_df

    def _compute_leaf_counterfactuals(self, filtered_train_df):
        """
        Compute counterfactuals for each valid leaf

        Parameters
        ----------
        
        filtered_train_df : pd.DataFrame
        
        Subset of training dataframe for elements on leaves that effects are valid (given min_sample_effect parameter)

        Returns
        -------

        leaf_counterfactual_df : pd.DataFrame

        Dataframe with expected outcomes for each treatment

        """

        # computing avg outcomes for each treatment
        leaf_counterfactual_df = (filtered_train_df.pivot_table(
            values='y', columns='W',
            index='leaf').reset_index().set_index('leaf'))

        # fomatting column names
        leaf_counterfactual_df.columns = (pd.MultiIndex.from_product(
            [['avg_outcome'], leaf_counterfactual_df.columns],
            names=[None, 'W']))

        return leaf_counterfactual_df

    def _compute_feature_dispersion(self, train_df):
        """
        Computes feature dispersion between treatments in leaves, to help diagnosing if effects are valid

        Parameters
        ----------
        
        train_df : pd.DataFrame
        
        Training dataframe, as stored using the "save_explanatory=True" parameter

        Returns
        -------

        feat_dispersion : pd.DataFrame

        Difference in percentiles between elements with different treatment in each leaf.

        """

        # computing rank (percentiles) for each feature
        # and pivot by treatment to show user
        feat_percentiles_pivot = (train_df.set_index(['leaf', 'W']).drop(
            ['y'], axis=1).rank(pct=True).pivot_table(index='leaf',
                                                      columns='W').dropna())

        # putting levels to same column to match final output #
        # add prefix to first level
        level_0 = ('percentile_' +
                   feat_percentiles_pivot.columns.get_level_values(0))

        # second level stays the same
        level_1 = (feat_percentiles_pivot.columns.get_level_values(1))

        # applying to df
        feat_percentiles_pivot.columns = (pd.MultiIndex.from_arrays(
            [level_0, level_1]))

        return feat_percentiles_pivot

    # fit model
    def fit(self, X, W, y, verbose=0):
        """
        Get counterfactual estimates given explanatory variables X, treatment variable W and target y
        This method fits the underlying model from X to y and stores, for every
        leaf, the average outcome observed under each distinct value of W

        Parameters
        ----------

        X : pd.DataFrame of shape = [n_samples, n_features]

        Data with explanatory variables, with possible confounders of treatment assignment and effect.
        Must be a DataFrame: its ``columns`` are stored and ``assign`` is used on it.

        W : array-like, shape = [n_samples]

        Treatment variable. The model will try to estimate a counterfactual outcome for each unique value in this variable.
        Should not exceed 10 unique values.

        y: array-like, shape = [n_samples]

        Target variable.

        verbose : int, optional (default=0)

        Verbosity level. Not used by this method.

        Returns
        -------

        self: object

        Raises
        ------

        ValueError
            If W has more than 10 unique values.

        """

        # refuse treatments with too many levels before doing any work
        if len(np.unique(W)) > 10:
            raise ValueError(
                'More than 10 unique values for W. Too many unique values will make the process very expensive.'
            )

        # fitting the model
        self.model.fit(X, y)

        # storing column names
        self.col_names = X.columns

        # leaf assignment is computed once and shared by both tables below
        leaves = self.model.apply(X)

        # saving explanatory variables, if applicable
        if self.save_explanatory:
            self.train_df = X.assign(leaf=leaves, W=W, y=y)

        # outcome sum and row count for every (leaf, treatment) pair
        leaf_stats = (pd.DataFrame({
            'leaf': leaves,
            'y': y,
            'W': W
        }).assign(count=1).groupby(['leaf', 'W']).sum())

        # turn sums into averages, then blank out estimates backed by too few
        # samples; dividing first gives a float column, so the NaN never has
        # to be written into an integer column
        counts = leaf_stats['count']
        leaf_stats['y'] = (leaf_stats['y'] / counts).mask(
            counts < self.min_sample_effect)

        self.leaf_counterfactual_df = leaf_stats

        # return self
        return self

    # method for predicting counterfactuals
    def predict(self, X, verbose=0):
        """
        Predict counterfactual outcomes for X.
        New samples are run through the fitted model, and each one receives
        the per-treatment average outcomes stored for the leaf it lands in

        Parameters
        ----------

        X : pd.DataFrame of shape = [n_samples, n_features]

        Data with explanatory variables, with possible confounders of treatment assignment and effect.
        Its index labels the rows of the returned frame.

        verbose : int, optional (default=0)

        Verbosity level. Not used by this method.

        Returns
        -------

        counterfactual_df : pd.DataFrame

        Counterfactual outcomes per sample: one row per index label of X
        and one ('y_hat', treatment) column per treatment value.

        """

        # leaf reached by every sample, keyed by the sample's index label
        assigned_leaves = self.model.apply(X)
        leaves_score = pd.DataFrame({'leaf': assigned_leaves, 'id': X.index})

        # flatten the stored per-leaf outcomes so they can be joined on 'leaf'
        leaf_outcomes = self.leaf_counterfactual_df.reset_index()

        # attach every treatment's outcome to each sample, then spread the
        # treatments across columns
        long_form = leaves_score.merge(leaf_outcomes, how='left')
        counterfactual_df = long_form.pivot(index='id',
                                            columns='W',
                                            values='y')

        # nest the treatment labels under a shared 'y_hat' top level
        treatment_labels = counterfactual_df.columns
        counterfactual_df.columns = pd.MultiIndex.from_product(
            [['y_hat'], treatment_labels], names=[None, 'W'])

        return counterfactual_df

    # running CV for model parameters
    def get_cross_val_scores(self, X, y, scoring=None, verbose=0):
        """
        Estimate model generalization power with shuffled 5-fold CV.

        Parameters
        ----------

        X : array-like or sparse matrix of shape = [n_samples, n_features]

        Data with explanatory variables, with possible confounders of treatment assignment and effect.

        y: array-like, shape = [n_samples]

        Target variable.

        scoring : string, callable or None, optional, default: None

        Forwarded unchanged as the ``scoring`` argument of
        cross_val_score: a string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)`` which should return only
        a single value.

        If None, the estimator's default scorer (if available) is used.

        verbose : int, optional (default=0)

        Verbosity level, forwarded to cross_val_score.

        Returns
        -------

        scores : array of float
        Scores of the estimator, one per cross validation fold.

        """

        # shuffled 5-fold splitter, seeded with the instance's random_state
        folds = KFold(n_splits=5, shuffle=True, random_state=self.random_state)

        # score the wrapped model on every fold and hand the scores back
        return cross_val_score(self.model,
                               X,
                               y,
                               scoring=scoring,
                               cv=folds,
                               verbose=verbose)

    def run_leaf_diagnostics(self):
        """
        Run leaf diagnostics, showing confounding score, feature distributions and counterfactuals for each leaf.

        Requires the training samples stored by fitting with
        save_explanatory=True.

        Returns
        -------

        leaf_diagnostics_df : pd.DataFrame

        Dataframe with leaf diagnostics: the average outcomes, feature
        percentiles and confounding score, joined on the leaves present
        in all three.

        Raises
        ------

        ValueError
            If no training samples were stored on the instance.

        """

        # diagnostics are computed from the stored training samples
        train_df = getattr(self, 'train_df', None)
        if train_df is None:
            raise ValueError(
                'Model did not store training samples to run diagnostics on. Setting save_explanatory=True will solve the issue'
            )

        # first, we calculate only where effects are valid #

        leaf_stats = self.leaf_counterfactual_df
        row_leaves = leaf_stats.index.get_level_values('leaf')

        # leaves holding an invalidated (NaN) estimate for some treatment
        nan_leaves = row_leaves[leaf_stats['y'].isnull().to_numpy()]

        # leaves where only one kind of assignment was observed
        treatments_per_leaf = leaf_stats.groupby('leaf').size()
        single_assignment_leaves = treatments_per_leaf.index[
            (treatments_per_leaf == 1).to_numpy()]

        # filtering train df out of invalid leaves
        mask_invalid_leaves = (train_df['leaf'].isin(nan_leaves) |
                               train_df['leaf'].isin(single_assignment_leaves))
        filtered_train_df = train_df.loc[~mask_invalid_leaves]

        # then, we calculate quantities like #
        # confounding, dispersion and counterfactuals #
        # for each leaf, so we can perform criticism #

        # computing discriminative power
        confounding_df = self._compute_treatment_confounding(filtered_train_df)

        # computing leaf effects
        leaf_counterfactual_df = self._compute_leaf_counterfactuals(
            filtered_train_df)

        # computing feature dispersion (on the full training frame)
        feat_percentiles_df = self._compute_feature_dispersion(train_df)

        # leaf diagnostics df: the inner join keeps only leaves present in
        # every table; no `levels` is passed because no `keys` are given
        dfs = [leaf_counterfactual_df, feat_percentiles_df, confounding_df]
        leaf_diagnostics_df = pd.concat(dfs, axis=1, join='inner')

        return leaf_diagnostics_df

    # method for explaining predictions
    def explain(self, sample):
        """
        Explain predictions of counterfactual outcomes for a sample.
        This method returns the stored training samples that fall in the same
        leaf as the sample, so you can trust and explain counterfactual
        predictions to others

        Parameters
        ----------

        sample : array-like or sparse matrix of shape = [1, n_features]

        Sample that you want to get explanations for. When several rows are
        passed, comparables of every leaf they reach are returned together.

        Returns
        -------

        comparables_table : pd.DataFrame

        Table of comparable elements: the matching rows of the stored
        training dataframe, without the 'leaf' column.

        Raises
        ------

        ValueError
            If the model was configured with save_explanatory=False, so no
            training samples are available.

        """

        # nothing to compare against unless training samples were stored;
        # checked first so no tree traversal is wasted
        if not self.save_explanatory:
            raise ValueError(
                'Model did not store training samples to get explanations from. Setting save_explanatory=True will solve the issue'
            )

        # checking which leaf (or leaves) the sample is assigned to
        sample_leaves = np.ravel(self.model.apply(sample))

        # querying comparables with a boolean mask rather than a formatted
        # query string, which only parses for a single leaf id
        in_same_leaf = self.train_df['leaf'].isin(sample_leaves)
        comparables_table = self.train_df.loc[in_same_leaf].drop('leaf',
                                                                 axis=1)

        # returning comparables table
        return comparables_table