def samples_per_leaf_node_ensemble(meta_m, m):

    path_out_tr = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/samples_leaf_node_ensemble_train.csv"
    path_out_te = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/samples_leaf_node_ensemble_test.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(X, Y, meta_m, train_size=0.60,
                                                                               random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    n_estimators = [50]
    samples_per_leaf = [1000]

    header = ["ntree", "nnode", "tb_per_px"]
    with open(path_out_tr, "w", newline="") as wtr:
        with open(path_out_te, "w", newline="") as wte:
            writer_tr = csv.writer(wtr, delimiter=";")
            writer_tr.writerow(header)
            writer_te = csv.writer(wte, delimiter=";")
            writer_te.writerow(header)

            for spl in samples_per_leaf:
                for n_esti in n_estimators:
                    print()
                    print("Analysis: RF with Skewed Leaves")
                    print("Samples per leaf node: ", spl)
                    print("Number of estimators: ", n_esti)
                    print("-" * 50)

                    ensemble = RandomForestRegressor(n_estimators=n_esti, min_samples_leaf=spl, bootstrap=True)
                    ensemble.fit(xtrain, ytrain)
                    leaves_train = ensemble.apply(xtrain)
                    dicori_train = samples_per_leaf_node(leaves_train, xtrain, ytrain)

                    for key in sorted(dicori_train.keys()):
                        samples = [sam[0] for sam in dicori_train[key]]
                        writer_tr.writerow([key[0], key[1]] + samples)

                    leaves_test = ensemble.apply(xtest)
                    dicori_test = samples_per_leaf_node(leaves_test, xtest, ytest)

                    for key in sorted(dicori_test.keys()):
                        samples = [sam[0] for sam in dicori_test[key]]
                        writer_te.writerow([key[0], key[1]] + samples)
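The helper samples_per_leaf_node used throughout these examples is not included in this listing. A minimal sketch consistent with its call sites (keys of (tree index, leaf id), values whose first element sam[0] is the target) might look like this:

def samples_per_leaf_node(leaves, x, y):
    # Hypothetical reconstruction: group training samples by the leaf they
    # reach in each tree. `leaves` is the (n_samples, n_estimators) array
    # returned by ensemble.apply(x).
    groups = {}
    for tree_idx in range(leaves.shape[1]):
        for sample_idx, node_id in enumerate(leaves[:, tree_idx]):
            key = (tree_idx, node_id)
            groups.setdefault(key, []).append((y[sample_idx], x[sample_idx]))
    return groups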
Example #2
class stack_model(BaseEstimator):
    
    def __init__(self, cols1=None, cols2=None):
        self.rf_stack = RandomForestRegressor(n_estimators=100, max_features=None, max_depth=5,
                                              min_impurity_decrease=0.0, min_samples_split=10,
                                              min_samples_leaf=10, bootstrap=True, random_state=42)
        self.lm_stack = Lasso(alpha=0.001, normalize=True, max_iter=1000, random_state=42)
        self.cols1 = cols1
        self.cols2 = cols2
        self.lm_models = {}
    
    def get_params(self, deep=True):
        return {'cols1' : self.cols1, 'cols2' : self.cols2}

    def set_params(self, **parameters):
        self.cols1 = parameters['cols1']
        self.cols2 = parameters['cols2']        
        return self

    def fit(self, df, y):
        
        cols1 = list(df.columns) if self.cols1 is None else self.cols1
        cols2 = list(df.columns) if self.cols2 is None else self.cols2
            
        self.rf_stack.fit(df[cols1],y)
        leaf = self.rf_stack.apply(df[cols1])
        
        #one lm model for every rf estimator and leaf
        for f_idx in range(leaf.shape[1]):
            for leaf_num, idxs in pd.DataFrame(leaf[:,f_idx]).reset_index().groupby(0):
                idxs = idxs['index'].values
                df_leaf = df[cols2].iloc[idxs].copy()
                y_leaf = y.iloc[idxs].copy()
                lm_model = clone(self.lm_stack)
                lm_model.fit(df_leaf,y_leaf)
                self.lm_models[(f_idx, leaf_num)] = lm_model
                
        return self
    
    def predict(self, df):
        
        cols1 = list(df.columns) if self.cols1 is None else self.cols1
        cols2 = list(df.columns) if self.cols2 is None else self.cols2

        leaf = self.rf_stack.apply(df[cols1])
        stack_preds = np.zeros_like(leaf, dtype=float)
        
        #predict using lm models for every rf estimator and leaf
        for f_idx in range(leaf.shape[1]):
            for leaf_num, idxs in pd.DataFrame(leaf[:,f_idx]).reset_index().groupby(0):
                idxs = idxs['index'].values
                df_leaf = df[cols2].iloc[idxs].copy()
                lm_model = self.lm_models[(f_idx, leaf_num)]
                leaf_pred = lm_model.predict(df_leaf)
                stack_preds[idxs,f_idx] = leaf_pred
        
        y_pred = stack_preds.mean(axis = 1)
        return y_pred
        
#check_estimator(stack_model)
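A minimal smoke test for stack_model on synthetic data, assuming numpy, pandas, and scikit-learn < 1.2 (where Lasso still accepts normalize=True); predicting on the training frame guarantees every leaf already has a fitted linear model:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(500, 4), columns=list("abcd"))
y = pd.Series(df["a"] * 2 + df["b"] + 0.1 * rng.randn(500))

model = stack_model(cols1=["a", "b"], cols2=["c", "d"])
model.fit(df, y)
print(model.predict(df)[:5])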
Example #3
    def _data_clusterings(self, data_z, data_p, data_y):
        ''' Returns the centers and precisions of an epsilon cover of gaussians.
        Currently the centers are just an epsilon grid and the precisions 1/(3*epsilon),
        i.e. the standard deviation is 3 times the distance between two grid points.
        Later this function will implement the tree-style splitting and return
        an epsilon cover more tailored to the data.'''
        if self._cluster_type == 'forest':
            from sklearn.ensemble import RandomForestRegressor
            dtree = RandomForestRegressor(n_estimators=self._num_trees, max_leaf_nodes=self._n_critics, min_samples_leaf=self._min_cluster_size)
            dtree.fit(data_z, data_p)
            cluster_labels = dtree.apply(data_z)
            #dtree.fit(data_z, data_y)
            #cluster_labels = np.concatenate((cluster_labels, dtree.apply(data_z)), axis=1)
            cluster_ids = [np.unique(cluster_labels[:, c]) for c in range(cluster_labels.shape[1])]
        elif self._cluster_type == 'kmeans':
            from sklearn.cluster import KMeans
            kmeans = KMeans(n_clusters=self._n_critics).fit(data_z)
            cluster_labels = kmeans.labels_.reshape(-1, 1)
            cluster_ids = [np.unique(cluster_labels)]
        elif self._cluster_type == 'random_points':
            center_ids = np.random.choice(np.arange(data_z.shape[0]), size=self._n_critics, replace=False)
            cluster_labels = np.zeros((data_z.shape[0], self._n_critics))
            cluster_ids = np.ones((self._n_critics, 1))
            for it, center in enumerate(center_ids):
                distances = np.linalg.norm(data_z - data_z[center], axis=1)
                cluster_members = np.argsort(distances)[:self._min_cluster_size]
                cluster_labels[cluster_members, it] = 1
        else:
            raise Exception("Unknown option {}".format(self._cluster_type))

        #z_min = np.percentile(data_z, 0) - self._epsilon
        #z_max = np.percentile(data_z, 100) + self._epsilon
        #center_grid = np.arange(z_min, z_max, self._epsilon)
        #precision_grid = np.ones(center_grid.shape[0]) / (3 * self._epsilon)
        return cluster_labels, cluster_ids
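To make the 'forest' branch concrete: each column of RandomForestRegressor.apply is one tree's leaf assignment, i.e. one clustering of the data. A self-contained sketch on synthetic data (names and sizes are illustrative):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
data_z = rng.randn(200, 2)
data_p = data_z[:, 0] + 0.1 * rng.randn(200)

forest = RandomForestRegressor(n_estimators=3, max_leaf_nodes=5,
                               min_samples_leaf=10).fit(data_z, data_p)
cluster_labels = forest.apply(data_z)  # shape (200, 3): one clustering per tree
cluster_ids = [np.unique(cluster_labels[:, c])
               for c in range(cluster_labels.shape[1])]
print(cluster_labels.shape, [len(ids) for ids in cluster_ids])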
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    # Ynz, Xnz = trim_value(Y, X, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=42)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    n_estimators = [1, 5, 10, 50, 100]
    samples_per_leaf = range(100, 1600, 100)

    start_all = time.time()
    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:

            print()
            print("Analysis: RF with Poisson Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)

            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=False)

            ensemble.fit(xtrain, ytrain)

            leaves = ensemble.apply(xtrain)

            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

            pack = fitting_four_models_leaf_nodes(dicori)

            pred_rf = ensemble.predict(xtest)

            pred = predicting_four_models_leaf_nodes(spl, n_esti, ytest, xtest,
                                                     pack, pred_rf).T

            stack = np.hstack((meta_m_test, pred))

            dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

            stop_it = time.time()
            print("--- Iteration elapsed {0} minutes ---".format(
                np.divide(stop_it - start_it, 60)))

    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
def test_drf_regressor_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/simple.txt", delim_whitespace=True)
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestRegressor

    #Run h2o4gpu version of RandomForest Regression
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    #Run Sklearn version of RandomForest Regression
    from sklearn.ensemble import RandomForestRegressor
    drf_sk = RandomForestRegressor(random_state=1234,
                                   oob_score=True,
                                   max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all() == True
        assert (drf.score(X, y) == drf_sk.score(X, y)).all() == True
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]
                ).all() == True
        assert (drf.apply(X) == drf_sk.apply(X)).all() == True

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_
                ).all() == True

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_

        print("oob_prediction")
        print(drf.oob_prediction_)
        print(drf_sk.oob_prediction_)
        assert (drf.oob_prediction_ == drf_sk.oob_prediction_).all() == True
Example #6
def prox_matrix(df, y, features, cluster_dimension, trees=10):
    #https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#prox

    #initialize dataframe for independent variables
    independent = pd.DataFrame()

    #Handle categoricals: this should really be added to RandomForestRegressor
    for column, data_type in df[features].dtypes.items():
        try:
            independent[column] = pd.to_numeric(df[column], downcast='integer')
        except ValueError:
            contains_nulls = df[column].isnull().values.any()
            dummies = pd.get_dummies(df[column],
                                     prefix=column,
                                     dummy_na=contains_nulls,
                                     drop_first=True)
            independent[dummies.columns] = dummies

    if len(independent.index) != len(df.index):
        raise Exception('independent variables not stored properly')

    #train model
    clf = RandomForestRegressor(n_estimators=trees, n_jobs=-1)
    clf.fit(independent, y)

    #Final leaf reached in each tree
    leaves = clf.apply(independent)
    #value in cluster dimension
    labels = df[cluster_dimension].values

    numerator_matrix = {}
    for i, value_i in enumerate(labels):
        for j, value_j in enumerate(labels):
            if i >= j:
                numerator_matrix[(value_i, value_j)] = numerator_matrix.get(
                    (value_i, value_j),
                    0) + np.count_nonzero(leaves[i] == leaves[j])
                numerator_matrix[(value_j,
                                  value_i)] = numerator_matrix[(value_i,
                                                                value_j)]

    #normalize by the total number of possible matching leaves
    prox_matrix = {
        key: 1.0 - float(x) / (trees * np.count_nonzero(labels == key[0]) *
                               np.count_nonzero(labels == key[1]))
        for key, x in numerator_matrix.items()
    }

    #make sorted dataframe
    levels = np.unique(labels)
    D = pd.DataFrame(data=[[prox_matrix[(i, j)] for i in levels]
                           for j in levels],
                     index=levels,
                     columns=levels)

    return D
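A hypothetical usage sketch for prox_matrix on a tiny synthetic frame (column and group names are illustrative); the result is a symmetric matrix of 1 - proximity, i.e. a distance, indexed by the levels of the cluster dimension:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "x1": rng.randn(120),
    "x2": rng.choice(["a", "b", "c"], size=120),  # non-numeric, gets dummied
    "group": rng.choice(["g1", "g2", "g3"], size=120),
})
y = rng.randn(120)

D = prox_matrix(df, y, features=["x1", "x2"],
                cluster_dimension="group", trees=10)
print(D.round(3))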
def GetBinFeatures(stage, train_imgs, train_shapes, train_bboxes, mean_rshape, targets):
    bin_features=[]
    forests=[]
    random_poses=[]
    for ilandmark in range(param_landmark_num):
        t1=time.time()
        #### get locations
        feature_pair_pos=np.zeros((param_local_feature_num[stage]*2, 2))
        for i in range(param_local_feature_num[stage]):
            while True:
                pair=np.random.rand(4)*2-1
                x1,y1,x2,y2=pair
                if x1*x1+y1*y1<1 and x2*x2+y2*y2<1 and (x1, y1)!=(x2, y2):
                    break
            feature_pair_pos[2*i:2*i+2]=(pair*param_local_radius[stage]).reshape((2,2))
        random_poses.append(feature_pair_pos)
        #### get pixel difference
        features=np.zeros((len(train_shapes), param_local_feature_num[stage]))
        for i in range(len(train_shapes)):
            #origin_img=cv2.imread(train_imgs[i], 0).astype(np.float)
            origin_img=train_imgs[i]
            # transform from mean space to current training space
            sim_trans=transform.estimate_transform('similarity', CenterShape(mean_rshape), CenterShape(Shape2Relative(train_shapes[i], train_bboxes[i])))
            #trans_feature_pair_pos=Shape2Absolute(sim_trans(feature_pair_pos), train_bboxes[i])+train_shapes[i][ilandmark]
            trans_feature_pair_pos=GetLocalFeatureAbsolutePos(sim_trans(feature_pair_pos), train_bboxes[i], train_shapes[i][ilandmark]).astype(int)
            #trans_feature_pair_pos=trans_feature_pair_pos.astype(np.int)
            for j in range(param_local_feature_num[stage]):
                x1,y1=trans_feature_pair_pos[2*j]
                x2,y2=trans_feature_pair_pos[2*j+1]
                # in case out of boundary
                x1=max(0, min(origin_img.shape[1]-1, x1))
                x2=max(0, min(origin_img.shape[1]-1, x2))
                y1=max(0, min(origin_img.shape[0]-1, y1))
                y2=max(0, min(origin_img.shape[0]-1, y2))
                features[i,j]=origin_img[y1,x1] - origin_img[y2,x2]
            #del origin_img
            #gc.collect()
        #### train random forest
        forest=RandomForestRegressor(max_depth=param_tree_depth, n_estimators=param_tree_num, n_jobs=8)
        forest.fit(features, targets[:, ilandmark])
        forests.append(forest)
        #### extract binary features for every training sample
        leaves, leaves_num=GetLeaves(forest)
        reach_nodes=forest.apply(features)
        landmark_bin_features=np.zeros((len(train_shapes), leaves_num))
        for i in range(len(train_shapes)):
            begin_leaf_ind=0
            for j in range(len(leaves)):
                node=reach_nodes[i, j]
                landmark_bin_features[i][begin_leaf_ind+leaves[j][node]]=1
                begin_leaf_ind+=len(leaves[j])
        bin_features.append(landmark_bin_features)
        print('landmark:', ilandmark+1, 'use:', time.time()-t1, 's')
    return np.hstack(bin_features), forests, random_poses
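GetLeaves is called above but not defined in this listing. A hypothetical reconstruction consistent with its usage (leaves[j][node] maps a tree's leaf node id to a local index, leaves_num is the total leaf count) could be:

import numpy as np

def GetLeaves(forest):
    # Hypothetical helper: for each tree, map leaf node ids to 0-based local
    # indices so that apply() output can be one-hot encoded as binary features.
    leaves = []
    leaves_num = 0
    for est in forest.estimators_:
        tree = est.tree_
        leaf_ids = np.where(tree.children_left == -1)[0]  # leaves have no children
        leaves.append({node_id: i for i, node_id in enumerate(leaf_ids)})
        leaves_num += len(leaf_ids)
    return leaves, leaves_num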
Example #8
def predict_models(meta_m, m, meta_p, p):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_nl_four_models.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    n_esti = 5
    spl = 800

    # Ynz, Xnz = trim_value(Y, X, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    ensemble = RandomForestRegressor(n_estimators=n_esti,
                                     min_samples_leaf=spl,
                                     bootstrap=False)

    ensemble.fit(xtrain, ytrain)

    leaves = ensemble.apply(xtrain)

    dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

    pack = fitting_four_models_leaf_nodes(dicori)

    print("Weirdos?")
    print(np.isnan(p).any(), np.isinf(p).any(), np.isneginf(p).any())

    # pred = predicting_four_models_leaf_nodes(spl, n_esti, ytest, xtest, pack, pred_rf).T

    print("Prediction with the four models")
    pred = predicting_four_models_leaf_nodes_nl(p, pack)

    print("Predicting with random forest")
    pred_rf = ensemble.predict(p).reshape(-1, 1)

    print(meta_p.shape, pred.T.shape, pred_rf.shape)

    stack = np.hstack((meta_p, pred.T, pred_rf))

    print(stack.shape, meta_p.shape, pred.shape)

    # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

    with open(path_out, "w", newline="") as w:
        writer = csv.writer(w, delimiter=";")
        for item in stack:
            writer.writerow(item)

    placed_list = place(stack)
    write_tif(placed_list, spl, n_esti)
def predict_models(meta_m, m, meta_p, p):
    print("Type m: ", m.dtype)
    path_out = r"D:/UTwente/PycharmProjects/04_Risk_Model/data/skewed_leaves/prediction_nl_four_models_v3_20T_200S.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    n_esti = 20
    spl = 200

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(X, Y, meta_m, train_size=0.60, random_state=0)

    ensemble = RandomForestRegressor(n_estimators=n_esti, min_samples_leaf=spl, bootstrap=False)

    ensemble.fit(xtrain, ytrain)

    leaves = ensemble.apply(xtrain)

    dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

    pack = fitting_four_models_leaf_nodes(dicori)

    print("Predicting with random forest")
    pred_rf = ensemble.predict(p).reshape(-1, 1)

    print("Prediction with the four models")

    pred_sk = predicting_four_models_leaf_nodes_NL(ensemble, meta_p, p, pack).T

    print(meta_p.shape, pred_sk.shape, pred_rf.shape)

    stack = np.hstack((meta_p, pred_sk, pred_rf))

    print("Stacked predictions: ", stack.shape, meta_p.shape)

    # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

    # print(meta_m_test.shape, ytest.shape)
    # stack = np.hstack((meta_m, Y.reshape(-1, 1)))

    with open(path_out, "w", newline="") as w:
        writer = csv.writer(w, delimiter=";")
        for item in stack:
            writer.writerow(item)
Example #11
    def _get_fitted_model(self, X, y):
        model = RandomForestRegressor(
            criterion=self.criterion,
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
            warm_start=self.warm_start,
            ccp_alpha=self.ccp_alpha,
            max_samples=self.max_samples)

        self.model_ = model.fit(X, y)
        self.train_leaf_indices_ = model.apply(X)
Example #12
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)

    Y = m[:,0]
    X = m[:,1:]

    # Ynz, Xnz = trim_value(Y, X, 0)

    xtrain, xtest, ytrain, ytest = train_test_split(X, Y, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    for samples_per_leaf in range(500, 600, 100):

        print("Samples per leaf node: ", samples_per_leaf)

        ensemble = RandomForestRegressor(n_estimators=1, min_samples_leaf=samples_per_leaf, bootstrap=False)

        ensemble.fit(xtrain, ytrain)

        leaves = ensemble.apply(xtrain)

        print(leaves)

        print(leaves.shape)

        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

        pack = fitting_four_models_leaf_nodes(dicori)

        pred_ = predicting_four_models_leaf_nodes(xtest, pack)

        # dicens = ensemble_predictions_leaf_nodes(dicori)

        break
    def predict_one(self, X0):
        '''
        X0 must be an array of shape 1 x n_features
        '''
        assert X0.shape == (
            1, self.n_features), "The shape of X0 should be 1 x n_features"
        predict_one_leaf_indices = RandomForestRegressor.apply(
            self, X0)  ## 1 x n_estimators
        leaf_equal_bool = np.equal(
            self.leaf_indices,
            predict_one_leaf_indices)  ## n_sample x n_estimator
        leaf_count = np.sum(leaf_equal_bool,
                            axis=0).reshape(1, -1)  ## 1 x n_estimators
        alpha_weights = 1 / self.B * np.sum(
            leaf_equal_bool.astype(float) / leaf_count,
            axis=1)  ## n_sample x 1
        # print(leaf_equal_bool.shape, leaf_equal_bool)
        # print(leaf_count.shape, leaf_count)
        # print(alpha_weights, np.sum(alpha_weights))
        # print(alpha_weights.shape, np.sum(alpha_weights))
        assert abs(np.sum(alpha_weights) -
                   1) < 0.01, "alpha weights calculation is wrong"

        ## A diagonal matrix
        A = np.diag(alpha_weights)  ## n_sample x n_sample
        J_1d = np.ones(self.n_features + 1)
        J_1d[0] = 0  # do not penalize the intercept
        J = np.diag(J_1d)  ## (n_features + 1) x (n_features + 1)
        delta_m = np.ones((self.n_samples,
                           self.n_features + 1))  ## n_sample x n_features + 1
        delta_m[:, 1:] = self.train_x - X0
        local_mu_theta = np.linalg.inv(delta_m.T @ A @ delta_m + self.lam *
                                       J) @ delta_m.T @ A @ self.train_y
        mu = local_mu_theta[0]
        theta = local_mu_theta[1:]
        return mu, theta
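For reference, the last lines of predict_one evaluate the locally weighted ridge solution. Writing \Delta for delta_m (a leading column of ones, then the rows x_i - X0), A = diag(alpha_weights), and J for the identity with a zero in the intercept slot:

\[
(\hat{\mu}, \hat{\theta})
= \arg\min_{\mu,\,\theta} \sum_i \alpha_i(x_0)\,\bigl(y_i - \mu - (x_i - x_0)^\top \theta\bigr)^2
+ \lambda \lVert \theta \rVert_2^2,
\qquad
\begin{pmatrix}\hat{\mu}\\ \hat{\theta}\end{pmatrix}
= \bigl(\Delta^\top A\,\Delta + \lambda J\bigr)^{-1} \Delta^\top A\, y .
\]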
X1, X2 = np.meshgrid(x1, x2)
R1 = X1 - X2
R2 = X1 + X2
Z = 20 * np.maximum.reduce([np.exp(-2 * R1 ** 2),
                       np.exp(-1 * R2 ** 2),
                       2 * np.exp(-0.5 * (X1 ** 2 + X2 ** 2))])

fig, axes = plt.subplots(ncols=4, figsize=(18, 6))

for ax in axes.flat:
    ax.set_aspect('equal', 'box')
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)

rf_kernel = 1 - pairwise_distances(
    forest.apply([[-1.5, 1.5]]), forest.apply(X), metric='hamming')
rf_kernel = rf_kernel.ravel() / rf_kernel.ravel().sum()

axes[0].imshow(
    Z, extent=[-3, 3, -3, 3], origin='lower', cmap='YlGnBu_r', alpha=0.5)
axes[0].contour(X1, X2, Z, levels=n_contours,
            linewidths=0.5, colors='k', linestyles='--')
axes[0].scatter(X[:, 0], X[:, 1],
            edgecolor='k',
            color='white',
            sizes=50 * np.sqrt(rf_kernel))
axes[0].scatter(-1.5, 1.5, color='tomato', edgecolor='black', marker='P', s=50)
axes[0].set_title("Random Forest", fontsize=fontsize)

rf_kernel = 1 - pairwise_distances(
    forest.apply([[0.5, -0.5]]), forest.apply(X), metric='hamming')
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)

    Y = m[:, 0]
    X = m[:, 1:]

    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    # n_estimators = [1, 5, 10, 50, 100]
    # samples_per_leaf = range(100, 1600, 100)

    n_estimators = [10]
    samples_per_leaf = [200, 400, 600, 1000, 1200]

    start_all = time.time()

    fig, ax = plt.subplots(nrows=5, ncols=5)

    nrow = 0

    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:

            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)

            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)

            ensemble.fit(xtrain, ytrain)

            leaves = ensemble.apply(xtrain)

            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

            pack = fitting_four_models_leaf_nodes(dicori)

            pred_rf = ensemble.predict(xtest)

            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T

            print("This is pred sk: ", pred_sk.shape)

            # stack = np.hstack((meta_m_test, pred))

            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

            write_proportion_of_zeros(spl, n_esti)

            # plt.subplot(2, 2, 1)
            # plt.hist(pred_sk[:, 0], bins=20)
            # plt.subplot(2, 2, 2)
            # plt.hist(pred_sk[:, 1], bins=20)
            # plt.subplot(2, 2, 3)
            # plt.hist(pred_sk[:, 2], bins=20)
            # plt.subplot(2, 2, 4)
            # plt.hist(pred_sk[:, 3], bins=20)
            # plt.show()

            plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)

            nrow += 1

            stop_it = time.time()
            print("--- Iteration elapsed {0} minutes ---".format(
                np.divide(stop_it - start_it, 60)))

    plt.show()
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
Example #16
estimators = regr.estimators_  # description of each tree

importance = regr.feature_importances_  # an array of the fractional importance of the each feature

num_features = regr.n_features_  # the number of features

num_outputs = regr.n_outputs_  # the number of outputs when the model is built

#oob_score = regr.oob_score_ # score of the training dataset obtained using an out-of-bag estimate;
# basically the coefficient of determination R**2 computed on 'unseen' data not used to build each tree

#oob_predict = regr.oob_prediction_ # The prediction for the values of training dataset using the oob method

# now having a look at the methods
leaf_indices = regr.apply(
    x_test
)  # get the ids of all the leaves the test dataset ends up in

decision_path = regr.decision_path(x_test)

parameters = regr.get_params()  # the parameters of the model

predicted_age_array = regr.predict(
    x_test
)  # running the test dataset through the model, giving an array of predicted values

r_2_train = regr.score(
    x_train, y_train)  # calculating the R squared of the train dataset
r_2_test = regr.score(x_test,
                      y_test)  # calculating the R squared of the test dataset
class PDFRandomForestRegressor(BaseEstimator, RegressorMixin):
    """A normal random forest, except that it stores the final leaf positions and delay times for each row of the training set. It will also have a specialized scoring method."""
    def __init__(self,delaymin,delaymax,**kwargs):
        self.rforest = RandomForestRegressor(**kwargs)
        self.delay_min = delaymin
        self.delay_max = delaymax
        self.delay_bin_indices = np.arange(self.delay_max-self.delay_min+1)
        self.delay_bin_values = np.arange(self.delay_min,self.delay_max+1)
        #For each random forest, a dictionary mapping node id numbers to numpy arrays is also stored. These numpy arrays contain a histogram of the number of training models which fell into that node and their delay times.
        self.node_delay_pdfs = [{} for _ in range(self.rforest.n_estimators)]

    def fit(self, X,y,compute_pdf = False):
        y_fit = self.restrict_range(y)
        self.rforest.fit(X,y_fit)

        if compute_pdf:
            #Get the node ids for the training set:
            self.set_node_pdfs(X,y_fit)
        
        return self

    def set_node_pdfs(self,X,y):
        y_fit = self.restrict_range(y)
        #Map the y values onto indices for the arrays:
        y_indices = self.map_y_vals(y_fit)
        nodes = self.apply(X)

        #For each tree, make a 2D array containing the full range of integer target values along one axis (first axis), and the unique nodes along the other. Now, when the regression predicts a set of nodes for a given set of inputs, the full delay time distribution can be extracted by taking a slice along the unique node axis
        for i in range(nodes.shape[1]):
            unique_nodes,idxes = np.unique(nodes[:,i],return_inverse=True)
            unique_node_indices = np.arange(len(unique_nodes)+1)
            node_dict = {unique_nodes[i]:unique_node_indices[i] for i in range(len(unique_node_indices)-1)}
            node_indices = unique_node_indices[idxes]
            pdf_arr,xedges,yedges = np.histogram2d(y_fit,node_indices,bins=[self.delay_bin_values,unique_node_indices])
            #print 'testing',np.sum(pdf_arr)
            self.node_delay_pdfs[i] = {'node_dict':node_dict,'pdf_arr':pdf_arr}

    def restrict_range(self,y):
        y_restrict = y.copy()
        y_restrict[y < self.delay_min] = self.delay_min
        y_restrict[y > self.delay_max-1] = self.delay_max-1
        return y_restrict
    
    def map_y_vals(self,y):
        y_map = self.restrict_range(y)
        y_indices = y_map-self.delay_min
        return y_indices

    def predict(self,X):
        return self.rforest.predict(X)

    #Instead of just the normal prediction, which I believe just gives the average value of everything in the leaf node, predict a set of quantiles:
    def predict_percentiles(self,X,percentiles):
        p_nodes = self.apply(X)
        pdf_arr = self.get_node_pdfs(p_nodes)
        #print np.sum(pdf_arr,axis=1)
        cdf_arr = np.cumsum(pdf_arr,axis=1)
        cdf_arr_frac = (cdf_arr.T/cdf_arr[:,-1].astype(float)).T
        #print pdf_arr[0,:]
        #print cdf_arr_frac[0,:]
        #sys.exit(1)
        #print "test",cdf_arr_frac.shape,len(percentiles)
        percentile_yvals = np.zeros((cdf_arr_frac.shape[0],len(percentiles)),dtype=int)
        for i,ptile in enumerate(percentiles):
            temp_cdf_arr_frac = cdf_arr_frac.copy()#These steps ensure that the y value is taken as the first index where the cdf goes above the percentile
            temp_cdf_arr_frac[temp_cdf_arr_frac < ptile/100.] = 1000
            indices = np.argmin(temp_cdf_arr_frac-ptile/100.,axis=1)
            #indices = np.argmin(np.abs(cdf_arr_frac-ptile/100.),axis=1)
            #print indices[0]
            percentile_yvals[:,i] = self.delay_bin_values[indices]
            #print i,self.delay_bin_values[indices]

        #print pdf_arr[0,:]
        #print cdf_arr_frac[0,:]
        #print percentile_yvals[0,:],percentile_yvals.shape
        #sys.exit(1)
        return percentile_yvals

    def compute_percentiles(self,X,y):
        y_fit = self.restrict_range(y)
        y_indices = self.map_y_vals(y_fit).astype(int)
        p_nodes = self.apply(X)
        pdf_arr = self.get_node_pdfs(p_nodes)
        cdf_arr = np.cumsum(pdf_arr,axis=1)
        cdf_arr_frac = (cdf_arr.T/cdf_arr[:,-1].astype(float)).T
        #print cdf_arr_frac[0,:]
        #print self.delay_bin_values
        #print cdf_arr_frac.shape,y_fit.shape,y_fit[0]
        #print 'test',cdf_arr_frac.shape,y_fit.shape,y_indices.min(),y_indices.max(),self.delay_bin_values.shape
        #Now just need to compute the percentiles for all the y_indices
        cdf_at_y = cdf_arr_frac[np.arange(len(y_indices)),y_indices]
        #print "debug",cdf_at_y[0]
        return cdf_at_y
        # print cdf_at_y[:10],cdf_at_y.shape

    def get_node_pdfs(self,nodes):
        pdf_arr = np.zeros((nodes.shape[0],len(self.delay_bin_values)-1),dtype=int)
        #print nodes.shape,pdf_arr.shape
        for i,node_info in enumerate(self.node_delay_pdfs):
            #print i,node_info['node_dict']
            node_ids = [node_info['node_dict'][node] for node in nodes[:,i]]
            #print node_ids
            #print node_info['pdf_arr'].shape
            #print 'debug',node_info['pdf_arr'][:,node_ids[0]]
            temp_arr = np.array([node_info['pdf_arr'][:,node_id] for node_id in node_ids],dtype=pdf_arr.dtype)
            pdf_arr += temp_arr
            #print 'temp',temp_arr[0,:]
            #print ""
        return pdf_arr

    def apply(self,X):
        return self.rforest.apply(X)

    def score(self,X,y):
        return self.rforest.score(X,y)

    #Compute how good each predicted value is based on how far away it is from the median value in percentiles:
    def score_percentiles(self,X,y):
        #First, compute the medians:
        y_med = self.predict_percentiles(X,[50]).ravel()
        #print y_med.shape
        percentiles = self.compute_percentiles(X,y)
        med_percentiles = self.compute_percentiles(X,y_med)#Have to do this step to take into account the discrete nature of the y values.
        #print med_percentiles[:10],percentiles[:10]
        return 1.-np.sum((med_percentiles-percentiles)**2)/float(len(y))
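A hypothetical smoke test for PDFRandomForestRegressor on synthetic integer "delay" targets (all names and sizes illustrative). Percentile queries only work for inputs whose leaves were seen when the node histograms were built, so we query training rows:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(300, 3)
y = np.clip((10 + 3 * X[:, 0] + rng.randn(300)).astype(int), 0, 19)

model = PDFRandomForestRegressor(0, 20, n_estimators=10,
                                 min_samples_leaf=20, random_state=0)
model.fit(X, y, compute_pdf=True)
print(model.predict_percentiles(X[:5], [10, 50, 90]))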
def predict_models(meta_m, m, meta_p, p):
    path_out_tmp = r"D:/PycharmProjects/IGM_PhD_Materials/data/P04/out/pred_csv/prediction_nl_four_models_{0}T_{1}S.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    n_estimators = [10, 20, 50]
    samples_per_leaf = range(100, 900, 100)

    # The complete experiment for this paper corresponds to:
    #           n_estimators = [10, 20, 50]
    #           samples_per_leaf = range(100, 900, 100)
    #
    # NOTE THAT:
    #   + The complete execution takes ~40h on an Intel i7-8700 CPU @ 3.20GHz, 6 core(s), 12 logical processor(s), w/ 16GB of RAM
    #   + The paper only shows results for SPL = [100, 200, 400, 600, 800] due to space constraints

    start_all = time.time()
    for n_esti in n_estimators:
        start_it = time.time()
        for spl in samples_per_leaf:

            path_out = path_out_tmp.format(n_esti, spl)

            print("\nTraining ensemble: ({0} T, {1} SPL)".format(n_esti, spl))

            xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
                X, Y, meta_m, train_size=0.60, random_state=0)

            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=False)

            ensemble.fit(xtrain, ytrain)

            leaves = ensemble.apply(xtrain)

            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

            pack = fitting_four_models_leaf_nodes(dicori)

            print("\tPredicting with random forest")
            pred_rf = ensemble.predict(p).reshape(-1, 1)

            print("\tPrediction with the four models")

            pred_sk = predicting_four_models_leaf_nodes_NL(
                ensemble, meta_p, p, pack, n_esti).T

            stack = np.hstack((meta_p, pred_sk, pred_rf))

            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

            print("\tWriting results in CSV file")
            with open(path_out, "w", newline="") as w:
                writer = csv.writer(w, delimiter=";")
                for item in stack:
                    writer.writerow(item)

        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))
        print()

    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
# In[42]:

print(prediction)
print(bias + np.sum(contributions, axis=1))


# In[43]:

# the basic feature importance provided by sklearn
fit1.feature_importances_


# In[44]:

# treeinterpreter uses the apply function to retrieve the leaf indices,
# from which the tree path is reconstructed

rf.apply


# In[47]:

rf.apply(instances)
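The cells above assume a fitted forest rf and names produced by the treeinterpreter package; the usual setup (hedged, since the surrounding notebook cells are not shown) is:

from treeinterpreter import treeinterpreter as ti

# instances: rows of the feature matrix to explain; rf: a fitted
# RandomForestRegressor. ti.predict returns the prediction, the training-set
# mean (bias), and per-feature contributions such that
# prediction == bias + contributions.sum(axis=1).
prediction, bias, contributions = ti.predict(rf, instances)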


Example #20
def get_data_clustering(data_z,
                        data_p,
                        n_instruments,
                        n_critics=50,
                        cluster_type="kmeans",
                        num_trees=5,
                        min_cluster_size=50,
                        critic_type="Gaussian"):
    """Return the centers, precisions, and normalizers of a data cover.
    """
    if cluster_type == "forest":
        from sklearn.ensemble import RandomForestRegressor
        dtree = RandomForestRegressor(n_estimators=num_trees,
                                      max_leaf_nodes=n_critics,
                                      min_samples_leaf=min_cluster_size)
        dtree.fit(data_z, data_p)
        cluster_labels = dtree.apply(data_z)
        cluster_ids = [
            np.unique(cluster_labels[:, c])
            for c in range(cluster_labels.shape[1])
        ]
    elif cluster_type == "kmeans":
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=n_critics).fit(data_z)
        cluster_labels = kmeans.labels_.reshape(-1, 1)
        cluster_ids = [np.unique(cluster_labels)]
    elif cluster_type == "random_points":
        center_ids = np.random.choice(np.arange(data_z.shape[0]),
                                      size=n_critics,
                                      replace=False)
        cluster_labels = np.zeros((data_z.shape[0], n_critics))
        cluster_ids = np.ones((n_critics, 1))
        for it, center in enumerate(center_ids):
            distances = np.linalg.norm(data_z - data_z[center], axis=1)
            cluster_members = np.argsort(distances)[:min_cluster_size]
            cluster_labels[cluster_members, it] = 1
    else:
        raise Exception("Unknown option {}".format(cluster_type))

    if critic_type == "Gaussian":
        # We put a symmetric gaussian encompassing
        # all the data points of each cluster of each clustering
        center_grid = []
        precision_grid = []
        normalizers = []

        data_z = np.array(data_z)

        for tree in range(cluster_labels.shape[1]):
            for leaf in cluster_ids[tree]:
                center = np.mean(
                    data_z[cluster_labels[:, tree].flatten() == leaf, :],
                    axis=0)
                distance = np.linalg.norm(data_z - center,
                                          axis=1) / data_z.shape[1]
                precision = 1. / (np.sqrt(2) *
                                  (np.sort(distance)[min_cluster_size]))
                normalizer = (precision**n_instruments) * np.sum(
                    np.exp(-(precision * distance)**2)) / (np.power(
                        2. * np.pi, n_instruments / 2.))
                normalizers.append(normalizer)
                center_grid.append(center)
                precision_grid.append(precision)
        # The proposed normalizing constant results in too small function values
        # which result in too small losses and respective lack of scaling
        # when using the exp function for the weights update.
        # The code is kept for future fixes but overwritten
        # with the following command.
        # TODO: Explore normalizers and normalization of f(x)
        normalizers = np.ones(len(center_grid), dtype="float32")
        normalizers = np.array(normalizers, dtype="float32")
        center_grid = np.array(center_grid, dtype="float32")
        precision_grid = np.array(precision_grid, dtype="float32")

        normalizers = tf.constant(normalizers, name="normalizers")
        center_grid = tf.constant(center_grid, name="centers")
        precision_grid = tf.constant(precision_grid, name="precisions")
    else:
        raise NotImplementedError("Uniform functions not supported.")

    return normalizers, precision_grid, center_grid
Example #22
class QuantileRandomForestRegressor:
    """A quantile random forest regressor based on the scikit-learn RandomForestRegressor
    
    A wrapper around the RandomForestRegressor which summarizes based on quantiles rather than
    the mean. Note that quantile predictions take much longer than mean predictions.

    Parameters
    ----------
    nthreads : int, default=1
        number of threads to use
    rf_kwargs : dict
        keyword arguments passed to the RandomForestRegressor
    
    See Also
    --------
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html?highlight=randomforestregressor#sklearn.ensemble.RandomForestRegressor.apply
    """
    def __init__(self, nthreads=1, **rf_kwargs):
        rf_kwargs['n_jobs'] = nthreads
        self.forest = RandomForestRegressor(**rf_kwargs)
        set_num_threads(nthreads)

    def fit(self, X, y, sample_weight=None):
        """
        Build a forest of trees from the training set (X, y).
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Internally, its dtype will be converted
            to ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.
        Returns
        -------
        self : object
        """
        self.forest.fit(X, y, sample_weight)
        self.trainy = y.copy()
        self.trainX = X.copy()
        return self

    def predict(self, X, qntl):
        """
        Predict regression target for X.
        The predicted regression target of an input sample is computed as the
        quantile predicted regression targets of the trees in the forest.
        
        Note: Not possible for multioutput regression.
        
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        qntl : {array-like} of shape (n_quantiles)
            Quantile or sequence of quantiles to compute, which must be between
            0 and 1 inclusive. Passed to numpy.quantile.
        Returns
        -------
        y : ndarray of shape (n_samples, n_quantiles)
            The predicted values.
        """
        if len(self.trainy.shape)>1:
            raise RuntimeError("Quantile prediction is not possible with multioutput regression.")
        
        qntl = np.asanyarray(qntl)
        ntrees = self.forest.n_estimators
        ntrain = self.trainy.shape[0]
        train_tree_node_ID = np.zeros([ntrain, ntrees])
        npred = X.shape[0]
        pred_tree_node_ID = np.zeros([npred, ntrees])

        for i in range(ntrees):
            train_tree_node_ID[:, i] = self.forest.estimators_[i].apply(self.trainX)
            pred_tree_node_ID[:, i] = self.forest.estimators_[i].apply(X)

        ypred_pcts = find_quant(self.trainy, train_tree_node_ID,
                                pred_tree_node_ID, qntl)

        return ypred_pcts
    
    def predict_sample(self, X, n_draws):
        """
        Predict regression target for X.
        The predicted regression target of an input sample is computed as a
        random sample of the predicted regression targets of the trees in the forest.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        n_draws : {int}
            number of samples to draw from the predicted regression targets
        Returns
        -------
        y : ndarray of shape (n_samples, n_draws) or (n_samples, n_outputs, n_draws)
            The predicted values.
        """
        ntrees = self.forest.n_estimators
        ntrain = self.trainy.shape[0]
        train_tree_node_ID = np.zeros([ntrain, ntrees])
        npred = X.shape[0]
        pred_tree_node_ID = np.zeros([npred, ntrees])

        for i in range(ntrees):
            train_tree_node_ID[:, i] = self.forest.estimators_[i].apply(self.trainX)
            pred_tree_node_ID[:, i] = self.forest.estimators_[i].apply(X)

        ypred_draws = find_sample(self.trainy, train_tree_node_ID,
                                pred_tree_node_ID, n_draws)

        return ypred_draws

    def apply(self, X):
        """
        wrapper for sklearn.ensemble.RandomForestRegressor.apply

        Apply trees in the forest to X, return leaf indices.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        X_leaves : ndarray of shape (n_samples, n_estimators)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        return self.forest.apply(X)

    def decision_path(self, X):
        """
        wrapper for sklearn.ensemble.RandomForestRegressor.decision_path

        Return the decision path in the forest.
        .. versionadded:: 0.18
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator matrix where non zero elements indicates
            that the samples goes through the nodes. The matrix is of CSR
            format.
        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            gives the indicator value for the i-th estimator.
        """
        return self.forest.decision_path(X)

    def set_params(self, **params):
        """
        wrapper for sklearn.ensemble.RandomForestRegressor.set_params

        Set the parameters of this estimator.
        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.
        Parameters
        ----------
        **params : dict
            Estimator parameters.
        Returns
        -------
        self : object
            Estimator instance.
        """
        return self.forest.set_params(**params)
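# A note on the class above: predict_sample builds an empirical predictive
# distribution from leaf co-membership via the (unshown) find_sample helper.
# Below is a minimal, hedged sketch of that idea: the helper name, argument
# order, and shapes are assumptions inferred from the calls above, not the
# actual implementation.
import numpy as np

def find_sample_sketch(train_y, train_leaf_ids, pred_leaf_ids, n_draws, seed=None):
    """For each test row, repeatedly pick a random tree and draw one training
    target from the leaf that the test row falls into in that tree."""
    rng = np.random.default_rng(seed)
    n_pred, n_trees = pred_leaf_ids.shape
    draws = np.empty((n_pred, n_draws))
    for i in range(n_pred):
        for d in range(n_draws):
            t = rng.integers(n_trees)  # pick one tree at random
            in_leaf = train_leaf_ids[:, t] == pred_leaf_ids[i, t]  # co-leaf rows
            draws[i, d] = rng.choice(np.asarray(train_y)[in_leaf])
    return draws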
Example #23
    print "Instance", i
    print "Bias (trainset mean)", bias[i]
    print "Feature contributions:"
    for c, feature in sorted(zip(contributions[i], boston.feature_names),
                             key=lambda x: -abs(x[0])):
        print feature, round(c, 2)
    print "-" * 20

# In[42]:

print(prediction)
print(bias + np.sum(contributions, axis=1))

# In[43]:

# the basic feature importances provided by sklearn
fit1.feature_importances_

# In[44]:

# treeinterpreter uses the apply function to retrieve the leaf indices,
# from which the decision paths are reconstructed

rf.apply

# In[47]:

rf.apply(instances)

# In[ ]:
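# For reference, a hedged sketch tying the cells above together: the
# treeinterpreter package exposes ti.predict, which decomposes each forest
# prediction into the training-set mean (bias) plus per-feature contributions
# along the decision paths recovered from the leaf indices. rf and instances
# are the objects used in the cells above.
from treeinterpreter import treeinterpreter as ti
import numpy as np

prediction, bias, contributions = ti.predict(rf, instances)
# The decomposition is exact: bias + sum of contributions == prediction.
assert np.allclose(np.ravel(prediction),
                   np.ravel(bias + contributions.sum(axis=1)))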
Example #24
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    # Ynz, Xnz = trim_value(Y, X, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    nrow = 0
    nrow2 = 0

    fig, ax = plt.subplots(nrows=5, ncols=5)
    fig2, ax2 = plt.subplots(nrows=1, ncols=5, figsize=(20, 8))
    tit1 = "Effect of the number of samples per leaf node on the predicted distributions (n_esti=5)"
    tit2 = "Predicted distributions from above in function of the number of samples per leaf node"

    plt.suptitle(tit2, size=20)

    mean_ytest = np.mean(ytest)

    for samples_per_leaf in range(500, 1400, 200):

        print("Samples per leaf node: ", samples_per_leaf)

        ensemble = RandomForestRegressor(n_estimators=1,
                                         min_samples_leaf=samples_per_leaf,
                                         bootstrap=False)

        ensemble.fit(xtrain, ytrain)

        leaves = ensemble.apply(xtrain)

        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

        pack = fitting_four_models_leaf_nodes(dicori)

        pred = predicting_four_models_leaf_nodes(ytest, xtest, pack).T

        print(pred.shape, meta_m_test.shape)

        stack = np.hstack((meta_m_test, pred))

        print("Shape of stack: ", stack.shape)

        dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

        header = "rowid;longitude;latitude;predpoi;prednb;predzip;predzinb"
        fmts = ["%d", "%d", "%d", "%.4f", "%.4f", "%.4f", "%.4f"]
        # np.savetxt(path_out, stack, delimiter=";", fmt=fmts, header=header)

        pred_rf = ensemble.predict(xtest)
        rmse_rf = np.sqrt(mean_squared_error(ytest, pred_rf))

        print()
        print("RMSE RF: ", rmse_rf)

        # ax = plot_compare_histograms(ax, nrow, samples_per_leaf, mean_ytest, ytest, pred, pred_rf)

        print(pred.shape, pred_rf.reshape(-1, 1).shape)
        allpreds = np.hstack((pred, pred_rf.reshape(-1, 1)))
        labels = ['Poisson', 'NB', 'ZIP', "ZINB", "RF-Classic"]

        labelsize = 16
        rcParams['xtick.labelsize'] = labelsize
        rcParams['ytick.labelsize'] = labelsize
        ax2[nrow2].set_title("SPL: {0}".format(samples_per_leaf))
        ax2[nrow2].set_facecolor('#F5F5F5')
        box = ax2[nrow2].boxplot(allpreds, patch_artist=True)
        # ax2[nrow2].xaxis.set_ticks(labels)
        ax2[nrow2].set_xticklabels(labels,
                                   fontsize=16,
                                   fontdict={'fontsize': 16})
        colors = ['#A87128', '#004561', '#3C5B43', '#85243C', '#615048']
        for patch, color in zip(box['boxes'], colors):
            patch.set_facecolor(color)

        ax2[nrow2].yaxis.grid(True,
                              linestyle='-',
                              which='major',
                              color='lightgrey',
                              alpha=0.5)
        nrow += 1
        nrow2 += 1

    plt.show()
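# samples_per_leaf_node is called throughout these examples but never shown.
# A plausible minimal implementation, assuming it groups training rows by a
# (tree index, leaf id) key, which matches the (ntree, nnode) keys and the
# sam[0] target access used elsewhere in this collection:
def samples_per_leaf_node_sketch(leaves, xtrain, ytrain):
    # leaves: (n_train, n_trees) leaf-id matrix returned by ensemble.apply(xtrain)
    dic = {}
    for tree_idx in range(leaves.shape[1]):
        for row_idx, leaf_id in enumerate(leaves[:, tree_idx]):
            # store the target first, then the feature row
            dic.setdefault((tree_idx, leaf_id), []).append(
                (ytrain[row_idx], xtrain[row_idx]))
    return dic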
Example #25
        test.append(test_data[i])
        test_y.append(float(y[i]))
    else:
        data.append(test_data[i])
    i += 1
test1_y  = np.asarray(test_y, dtype=np.float32)
#test  = np.asarray(test, dtype=np.float32)
#test1_y = test1_y.transpose
#print(test_y)   
#print data

for i1 in range(0, 10):
    forest = RandomForestRegressor(n_estimators=100, max_depth=3)
#print("--- %s seconds ---" % (time.clock() - start_time))
    forest = forest.fit(test, test1_y)
    out1 = forest.apply(test)
    out = forest.score(test, test1_y)
    print(out)
    print(out1)
#print("--- %s seconds ---" % (time.clock() - start_time))
    output = forest.predict(data)
    i = 0
    error = 0
    error1 = 0
    while i < len(output):
        if abs(output[i] - y[test_len+i]) > 0.01:
            #print(i)
            #print(y[test_len+i])
            #print(output[i])
            error += abs(output[i] - y[test_len+i])
            error1 += 1
Example #26
                                                random_state=0)

print('Type xtrain: ', xtrain.dtype)

print("Raw data: ", Y.shape, X.shape)
print("Trim data: ", Ynz.shape, Xnz.shape)
print("Train data", ytrain.shape, xtrain.shape)
print("Test data: ", ytest.shape, xtest.shape)

ensemble = RandomForestRegressor(n_estimators=100,
                                 min_samples_leaf=500,
                                 bootstrap=False)

ensemble.fit(xtrain, ytrain)

leaves = ensemble.apply(xtrain)

dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

dicpoi = poisson_predictions_leaf_nodes(dicori)

dicens = ensemble_predictions_leaf_nodes(dicori)

ypred_poi = np.mean(poisson_predictions_testing(dicori, dicpoi, xtest), axis=1)

ypred_ens = ensemble_predictions_testing(ensemble, xtest)

plot_poisson_ensemble_raw(ypred_poi, ypred_ens, Yavg, ytest)

# rmse = np.sqrt(mean_squared_error(ytest, ypred_poi))
#
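# poisson_predictions_leaf_nodes is not shown in this excerpt. A hedged
# sketch of the per-leaf Poisson step, using the fact that the Poisson MLE
# for a leaf's rate is simply the mean of the training targets in that leaf
# (dicori is assumed to map (tree, leaf) keys to lists of (y, x) samples):
import numpy as np

def poisson_predictions_leaf_nodes_sketch(dicori):
    return {key: np.mean([sam[0] for sam in samples])
            for key, samples in dicori.items()}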
Example #27
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method returns the node indicator matrix: a non-zero element at
# position (i, j) indicates that sample i goes through node j.

node_indicator = estimator.decision_path(X_test)

# Similarly, we can also retrieve the leaf ids reached by each sample.

leave_id = estimator.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's do it for a single sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if X_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
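    else:
        threshold_sign = ">"

    # Continuation reconstructed from the standard scikit-learn decision-path
    # example, since the snippet is truncated above.
    print("decision node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id, sample_id, feature[node_id],
             X_test[sample_id, feature[node_id]], threshold_sign,
             threshold[node_id]))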
Example #28
    def __init__(self,
                 X_train,
                 MR_train,
                 X_val,
                 MR_val,
                 fe_type="rf",
                 fe=None,
                 n_estimators=200,
                 max_features=0.5,
                 min_samples_leaf=10,
                 regularization=0.001):

        # Features and the target model response
        self.X_train = X_train
        self.MR_train = MR_train
        self.X_val = X_val
        self.MR_val = MR_val

        # Forest Ensemble Parameters
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf

        # Local Linear Model Parameters
        self.regularization = regularization

        # Data parameters
        num_features = X_train.shape[1]
        self.num_features = num_features
        num_train = X_train.shape[0]
        self.num_train = num_train
        num_val = X_val.shape[0]

        # Fit a Forest Ensemble to the model response
        if fe is None:
            if fe_type == "rf":
                fe = RandomForestRegressor(n_estimators=n_estimators,
                                           min_samples_leaf=min_samples_leaf,
                                           max_features=max_features)
            elif fe_type == "gbrt":
                fe = GradientBoostingRegressor(
                    n_estimators=n_estimators,
                    min_samples_leaf=min_samples_leaf,
                    max_features=max_features,
                    max_depth=None)
            else:
                print("Unknown FE type ", fe)
                import sys
                sys.exit(0)
            fe.fit(X_train, MR_train)
        else:
            self.n_estimators = n_estimators = len(fe.estimators_)
        self.fe = fe

        train_leaf_ids = fe.apply(X_train)
        self.train_leaf_ids = train_leaf_ids

        val_leaf_ids_list = fe.apply(X_val)

        # Compute the feature importances: Non-normalized @ Root
        scores = np.zeros(num_features)
        if fe_type == "rf":
            for i in range(n_estimators):
                # -2 indicates a leaf; index 0 is the root
                splits = fe[i].tree_.feature
                if splits[0] != -2:
                    # impurity reduction, not normalized per tree
                    scores[splits[0]] += fe[i].tree_.impurity[0]
        elif fe_type == "gbrt":
            for i in range(n_estimators):
                # -2 indicates a leaf; index 0 is the root
                splits = fe[i, 0].tree_.feature
                if splits[0] != -2:
                    # impurity reduction, not normalized per tree
                    scores[splits[0]] += fe[i, 0].tree_.impurity[0]
        self.feature_scores = scores
        mostImpFeats = np.argsort(-scores)

        # Find the number of features to use for MAPLE
        retain_best = 0
        rmse_best = np.inf
        for retain in range(1, num_features + 1):

            # Drop less important features for local regression
            X_train_p = np.delete(X_train, mostImpFeats[retain:], axis=1)
            X_val_p = np.delete(X_val, mostImpFeats[retain:], axis=1)

            lr_predictions = np.empty([num_val], dtype=float)

            for i in range(num_val):

                weights = self.training_point_weights(val_leaf_ids_list[i])

                # Local linear model
                lr_model = Ridge(alpha=regularization)
                lr_model.fit(X_train_p, MR_train, weights)
                lr_predictions[i] = lr_model.predict(X_val_p[i].reshape(1, -1))

            rmse_curr = np.sqrt(mean_squared_error(lr_predictions, MR_val))

            if rmse_curr < rmse_best:
                rmse_best = rmse_curr
                retain_best = retain

        self.retain = retain_best
        self.X = np.delete(X_train, mostImpFeats[retain_best:], axis=1)
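# training_point_weights is referenced above but not included in this
# excerpt. A plausible reconstruction under MAPLE-style assumptions: each
# tree votes for the training points that share a leaf with the query
# instance, with votes normalized by the leaf size.
import numpy as np

def training_point_weights_sketch(train_leaf_ids, instance_leaf_ids):
    # train_leaf_ids: (n_train, n_estimators); instance_leaf_ids: (n_estimators,)
    n_train, n_estimators = train_leaf_ids.shape
    weights = np.zeros(n_train)
    for t in range(n_estimators):
        in_leaf = train_leaf_ids[:, t] == instance_leaf_ids[t]
        weights[in_leaf] += 1.0 / in_leaf.sum()  # down-weight large leaves
    return weights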
Example #29
def test_varying_samples_per_node(meta_m, m):

    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/to_keep_for_taylor/remove_testing_SPL{0}_NESTI{1}.csv"

    print("Type m: ", m.dtype)

    Y = m[:, 0]
    X = m[:, 1:]

    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    n_estimators = [5]
    samples_per_leaf = [1000]

    start_all = time.time()

    fig, ax = plt.subplots(nrows=5, ncols=5)

    nrow = 0

    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:

            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)

            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)

            ensemble.fit(xtrain, ytrain)

            leaves = ensemble.apply(xtrain)

            # NOTE: `f` is assumed to be a file handle opened in an enclosing scope
            for feature in leaves.T:
                nnodes = str(len(np.unique(feature))) + "\n"
                f.write(nnodes)

            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

            pack = fitting_four_models_leaf_nodes(dicori)

            pred_rf = ensemble.predict(xtest)

            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T

            print("Now saving")

            print(meta_m_test.shape, ytest.shape, pred_sk.shape, pred_rf.shape)

            stack = np.hstack((meta_m_test, ytest.reshape(-1, 1), pred_sk,
                               pred_rf.reshape(-1, 1)))

            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

            write_proportion_of_zeros(spl, n_esti)

            # with open(path_out.format(spl, n_esti), "w", newline="") as w:
            #     writer = csv.writer(w, delimiter=";")
            #     for item in stack:
            #         writer.writerow(item)

            plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)

            nrow += 1

            stop_it = time.time()
            print("--- Iteration elapsed {0} minutes ---".format(
                np.divide(stop_it - start_it, 60)))

    plt.show()
    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
Example #30
class _LinearForest(BaseEstimator):
    """Base class for Linear Forest meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """
    def __init__(self, base_estimator, *, n_estimators, max_depth,
                 min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
                 max_features, max_leaf_nodes, min_impurity_decrease,
                 bootstrap, oob_score, n_jobs, random_state, ccp_alpha,
                 max_samples):

        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.ccp_alpha = ccp_alpha
        self.max_samples = max_samples

    def _sigmoid(self, y):
        """Expit function (a.k.a. logistic sigmoid).

        Parameters
        ----------
        y : array-like of shape (n_samples, )
            The array to apply expit to element-wise.

        Returns
        -------
        y : array-like of shape (n_samples, )
            Expits.
        """
        return np.exp(y) / (1 + np.exp(y))

    def _inv_sigmoid(self, y):
        """Logit function.

        Parameters
        ----------
        y : array-like of shape (n_samples, )
            The array to apply logit to element-wise.

        Returns
        -------
        y : array-like of shape (n_samples, )
            Logits.
        """
        y = y.clip(1e-3, 1 - 1e-3)

        return np.log(y / (1 - y))

    def _fit(self, X, y, sample_weight=None):
        """Build a Linear Boosting from the training set (X, y).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples, ) or (n_samples, n_targets) for
            multi-target regression.
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like of shape (n_samples, ), default=None
            Sample weights.

        Returns
        -------
        self : object
        """
        if not hasattr(self.base_estimator, "fit_intercept"):
            raise ValueError(
                "Only linear models are accepted as base_estimator. "
                "Select one from linear_model class of scikit-learn.")

        if not is_regressor(self.base_estimator):
            raise ValueError(
                "Select a regressor linear model as base_estimator.")

        n_sample, self.n_features_in_ = X.shape

        if hasattr(self, "classes_"):
            class_to_int = dict(map(reversed, enumerate(self.classes_)))
            y = np.array([class_to_int[i] for i in y])
            y = self._inv_sigmoid(y)

        self.base_estimator_ = deepcopy(self.base_estimator)
        self.base_estimator_.fit(X, y, sample_weight)
        resid = y - self.base_estimator_.predict(X)

        criterion = "squared_error" if _sklearn_v1 else "mse"

        self.forest_estimator_ = RandomForestRegressor(
            n_estimators=self.n_estimators,
            criterion=criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            ccp_alpha=self.ccp_alpha,
            max_samples=self.max_samples,
        )
        self.forest_estimator_.fit(X, resid, sample_weight)

        if hasattr(self.base_estimator_, "coef_"):
            self.coef_ = self.base_estimator_.coef_

        if hasattr(self.base_estimator_, "intercept_"):
            self.intercept_ = self.base_estimator_.intercept_

        self.feature_importances_ = self.forest_estimator_.feature_importances_

        return self

    def apply(self, X):
        """Apply trees in the forest to X, return leaf indices.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        X_leaves : ndarray of shape (n_samples, n_estimators)
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        check_is_fitted(self, attributes="base_estimator_")

        return self.forest_estimator_.apply(X)

    def decision_path(self, X):
        """Return the decision path in the forest.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator matrix where non-zero elements indicate
            that the sample goes through the corresponding nodes. The matrix
            is in CSR format.

        n_nodes_ptr : ndarray of shape (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator values for the i-th estimator.
        """
        check_is_fitted(self, attributes="base_estimator_")

        return self.forest_estimator_.decision_path(X)
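# The regression predict method is not part of this excerpt, but given the
# fit above (a linear base model plus a forest trained on its residuals), a
# prediction decomposes as the linear part plus the forest correction.
# A minimal sketch, assuming a fitted _LinearForest-style object:
def linear_forest_predict_sketch(model, X):
    return model.base_estimator_.predict(X) + model.forest_estimator_.predict(X)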
Example #31
def test_RandomForest():

    X1 = np.arange(0, 10, 0.1)
    X2 = np.arange(10, 20, 0.1)

    y = np.sin(X1).ravel() + np.cos(X2).ravel()
    X_df = pd.DataFrame(np.array([X1, X2]).T, columns=['x1', 'x2'])

    rf_regr = RandomForestRegressor(n_estimators=1000,
                                    max_depth=5,
                                    bootstrap=False)
    rf_regr.fit(X_df, y)
    with StopWatch("LucidEnsemble Random Forest construction"):
        lucid_rf = make_LucidEnsemble(rf_regr,
                                      feature_names=X_df.columns,
                                      print_precision=5)

    # If this is not float32 there are precision errors: the
    # DecisionTreeRegressor estimators inside RandomForestRegressor
    # apparently require a float32 matrix, so the input is converted
    # to float32 before prediction.
    X_df = X_df.astype(np.float32)

    with StopWatch("Scikit-learn Random Forest prediction"):
        rf_pred = rf_regr.predict(X_df)
    with StopWatch("Lucid Random Forest (non-compressed) prediction"):
        lucid_rf_pred = lucid_rf.predict(X_df)

    ######################################################
    # test prediction outputted from LucidEnsemble
    np.testing.assert_almost_equal(lucid_rf_pred, rf_pred)
    assert (np.all(rf_regr.apply(X_df) == lucid_rf.apply(X_df)))

    with StopWatch("Compression of Lucid Random Forest"):
        compressed_lucid_rf = lucid_rf.compress()
    print("{} unique nodes and {} # of estimators".format(
        compressed_lucid_rf.n_leaves, len(lucid_rf)))

    with StopWatch("Lucid Random Forest (compressed) prediction"):
        crf_pred = compressed_lucid_rf.predict(X_df)
    np.testing.assert_almost_equal(crf_pred, rf_pred)

    ######################################################
    # test comparison, compare the leaves of two
    # LucidEnsembles made from the same arguments
    lucid_rf2 = make_LucidEnsemble(rf_regr,
                                   feature_names=X_df.columns,
                                   print_precision=3)
    compressed_lucid_rf2 = lucid_rf2.compress()

    assert (set(compressed_lucid_rf.leaves) == set(
        compressed_lucid_rf2.leaves))

    script_dir = os.path.dirname(__file__)
    ######################################################
    # test pickling functionality
    pickle_path = os.path.join(script_dir, 'lucid_rf.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(lucid_rf, fh)
    with open(pickle_path, 'rb') as fh:
        lucid_rf_pickle = pickle.load(fh)
        np.testing.assert_almost_equal(lucid_rf_pickle.predict(X_df),
                                       lucid_rf_pred)
    os.remove(pickle_path)

    pickle_path = os.path.join(script_dir, 'compressed_lucid_rf.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(compressed_lucid_rf, fh)
    with open(pickle_path, 'rb') as fh:
        compressed_lucid_rf_pickle = pickle.load(fh)
        np.testing.assert_almost_equal(
            compressed_lucid_rf_pickle.predict(X_df), crf_pred)
    os.remove(pickle_path)
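# StopWatch is used above as a timing context manager but is not defined in
# this excerpt; a minimal stand-in could look like this (the name and print
# format are assumptions):
import time

class StopWatch:
    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        print("{}: {:.3f} s".format(self.label, time.perf_counter() - self.start))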
def test_varying_samples_per_node(meta_m, m):

    hs = []

    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/skewed_leaves/explore_trees/testing_SPL{0}_NESTI{1}.csv"

    print("Type m: ", m.dtype)

    Y = m[:, 0]
    X = m[:, 1:]

    Ynz, Xnz, meta_m_nz = trim_value(Y, X, meta_m, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    nt = 10

    n_estimators = [nt]
    samples_per_leaf = [100]

    start_all = time.time()

    fig, ax = plt.subplots(nrows=6, ncols=5, sharex=False, sharey=False)
    plt.subplots_adjust(wspace=0.5, hspace=0.5)

    nrow = 0

    for spl in samples_per_leaf:
        start_it = time.time()
        for n_esti in n_estimators:

            print()
            print("Analysis: RF with Skewed Leaves")
            print("Samples per leaf node: ", spl)
            print("Number of estimators: ", n_esti)
            print("-" * 50)

            ensemble = RandomForestRegressor(n_estimators=n_esti,
                                             min_samples_leaf=spl,
                                             bootstrap=True)

            ensemble.fit(xtrain, ytrain)

            leaves = ensemble.apply(xtrain)

            dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

            pack = fitting_four_models_leaf_nodes(dicori)

            pred_rf = ensemble.predict(xtest)

            pred_sk = testing_four_models_leaf_nodes_v2(
                ensemble, spl, n_esti, ytest, xtest, pack, pred_rf,
                meta_m_test).T

            stack = np.hstack((meta_m_test, ytest.reshape(-1, 1), pred_sk,
                               pred_rf.reshape(-1, 1)))

            save_tree_graph(ensemble, nt)

            # dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

            # write_proportion_of_zeros(spl, n_esti)

            # with open(path_out.format(spl, n_esti), "w", newline="") as w:
            #     writer = csv.writer(w, delimiter=";")
            #     for item in stack:
            #         writer.writerow(item)

            # h = plot_compare_histograms(ax, nrow, spl, ytest, pred_sk, pred_rf)
            # hs.append(h)
            # plot_pred_vs_true(ax, nrow, spl, ytest, pred_sk, pred_rf)

        nrow += 1
        stop_it = time.time()
        print("--- Iteration elapsed {0} minutes ---".format(
            np.divide(stop_it - start_it, 60)))

    # fig.legend(hs, loc="center right", borderaxespad=0.1, title="Legend Title

    # path_fig_out = r"/home/irene/Pictures/0403_Compare_Histograms_full05.png"
    # manager = plt.get_current_fig_manager()
    # manager.window.showMaximized()
    # plt.pause(10)
    # plt.gcf().savefig(path_fig_out, format='png', dpi=300)

    end_all = time.time()
    print("--- Full program elapsed {0} hours ---".format(
        np.divide(end_all - start_all, 3600)))
def test_varying_samples_per_node(meta_m, m):
    print("Type m: ", m.dtype)
    path_out = r"/home/irene/PycharmProjects/04_Risk_Model/data/poisson_leaves/prediction_ytest.csv"

    Y = m[:, 0]
    X = m[:, 1:]

    # Ynz, Xnz = trim_value(Y, X, 0)

    xtrain, xtest, ytrain, ytest, meta_m_train, meta_m_test = train_test_split(
        X, Y, meta_m, train_size=0.60, random_state=0)

    print('Type xtrain: ', xtrain.dtype)
    print("Raw data: ", Y.shape, X.shape)
    print("Train data", ytrain.shape, xtrain.shape)
    print("Test data: ", ytest.shape, xtest.shape)

    for samples_per_leaf in range(500, 600, 100):

        print("Samples per leaf node: ", samples_per_leaf)

        ensemble = RandomForestRegressor(n_estimators=1,
                                         min_samples_leaf=samples_per_leaf,
                                         bootstrap=False)

        ensemble.fit(xtrain, ytrain)

        leaves = ensemble.apply(xtrain)

        dicori = samples_per_leaf_node(leaves, xtrain, ytrain)

        pack = fitting_four_models_leaf_nodes(dicori)

        pred = predicting_four_models_leaf_nodes(ytest, xtest, pack).T

        print(pred.shape, meta_m_test.shape)

        stack = np.hstack((meta_m_test, pred))

        print("Shape of stack: ", stack.shape)

        dicens = ensemble_predictions_leaf_nodes(ensemble, dicori)

        header = "rowid;longitude;latitude;predpoi;prednb;predzip;predzinb"
        fmts = ["%d", "%d", "%d", "%.4f", "%.4f", "%.4f", "%.4f"]
        # np.savetxt(path_out, stack, delimiter=";", fmt=fmts, header=header)

        pred_rf = ensemble.predict(xtest)
        rmse_rf = np.sqrt(mean_squared_error(ytest, pred_rf))

        print()
        print("RMSE RF: ", rmse_rf)

        # one panel per leaf-node model (Poisson, NB, ZIP, ZINB), each
        # overlaid with the observed test targets
        for k in range(4):
            plt.subplot(2, 3, k + 1)
            plt.hist(pred[:, k], bins=50)
            plt.hist(ytest, bins=50)
        # last panel: classic RF predictions against the observed targets
        plt.subplot(2, 3, 5)
        plt.hist(ytest, bins=50)
        plt.hist(pred_rf)
        plt.show()

        break