def random_forest(train_set, test_set):
    '''
    Creates, trains and tests a random forest regressor, then writes
    results to terminal.

    Params:
        train_set: A list with training data.
        test_set: A list with test data.
    '''
    clf_energy = RFR(n_jobs=2, n_estimators=10)
    clf_happiness = RFR(n_jobs=2, n_estimators=10)

    # Fit the regressors on the Spotify statistics (columns 3:16), using the
    # mood ratings determined by another research project as targets.
    clf_energy.fit([row[3:16] for row in train_set],
                   [row[1] for row in train_set])
    clf_happiness.fit([row[3:16] for row in train_set],
                      [row[2] for row in train_set])

    result_energy = clf_energy.predict([row[3:16] for row in test_set])
    result_happiness = clf_happiness.predict([row[3:16] for row in test_set])

    # Compute the mean absolute difference between the predicted and actual
    # moods.
    energy_mean = 0.0
    happiness_mean = 0.0
    for i in range(len(test_set)):
        energy_mean += abs(float(result_energy[i]) - float(test_set[i][1]))
        happiness_mean += abs(float(result_happiness[i]) - float(test_set[i][2]))
    energy_mean /= len(test_set)
    happiness_mean /= len(test_set)

    print("Avg discrepancy - Energy: " + str(energy_mean))
    print("Avg discrepancy - Happiness: " + str(happiness_mean))
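# A minimal usage sketch for random_forest above, assuming RFR is bound at
# module level (from sklearn.ensemble import RandomForestRegressor as RFR).
# The row layout is inferred from the slicing: mood targets at indices 1 and 2,
# thirteen Spotify features in columns 3:16. The data below is synthetic.
import random

def _toy_row():
    # [track_id, energy, happiness, 13 audio features]
    return ['track_id'] + [random.random() for _ in range(15)]

rows = [_toy_row() for _ in range(100)]
random_forest(rows[:80], rows[80:])  # prints the two average discrepancies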
def rfr_fillna(df_all):
    '''
    Fills the missing values of the original table, using a random forest.

    Params:
        df_all: the original table that needs filling.
    Returns:
        df_adda (the new table), model (the imputation models),
        MinMax_1st (normalisation model 1), MinMax_2nd (normalisation model 2).
    '''
    # Partition the data and choose the dependent and independent variables
    # used for prediction.
    user_id = df_all.iloc[:, 0]
    X = df_all.iloc[:, 1:-1]
    Y = df_all.iloc[:, -1]
    X1 = X.copy()
    Y2 = X1.iloc[:, 43:]
    sex = X1.iloc[:, 0]
    X2 = X1.iloc[:, 1:43]
    # Min-max normalise the feature scales.
    MinMax_1st = MinMaxScaler().fit(X2)
    X2.iloc[:, :] = MinMax_1st.transform(X2)
    X2 = pd.concat([sex, X2], axis=1)
    # Select the best model for each column to impute.
    model = {}
    krange = range(4, 30)
    for k in tqdm(list(Y2)):
        X_train = X2[Y2[k].notnull()]
        X_test = X2[Y2[k].isnull()]
        Y_train = Y2[k][Y2[k].notnull()]
        score = []
        for i in krange:
            rfr = RFR(min_samples_split=i, n_jobs=-1)
            score_each = cvs(rfr, X_train, Y_train, cv=3, n_jobs=-1).mean()
            score.append(score_each)
        best_choose = list(krange)[np.argmax(score)]
        rfr = RFR(min_samples_split=best_choose, n_jobs=-1)
        rfr = rfr.fit(X_train, Y_train)
        model[k] = rfr
        # .loc avoids pandas' chained-assignment pitfall here.
        Y2.loc[Y2[k].isnull(), k] = rfr.predict(X_test)
    # Min-max normalise the bank-statement columns a second time.
    MinMax_2nd = MinMaxScaler().fit(Y2)
    Y2.iloc[:, :] = MinMax_2nd.transform(Y2)
    df_adda = pd.concat([X2, Y2], axis=1)
    df_adda = pd.concat([user_id, df_adda, Y], axis=1)
    return df_adda, model, MinMax_1st, MinMax_2nd
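# Hypothetical usage sketch for rfr_fillna above. It builds a tiny frame with
# the layout the function assumes (user id in column 0, sex plus 42 features
# next, bank-statement columns from position 43 of X, target last), and assumes
# the snippet's module-level imports: pd, np, tqdm, MinMaxScaler, RFR, and cvs
# as an alias for sklearn.model_selection.cross_val_score. Shapes are made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 60
cols = (['user_id', 'sex'] + ['f{}'.format(i) for i in range(42)]
        + ['bank{}'.format(i) for i in range(3)] + ['target'])
df_all = pd.DataFrame(rng.random((n, len(cols))), columns=cols)
for c in ['bank0', 'bank1', 'bank2']:
    df_all.loc[rng.choice(n, 5, replace=False), c] = np.nan  # holes to fill
df_adda, models, s1, s2 = rfr_fillna(df_all)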
def na_rf_interp(self, my_data, na_variables, features='all', rf_params=None):
    if not rf_params:
        rf_params = {}
    if isinstance(my_data, dict):
        my_data = pd.DataFrame.from_dict(my_data)
    # Parse the feature specification into a per-variable dict.
    if not isinstance(features, dict):
        features_ = {}
        for t in na_variables:
            if features == 'all':
                features_[t] = [x for x in my_data if x not in na_variables]
            elif isinstance(features, str):
                features_[t] = [features]
            else:
                features_[t] = features
        features = features_
    my_rfs = {}
    for f in na_variables:
        # A fresh regressor per variable, so fits do not overwrite each other.
        rf_ = RFR(**rf_params)
        # nans
        id_na = np.isnan(my_data[f])
        if id_na.sum() in [0, my_data[f].size]:
            continue  # nothing to interpolate
        rf_.fit(
            my_data.filter(features[f])[~id_na].values,
            my_data[f][~id_na].values,
        )
        my_data.loc[id_na, f] = rf_.predict(
            my_data.filter(features[f])[id_na].values)
        my_rfs[f] = rf_
    return my_data, my_rfs
def setup_random_forest(self):
    # Number of trees in the random forest
    n_estimators = [
        int(x) for x in np.linspace(start=20, stop=1000, num=10)
    ]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    rf = RFR()
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    return RandomizedSearchCV(estimator=rf,
                              param_distributions=random_grid,
                              n_iter=30,
                              cv=3,
                              verbose=0,
                              random_state=42,
                              n_jobs=12)
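# Hedged usage sketch for setup_random_forest above. `Searcher` is a
# hypothetical stand-in for the owning class (only the method matters), the
# regression data is synthetic, and the snippet's module-level imports (np,
# RFR, RandomizedSearchCV) plus a scikit-learn version that still accepts
# max_features='auto' are assumed.
from sklearn.datasets import make_regression

class Searcher:
    setup_random_forest = setup_random_forest

X, y = make_regression(n_samples=300, n_features=8, random_state=0)
search = Searcher().setup_random_forest()
search.fit(X, y)                 # 30 sampled configurations x 3 CV folds
print(search.best_params_)
best_rf = search.best_estimator_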
def _fit(self, img, dot, tags, boxConstraints=[]):
    numFeatures = img.shape[1]
    if self._method == "RandomForest":
        from sklearn.ensemble import RandomForestRegressor as RFR
        regressor = RFR(n_estimators=self._ntrees, max_depth=self._maxdepth)
        regressor.fit(img, dot)
    elif self._method == "svrBoxed-gurobi":
        regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
        regressor.fit(
            img, dot, tags,
            self.getOldBoxConstraints(boxConstraints, numFeatures))
    elif self._method == "BoxedRegressionGurobi":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitgurobi(img, dot, tags, boxConstraints)
    elif self._method == "BoxedRegressionCplex":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitcplex(img, dot, tags, boxConstraints)
    return regressor
def get_new_model(self):
    # Model-type names are hyphen-separated (e.g. "Random-Forest-Regressor"),
    # so the suffix test splits on "-".
    if (self.model_type.split("-")[-1] == "Regressor"):
        if (self.model_type == "Linear-Regressor"):
            from sklearn.linear_model import LinearRegression
            self.model = LinearRegression(**self.model_args)
        elif (self.model_type == "Support-Vector-Regressor"):
            from sklearn.svm import SVR
            self.model = SVR(**self.model_args)
        elif (self.model_type == "Decision-Tree-Regressor"):
            from sklearn.tree import DecisionTreeRegressor as DTR
            self.model = DTR(**self.model_args)
        elif (self.model_type == "Random-Forest-Regressor"):
            from sklearn.ensemble import RandomForestRegressor as RFR
            self.model = RFR(**self.model_args)
    else:
        if (self.model_type == "Logistic-Regression-Classifier"):
            from sklearn.linear_model import LogisticRegression
            self.model = LogisticRegression(**self.model_args)
        elif (self.model_type == "KNN-Classifier"):
            from sklearn.neighbors import KNeighborsClassifier as KNN
            self.model = KNN(**self.model_args)
        elif (self.model_type == "Support-Vector-Classifier"):
            from sklearn.svm import SVC
            self.model = SVC(**self.model_args)
        elif (self.model_type == "Naive-Bayes-Classifier"):
            from sklearn.naive_bayes import GaussianNB as GNB
            self.model = GNB(**self.model_args)
        elif (self.model_type == "Decision-Tree-Classifier"):
            from sklearn.tree import DecisionTreeClassifier as DTC
            self.model = DTC(**self.model_args)
        elif (self.model_type == "Random-Forest-Classifier"):
            from sklearn.ensemble import RandomForestClassifier as RFC
            self.model = RFC(**self.model_args)
def make_prediction(response, features, tr, ts):
    '''Fit a random forest on the training index `tr` and return the test
    rows of `response` with a 'Predicted' column appended.'''
    model = RFR(n_estimators=50, n_jobs=11)
    model.fit(features.loc[tr, :], response.loc[tr, 'RESPONSE'])
    results = response.loc[ts, :].copy()
    y_pr = model.predict(features.loc[ts, :])
    results['Predicted'] = y_pr
    return results
def get_feat_imps(): X_train, X_test, y_train, y_test = data_for_gridsearch() column_names = X_train.columns model = RFR(max_features='auto', max_depth=None, bootstrap=True, min_samples_leaf=5, min_samples_split=10, n_estimators=100) model = model.fit(X_train, y_train) model_params = model.get_params() feat_imps = model.feature_importances_ print('model_params', model_params) print('feat_imps', feat_imps) rmse_train, rmse_test, errors_for_plot = eval_model( model, X_train, y_train, X_test, y_test) print('RMSE train/test: ', rmse_train, rmse_test) return model_params, feat_imps, column_names
def fit_state(self, X, y_data, y_state):
    '''Fit one regressor per state: rows with y_state == 0 train clf_free,
    rows with y_state == 1 train clf_queue.'''
    self.clf_free = RFR(n_estimators=self.n_estimators,
                        criterion=self.criterion)
    self.clf_queue = RFR(n_estimators=self.n_estimators,
                         criterion=self.criterion)
    f_indices = y_state == 0
    q_indices = y_state == 1
    X_f = X[f_indices]
    y_f = y_data[f_indices]
    self.clf_free.fit(X_f, y_f)
    X_q = X[q_indices]
    y_q = y_data[q_indices]
    self.clf_queue.fit(X_q, y_q)
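# Minimal usage sketch for fit_state above. `StateModel` is a hypothetical
# shell exposing the two attributes the method reads; the data is synthetic,
# and criterion='squared_error' assumes a recent scikit-learn (older versions
# used 'mse').
import numpy as np

class StateModel:
    n_estimators = 50
    criterion = 'squared_error'
    fit_state = fit_state

rng = np.random.default_rng(0)
X = rng.random((200, 4))
y_data = rng.random(200)
y_state = rng.integers(0, 2, 200)   # 0 = free, 1 = queue
m = StateModel()
m.fit_state(X, y_data, y_state)
print(m.clf_free.predict(X[:3]), m.clf_queue.predict(X[:3]))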
def pcpower_pred_train(df_list, power_df, time_unit):
    X_np, y_np = pred_preprocess(df_list, power_df, time_unit, train=1)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(X_np)
    nrmse_best = 1000
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            # Keep the model with the lowest NRMSE seen so far. The 15-minute
            # unit keeps its unsuffixed filename; other units get a suffix.
            if time_unit == 15:
                model_path = 'dcpower/model/pred_rfr.pkl'
            else:
                model_path = 'dcpower/model/pred_rfr-{}.pkl'.format(time_unit)
            with open(model_path, 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
def lasso_vm2pc_train(df_list, power_df):
    # NOTE: despite the name, this trains a random forest regressor.
    df_list, power_df = data_preprocess(df_list, power_df)
    df_sum = vmsum2one(df_list)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(df_sum)
    y_np = power_df.values
    nrmse_best = 100
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            # Keep the best model so far; without updating nrmse_best here,
            # any later model under the threshold would overwrite it.
            with open('dcpower/model/rfr.pkl', 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
def grid_search(
        data,
        estimator=RFR(n_estimators=40),
        param_grid={
            "max_depth": [2, 5, 10, 15],
            "min_samples_split": [20, 30, 40],
            "max_features": ['auto', 'sqrt', 'log2']
        },
        cv=5):
    """
    Build a model of the estimator's type with parameters chosen by
    cross-validated grid search. After cross validation, the best parameter
    combination is refit on the entire data set (train and test combined).
    Returns both the production-ready model and the grid search object.

    :param Data data: data object, requires (train/test)(Design/Target)
        attributes :py:class:`pandas.DataFrame`
    :param classifier/estimator estimator: base estimator to grid search
        :py:class:`sklearn.GridSearchCV`
    :param dict param_grid: parameter grid to search in grid search
    :param int cv: number of folds for cross validation
    :return: model grid data
    :rtype: tuple.(estimator, GridSearchCV, Data)
    """
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv)
    grid.fit(data.trainDesign, data.trainTarget)
    model = grid.best_estimator_.fit(
        pd.concat([data.trainDesign, data.testDesign]),
        pd.concat([data.trainTarget, data.testTarget]))
    return model, grid, data
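# Hypothetical usage sketch for grid_search above. `SimpleNamespace` stands in
# for the project's Data class (not shown in the snippet); the frames are
# synthetic, the module-level imports (pd, RFR, GridSearchCV) are assumed, and
# the default grid's max_features='auto' assumes a pre-1.3 scikit-learn.
from types import SimpleNamespace
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def _frame(n):
    return pd.DataFrame(rng.random((n, 3)), columns=['a', 'b', 'c'])

data = SimpleNamespace(trainDesign=_frame(80),
                       trainTarget=pd.Series(rng.random(80)),
                       testDesign=_frame(20),
                       testTarget=pd.Series(rng.random(20)))
model, grid, _ = grid_search(data)
print(grid.best_params_)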
def __init__(self, init_states=None, init_errors=None, params_file=None):
    if params_file is None:
        # Number of trees in the random forest
        n_estimators = [
            int(x) for x in np.linspace(start=20, stop=1000, num=10)
        ]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        rf = RFR()
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
        self.rf_random = RandomizedSearchCV(
            estimator=rf,
            param_distributions=random_grid,
            n_iter=12,
            cv=3,
            verbose=1,
            random_state=42,
            n_jobs=12)
    # Fit the random search model
    if init_states is not None:
        self.train(init_states, init_errors)
def fit(data):
    print('loading dataset {}...'.format(data))
    X = np.load('../data/desc_{}.npy'.format(data))
    y = np.load('../data/labels_{}.npy'.format(data))
    print('scaling...')
    X = scale_descriptors(X)
    print('stripping...')
    X = strip_harmonics(X, n_h=30)
    print('separating...')
    X = sep_re_im(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print('shape of training data')
    print(X_train.shape)
    print('fitting model...')
    rfr = RFR(n_estimators=100, oob_score=True)
    rfr.fit(X_train, y_train)
    preds_train = rfr.predict(X_train)
    preds_test = rfr.predict(X_test)
    print('oob score')
    print(rfr.oob_score_)
    print('train and test scores')
    print(r2_score(y_train, preds_train))
    print(r2_score(y_test, preds_test))
    dump(rfr, 'rfr_{}.joblib'.format(data))
def init_data():
    # Load the data
    dataset = pd.read_excel('basutienn.xlsx')
    # Split into training data and test data
    train_data, test_data, train_target, test_target = train_test_split(
        dataset.iloc[:, 1:5],
        dataset.iloc[:, 5],
        test_size=0.3,
        random_state=0)
    rg = RFR(n_jobs=1, random_state=0, n_estimators=5)  # random forest
    rg.fit(train_data, train_target)
    pred = rg.predict(test_data)
    # Save the trained model
    joblib.dump(rg, "rf.pkl", compress=True)
    # Prediction accuracy
    print("result: ", rg.score(test_data, test_target))
    # Save the data
    data = dataset.iloc[:, 1:5].values
    target = dataset.iloc[:, 5].values
    np.save("data", data)
    np.save("target", target)
def grid_search(X, y):
    '''
    Cross-validated grid search using a Ridge regressor and a random forest
    regressor.
    '''
    pars = {
        'alpha': [
            0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1, 0.08, 0.07, 0.06, 0.05,
            0.04, 0.03, 0.02
        ]
    }
    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)
    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))
    pars = {
        'max_depth': [5, 8, 10, 20, 50, 100],
        'min_samples_split': [2, 3, 5, 10, 20]
    }
    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    # Fit before extracting the best estimator.
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
def _fit(self, image, dot, tags, boxConstraints=[]):
    img = self.normalize(image)
    if type(boxConstraints) is dict:
        boxConstraints["boxFeatures"] = self.normalize(
            boxConstraints["boxFeatures"])
    numFeatures = img.shape[1]
    if self._method == "RandomForest":
        from sklearn.ensemble import RandomForestRegressor as RFR
        regressor = RFR(n_estimators=self._ntrees,
                        max_depth=self._maxdepth)
        regressor.fit(img, dot)
    elif self._method == "svrBoxed-gurobi":
        regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
        regressor.fit(
            img, dot, tags,
            self.getOldBoxConstraints(boxConstraints, numFeatures))
    return regressor
def __init__(self): # self._vectorizer = TfidfVectorizer(stop_words='english') self._regressor = RFR(max_features='sqrt', max_depth=100, bootstrap=False, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
def rfr_cv(n_estimators, max_features, data, targets):
    # Using https://github.com/fmfn/BayesianOptimization, which proposes
    # float-valued parameters, so n_estimators must be cast to int.
    estimator = RFR(
        n_estimators=int(n_estimators),
        max_features=max_features,
    )
    cval = cross_val_score(estimator, data, targets, scoring='r2', cv=4)
    return cval.mean()
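# Hedged sketch of wiring rfr_cv into the bayes_opt package referenced above.
# The bounds and dataset are illustrative; functools.partial binds the data so
# the optimizer only sees the two hyperparameters it tunes.
from functools import partial

from bayes_opt import BayesianOptimization
from sklearn.datasets import make_regression

data, targets = make_regression(n_samples=200, n_features=10, random_state=0)
optimizer = BayesianOptimization(
    f=partial(rfr_cv, data=data, targets=targets),
    pbounds={'n_estimators': (10, 250), 'max_features': (0.1, 0.999)},
    random_state=1,
)
optimizer.maximize(init_points=2, n_iter=10)
print(optimizer.max)  # best r2 and the parameters that produced it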
class Algorithms(Enum): RandomForestRegressor = RFR() MLPRegressor = MLPR() KNeighborsRegressor = KNR() Ridge = RR() Lasso = LR() def __str__(self): return self.name
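# Brief usage note for the Algorithms enum above, assuming its aliases (RFR,
# MLPR, KNR, RR, LR) are imported as in the snippet. Each member's value is a
# single shared, unfitted estimator instance, so sklearn.base.clone is used
# here to get independent fits; the data is illustrative.
from sklearn.base import clone
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
for algo in Algorithms:
    est = clone(algo.value)       # fresh, unfitted copy of the member's value
    est.fit(X, y)
    print(algo, est.score(X, y))  # __str__ prints the member name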
def model(X_train,
          y_train,
          X_test=np.array([]),
          y_test=np.array([]),
          method="LR"):
    # X_train: model inputs for training
    # X_test: model inputs for testing
    # y_train: outputs for X_train
    # y_test: outputs for X_test
    # method: model type; the default is linear regression
    if method == "LR":
        lr = LR()
    elif method == "Ridge":
        lr = Ridge()
    elif method == "Lasso":
        lr = Lasso()
    elif method == "MLPRegressor":
        lr = MLPRegressor()
    elif method == "SVR":
        lr = SVR()
    elif method == "KNR":
        lr = KNR()
    elif method == "RFR":
        lr = RFR()
    elif method == "GBR":
        lr = GBR()
    else:
        print("unknown method")
        return False
    lr = lr.fit(X_train, y_train[:, 0])
    y_mod_train = lr.predict(X_train)
    c_train = CCC(y_train, y_mod_train[:, np.newaxis])
    c_test = -1
    if len(y_test) > 0:
        y_mod_test = lr.predict(X_test)
        c_test = CCC(y_test, y_mod_test[:, np.newaxis])
    return (lr, c_train, c_test)
def rfrcv(n_estimators, min_samples_split, max_features): val = cross_val_score(RFR(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split), max_features=min(max_features, 0.999), random_state=42), X_train, y_train, cv=2).mean() return val
def __init__(self, n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None): self.max_samples = max_samples self.max_leaf_nodes = max_leaf_nodes self.max_features = max_features self.bootstrap = bootstrap self.min_samples_split = min_samples_split self.random_state = random_state self.min_samples_leaf = min_samples_leaf self.ccp_alpha = ccp_alpha self.min_impurity_decrease = min_impurity_decrease self.criterion = criterion self.n_jobs = n_jobs self.max_depth = max_depth self.warm_start = warm_start self.oob_score = oob_score self.verbose = verbose self.n_estimators = n_estimators self.min_weight_fraction_leaf = min_weight_fraction_leaf self.min_impurity_split = min_impurity_split self.model = RFR( ccp_alpha=self.ccp_alpha, bootstrap=self.bootstrap, min_impurity_decrease=self.min_impurity_decrease, min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_split=self.min_impurity_split, max_depth=self.max_depth, min_samples_split=self.min_samples_split, max_leaf_nodes=self.max_leaf_nodes, n_estimators=self.n_estimators, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features, oob_score=self.oob_score, max_samples=self.max_samples, verbose=self.verbose, warm_start=self.warm_start, n_jobs=self.n_jobs, criterion=self.criterion, random_state=self.random_state)
def split(model, data, t1, t2, path):
    # Route every row to t1 or t2 according to the classifier's prediction
    # on its feature part (column 0 holds the target).
    for k in data:
        for row in data[k]:
            if model.predict(row[1:].reshape(1, -1)):
                if len(t1) > 0:
                    t1 = np.r_[t1, np.array(row).reshape(1, -1)]
                else:
                    t1 = np.array(row).reshape(1, -1)
            else:
                if len(t2) > 0:
                    t2 = np.r_[t2, np.array(row).reshape(1, -1)]
                else:
                    t2 = np.array(row).reshape(1, -1)
    print(len(t1) + len(t2))
    if len(t1) > 0:
        np.random.shuffle(t1)
        y1 = t1[:, 0]    # targets
        X1 = t1[:, 1:]   # features
        one_model = RFR()
        one_model.fit(X1, y1)
        with open('./pkls/' + str(year) + path + '_1.pkl', 'wb') as f:
            pkl.dump(one_model, f)
    if len(t2) > 0:
        np.random.shuffle(t2)
        y2 = t2[:, 0]
        X2 = t2[:, 1:]
        zero_model = RFR()
        zero_model.fit(X2, y2)
        with open('./pkls/' + str(year) + path + '_0.pkl', 'wb') as f:
            pkl.dump(zero_model, f)
def __ensemble_test(kind, X_train, X_test, y_train, y_test):
    # `kind` rather than `type`, to avoid shadowing the builtin.
    if kind.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif kind.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif kind.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif kind.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError('unknown ensemble type: ' + kind)
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
def rand_for(df):
    df2 = df.filter(items=[
        'price', 'security_deposit', 'accomodates', 'bedrooms', 'bathrooms',
        'property_type', 'room_type', 'latitude', 'longitude', 'housing_type',
        'price_bin', 'amount', 'areas',
        'Complement_of_Availability_Next_90_Days', 'cleaning_fee'
    ])
    print(df2.head(), len(df2))
    df3 = pd.DataFrame()

    # Random Forest
    t = df2.fillna(0)
    t = pd.get_dummies(t)
    print(t.columns, len(t))
    y = t.pop('price').values
    X = t.values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rf = RFR(n_estimators=500)
    rf.fit(X_train, y_train)
    rf_rmse = '%.2f' % np.sqrt(mse(y_test, rf.predict(X_test)))
    print('rmse:', rf_rmse)
    rf_score = '%.3f' % rf.score(X_test, y_test)
    print("Random Forest score:", rf_score)

    # Top six features by importance.
    imp = rf.feature_importances_
    order = np.argsort(imp)[::-1]
    _cols = t.columns.tolist()
    imp_cols = order[:6]
    feats = [_cols[i] for i in imp_cols]
    print(feats)
    imp_feats = {f: '%.4f' % imp[i] for f, i in zip(feats, imp_cols)}
    print(imp_feats)
    tempdf = pd.DataFrame.from_dict(imp_feats, orient='index').T
    df3 = pd.concat([df3, tempdf], sort=True)
    print(df3)
    return (imp_cols, _cols, imp, imp_feats, rf_rmse, rf_score)
def random_forest(self): print('Random_Forest') rg = RFR(n_jobs=-1, n_estimators=100, random_state=100) rg.fit(self.X_train, self.y_train) importances = pd.DataFrame({'RF': rg.feature_importances_}, index=self.X_train.columns) importances = _norm(importances) return rg, importances
def predict(self): regr_rf = RFR(max_depth=17, random_state=9, n_estimators=50, n_jobs=-1) regr_rf.fit(self.x_train, self.y_train) train_result = regr_rf.predict(self.x_train) test_result = regr_rf.predict(self.x_test) export_filename = 'RandomForestReg' if self.drop_feature_names: export_filename += '_without_' + '_'.join(self.drop_feature_names) BaseModel.export_prediction(test_result, export_filename) return (train_result, test_result)
def fill_missing(df):
    # Column 4 holds 月收入 (monthly income); the number-of-dependents column
    # is excluded from the predictors.
    all_df = df.iloc[:, [4, 0, 1, 2, 3, 5, 6, 7, 8]]
    known = all_df[all_df.月收入.notnull()].values
    unknown = all_df[all_df.月收入.isnull()].values
    X = known[:, 1:]
    Y = known[:, 0]
    rfr = RFR(random_state=0, n_estimators=200, max_depth=3)
    rfr.fit(X, Y)
    predict = rfr.predict(unknown[:, 1:]).round(0)
    df.loc[(df.月收入.isnull()), '月收入'] = predict
    return df
def rfrcv(n_estimators, min_samples_split, max_features, max_depth): return cross_val_score(RFR(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split), max_features=min(max_features, 0.999), max_depth=int(max_depth), random_state=2016, n_jobs=6), X, y, scoring=score, n_jobs=3, cv=3).mean()