Example 1
def random_forest(train_set, test_set):
    ''' Creates, trains and tests a random forest regressor, then writes results
    to terminal.

    Params:
    train_set: A list with training data.
    test_set: A list with test data.
    '''
    clf_energy = RFR(n_jobs=2, n_estimators=10)
    clf_happiness = RFR(n_jobs=2, n_estimators=10)

    # Fit one regressor per mood dimension: the features are the Spotify
    # audio statistics (columns 3-15); the targets are the energy and
    # happiness scores determined by another research project.
    clf_energy.fit([row[3:16] for row in train_set], [row[1] for row in train_set])
    clf_happiness.fit([row[3:16] for row in train_set], [row[2] for row in train_set])

    result_energy = clf_energy.predict([row[3:16] for row in test_set])
    result_happiness = clf_happiness.predict([row[3:16] for row in test_set])

    # Compute the total absolute difference between the predicted and actual
    # moods.
    energy_mean = 0.0
    happiness_mean = 0.0
    for i in range(len(test_set)):
        energy_mean += abs(float(result_energy[i]) - float(test_set[i][1]))
        happiness_mean += abs(float(result_happiness[i]) - float(test_set[i][2]))

    energy_mean /= len(test_set)
    happiness_mean /= len(test_set)

    print("Avg discrepancy - Energy: " + str(energy_mean))
    print("Avg discrepancy - Happiness: " + str(happiness_mean))
Example 2
def rfr_fillna(df_all):
    '''
    Fills the missing values of the original table using a random forest.

    Params:
    df_all: the original table whose missing values need to be filled.

    Returns: df_adda (the new table), model (the imputation models),
    MinMax_1st (normalization model 1), MinMax_2nd (normalization model 2).
    '''

    # Split the data and pick out the dependent and independent variables to predict.
    user_id = df_all.iloc[:, 0]
    X = df_all.iloc[:, 1:-1]
    Y = df_all.iloc[:, -1]

    X1 = X.copy()
    Y2 = X1.iloc[:, 43:]
    sex = X1.iloc[:, 0]
    X2 = X1.iloc[:, 1:43]

    # Min-max normalization
    MinMax_1st = MinMaxScaler().fit(X2)
    X2.iloc[:, :] = MinMax_1st.transform(X2)

    X2 = pd.concat([sex, X2], axis=1)
    # Model selection: tune min_samples_split per column
    model = {}
    krange = range(4, 30)
    for k in tqdm(list(Y2)):
        X_train = X2[Y2[k].notnull()]
        X_test = X2[Y2[k].isnull()]
        Y_train = Y2[k][Y2[k].notnull()]
        score = []
        for i in krange:
            rfr = RFR(min_samples_split=i, n_jobs=-1)
            score_each = cvs(rfr, X_train, Y_train, cv=3, n_jobs=-1).mean()
            score.append(score_each)
        best_choose = list(krange)[np.argmax(score)]
        rfr = RFR(min_samples_split=best_choose, n_jobs=-1)
        rfr = rfr.fit(X_train, Y_train)
        model[k] = rfr
        Y2.loc[Y2[k].isnull(), k] = rfr.predict(X_test)  # .loc avoids chained assignment

    # Min-max normalize the bank-statement columns as well
    MinMax_2nd = MinMaxScaler().fit(Y2)
    Y2.iloc[:, :] = MinMax_2nd.transform(Y2)

    df_adda = pd.concat([X2, Y2], axis=1)

    df_adda = pd.concat([user_id, df_adda, Y], axis=1)
    return df_adda, model, MinMax_1st, MinMax_2nd
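rfr_fillna leans on several names from its enclosing module. The imports below are a guess consistent with how each name is called (cvs in particular is taken to be cross_val_score), not something stated in the source.

# Assumed imports for rfr_fillna.
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_val_score as cvs
from sklearn.preprocessing import MinMaxScaler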
Example 3
    def na_rf_interp(self,
                     my_data,
                     na_variables,
                     features='all',
                     rf_params=None):

        if not rf_params:
            rf = RFR()
        else:
            rf = RFR(**rf_params)  # rf_params is a dict of keyword arguments

        if isinstance(my_data, dict):
            my_data = pd.DataFrame.from_dict(my_data)

        # parse features: normalize to a dict that maps each NA variable to a
        # list of predictor columns
        if not isinstance(features, dict):
            features_ = {}
            for t in na_variables:
                if features == 'all':
                    features_[t] = [
                        x for x in my_data if x not in na_variables
                    ]
                elif isinstance(features, str):
                    features_[t] = [features]
                else:
                    features_[t] = features
            features = features_

        my_rfs = {}
        for f in na_variables:
            # fresh estimator per variable: reusing one instance would make
            # every entry of my_rfs point at the last fit
            rf_ = clone(rf)  # clone is sklearn.base.clone

            # rows where this variable is missing
            id_na = np.isnan(my_data[f])
            if id_na.sum() in [0, my_data[f].size]:
                continue  # nothing to interpolate

            rf_.fit(
                my_data.filter(features[f])[~id_na].values,
                my_data[f][~id_na].values,
            )

            my_data.loc[id_na, f] = rf_.predict(
                my_data.filter(features[f])[id_na].values)

            my_rfs[f] = rf_

        return my_data, my_rfs
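A hypothetical call on a toy frame, assuming obj is an instance of the class defining na_rf_interp; column 'b' has one gap to be filled from column 'a'.

# Toy usage sketch; all names here are illustrative, not from the source.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0, 5.0],
                    'b': [10.0, np.nan, 30.0, 40.0, 50.0]})
filled, models = obj.na_rf_interp(toy, na_variables=['b'])  # obj: assumed instance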
Example 4
 def setup_random_forest(self):
     # Number of trees in the forest
     n_estimators = [
         int(x) for x in np.linspace(start=20, stop=1000, num=10)
     ]
     # Number of features to consider at every split
     max_features = ['auto', 'sqrt']
     # Maximum number of levels in tree
     max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
     max_depth.append(None)
     # Minimum number of samples required to split a node
     min_samples_split = [2, 5, 10]
     # Minimum number of samples required at each leaf node
     min_samples_leaf = [1, 2, 4]
     # Method of selecting samples for training each tree
     bootstrap = [True, False]
     rf = RFR()
     # Create the random grid
     random_grid = {
         'n_estimators': n_estimators,
         'max_features': max_features,
         'max_depth': max_depth,
         'min_samples_split': min_samples_split,
         'min_samples_leaf': min_samples_leaf,
         'bootstrap': bootstrap
     }
     return RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=30,
                               cv=3,
                               verbose=0,
                               random_state=42,
                               n_jobs=12)
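A standalone sketch of exercising such a randomized search on synthetic data; make_regression and the reduced grid are stand-ins for illustration, not the original setup.

# Minimal sketch: fit a RandomizedSearchCV like the one returned above.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=42)
search = RandomizedSearchCV(RFR(), {'n_estimators': [50, 100, 200]},
                            n_iter=3, cv=3, random_state=42)
search.fit(X, y)
print(search.best_params_, search.best_score_)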
Example 5
    def _fit(self, img, dot, tags, boxConstraints=[]):

        numFeatures = img.shape[1]
        if self._method == "RandomForest":
            from sklearn.ensemble import RandomForestRegressor as RFR

            regressor = RFR(n_estimators=self._ntrees,
                            max_depth=self._maxdepth)
            regressor.fit(img, dot)

        elif self._method == "svrBoxed-gurobi":
            regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
            regressor.fit(
                img, dot, tags,
                self.getOldBoxConstraints(boxConstraints, numFeatures))
        elif self._method == "BoxedRegressionGurobi":
            regressor = RegressorC(C=self._C, epsilon=self._epsilon)
            regressor.fitgurobi(img, dot, tags, boxConstraints)

        elif self._method == "BoxedRegressionCplex":
            regressor = RegressorC(C=self._C, epsilon=self._epsilon)
            regressor.fitcplex(img, dot, tags, boxConstraints)

        return regressor
Example 6
 def get_new_model(self):
     if (self.model_type.split("_")[-1] == "Regressor"):
         if (self.model_type == "Linear-Regressor"):
             from sklearn.linear_model import LinearRegression
             self.model = LinearRegression(**self.model_args)
         elif (self.model_type == "Support-Vector-Regressor"):
             import sklearn.svm as SVR
             self.model = SVR(**self.model_args)
         elif (self.model_type == "Decision-Tree-Regressor"):
             from sklearn.tree import DecisionTreeRegressor as DTR
             self.model = DTR(**self.model_args)
         elif (self.model_type == "Random-Forest-Regressor"):
             from sklearn.ensemble import RandomForestRegressor as RFR
             self.model = RFR(**self.model_args)
     else:
         if (self.model_type == "Logistic-Regression-Classifier"):
             from sklearn.linear_model import LogisticRegression
             self.model = LogisticRegression(**self.model_args)
         elif (self.model_type == "KNN-Classifier"):
             from sklearn.neighbors import KNeighborsClassifier as KNN
             self.model = KNN(**self.model_args)
         elif (self.model_type == "Support-Vector-Classifier"):
             import sklearn.svm as SVC
             self.model = SVC(**self.model_args)
         elif (self.model_type == "Naive-Bayes-Classifier"):
             from sklearn.naive_bayes import GNB
             self.model = GNB(**self.model_args)
         elif (self.model_type == "Decision-Tree-Classifier"):
             from sklearn.tree import DecisionTreeClassifier as DTC
             self.model = DTC(**self.model_args)
         elif (self.model_type == "Random-Forest-Classifier"):
             from sklearn.ensemble import RandomForestClassifier as RFC
             self.model = RFC(**self.model_args)
Example 7
def make_prediction(response, features, tr, ts):
    model = RFR(n_estimators=50, n_jobs=11)
    model.fit(features.loc[tr, :], response.loc[tr, 'RESPONSE'])
    results = response.loc[ts, :].copy()
    y_pr = model.predict(features.loc[ts, :])
    results['Predicted'] = y_pr
    return results
Example 8
def get_feat_imps():

    X_train, X_test, y_train, y_test = data_for_gridsearch()
    column_names = X_train.columns

    model = RFR(max_features='auto',
                max_depth=None,
                bootstrap=True,
                min_samples_leaf=5,
                min_samples_split=10,
                n_estimators=100)

    model = model.fit(X_train, y_train)

    model_params = model.get_params()
    feat_imps = model.feature_importances_

    print('model_params', model_params)
    print('feat_imps', feat_imps)

    rmse_train, rmse_test, errors_for_plot = eval_model(
        model, X_train, y_train, X_test, y_test)
    print('RMSE train/test: ', rmse_train, rmse_test)

    return model_params, feat_imps, column_names
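Since get_feat_imps returns the raw importances array alongside the column names, a small hypothetical helper can pair and rank them.

# Hypothetical helper (not in the source): rank importances by column name.
import pandas as pd

def rank_importances(feat_imps, column_names):
    return pd.Series(feat_imps, index=column_names).sort_values(ascending=False)

# e.g. print(rank_importances(*get_feat_imps()[1:]))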
Example 9
    def fit_state(self, X, y_data, y_state):
        self.clf_free = RFR(n_estimators=self.n_estimators,
                            criterion=self.criterion)
        self.clf_queue = RFR(n_estimators=self.n_estimators,
                             criterion=self.criterion)

        f_indices = y_state == 0
        q_indices = y_state == 1

        X_f = X[f_indices]
        y_f = y_data[f_indices]
        self.clf_free.fit(X_f, y_f)

        X_q = X[q_indices]
        y_q = y_data[q_indices]
        self.clf_queue.fit(X_q, y_q)
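fit_state only trains the two per-regime models; a plausible counterpart for prediction (not shown in the source) would route each sample to the regressor matching its state.

# Hedged sketch of a matching predict_state method (assumed, not from the source).
import numpy as np

def predict_state(self, X, y_state):
    out = np.empty(len(X), dtype=float)
    f_indices = y_state == 0
    q_indices = y_state == 1
    if f_indices.any():
        out[f_indices] = self.clf_free.predict(X[f_indices])
    if q_indices.any():
        out[q_indices] = self.clf_queue.predict(X[q_indices])
    return out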
Example 10
def pcpower_pred_train(df_list, power_df, time_unit):
    X_np, y_np = pred_preprocess(df_list, power_df, time_unit, train=1)
    minmax_scaler = MinMaxScaler()
    # minmax_list: module-level list of fixed per-feature bounds (assumed)
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(X_np)
    nrmse_best = 1000
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            # pick the model file by time unit; an unknown unit raises a KeyError
            model_paths = {15: 'dcpower/model/pred_rfr.pkl',
                           10: 'dcpower/model/pred_rfr-10.pkl',
                           5: 'dcpower/model/pred_rfr-5.pkl'}
            with open(model_paths[time_unit], 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
Example 11
def lasso_vm2pc_train(df_list, power_df):
    df_list, power_df = data_preprocess(df_list, power_df)
    df_sum = vmsum2one(df_list)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(df_sum)
    y_np = power_df.values
    nrmse_best = 100
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            with open('dcpower/model/rfr.pkl', 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp  # this update was missing; without it the last qualifying split, not the best, was saved
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
Example 12
def grid_search(
        data,
        estimator=RFR(n_estimators=40),
        param_grid={
            "max_depth": [2, 5, 10, 15],
            "min_samples_split": [20, 30, 40],
            "max_features": ['auto', 'sqrt', 'log2']
        },
        cv=5):
    """
    Build a model of type estimator with paramters prescribed by cross validated grid search. After cross validation, best estimator is built
    on parameter combination and trained on entire training set. Returns both production ready model and grid search object

    :param Data data: data object, requires (train/test)(Design/Target) attributes :py:class:`pandas.DataFrame`
    :param classifier/estimator estimator: base estimator to grid search :py:class:`sklearn.GridSearchCV`
    :param dict param_grid: paramter grid to search in grid search
    :param int cv: number of folds for cross validation

    :return: model grid data
    :rtype: tuple.(estimator, GridSearchCV, Data)
    """
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv)
    grid.fit(data.trainDesign, data.trainTarget)
    model = grid.best_estimator_.fit(
        pd.concat([data.trainDesign, data.testDesign]),
        pd.concat([data.trainTarget, data.testTarget]))
    return model, grid, data
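The data argument is duck-typed: any object exposing trainDesign/trainTarget/testDesign/testTarget works. A hedged sketch with a stand-in namespace object and a reduced grid (the default grid's max_features='auto' option has been removed from recent scikit-learn releases):

# SimpleNamespace stands in for the project's Data class; names are illustrative.
import pandas as pd
from types import SimpleNamespace
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
data = SimpleNamespace(
    trainDesign=pd.DataFrame(X[:80]), trainTarget=pd.Series(y[:80]),
    testDesign=pd.DataFrame(X[80:]), testTarget=pd.Series(y[80:]))
model, grid, data = grid_search(data, param_grid={'max_depth': [2, 5]})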
Example 13
 def __init__(self, init_states=None, init_errors=None, params_file=None):
      if params_file is None:
          # Number of trees in the forest
          n_estimators = [
              int(x) for x in np.linspace(start=20, stop=1000, num=10)
          ]
         # Number of features to consider at every split
         max_features = ['auto', 'sqrt']
         # Maximum number of levels in tree
         max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
         max_depth.append(None)
         # Minimum number of samples required to split a node
         min_samples_split = [2, 5, 10]
         # Minimum number of samples required at each leaf node
         min_samples_leaf = [1, 2, 4]
         # Method of selecting samples for training each tree
          bootstrap = [True, False]
          rf = RFR()
          # Create the random grid
          random_grid = {
             'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap': bootstrap
         }
          self.rf_random = RandomizedSearchCV(
              estimator=rf,
              param_distributions=random_grid,
              n_iter=12,
              cv=3,
              verbose=1,
              random_state=42,
              n_jobs=12)
          # Fit the random search model on the initial data, if provided
          if init_states is not None:
              self.train(init_states, init_errors)
Example 14
def fit(data):
    print('loading dataset {}...'.format(data))
    X = np.load('../data/desc_{}.npy'.format(data))
    y = np.load('../data/labels_{}.npy'.format(data))

    print('scaling...')
    X = scale_descriptors(X)
    print('stripping...')
    X = strip_harmonics(X, n_h=30)
    print('separating...')
    X = sep_re_im(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print('shape of training data')
    print(X_train.shape)

    print('fitting model...')
    rfr = RFR(n_estimators=100, oob_score=True)
    rfr.fit(X_train, y_train)
    preds_train = rfr.predict(X_train)
    preds_test = rfr.predict(X_test)

    print('oob score')
    print(rfr.oob_score_)

    print('train and test scores')
    print(r2_score(y_train, preds_train))
    print(r2_score(y_test, preds_test))

    dump(rfr, 'rfr_{}.joblib'.format(data))
Example 15
def init_data():
    # Load the data
    dataset = pd.read_excel('basutienn.xlsx')

    # Split into training data and test data
    train_data, test_data, train_target, test_target = train_test_split(
        dataset.iloc[:, 1:5],
        dataset.iloc[:, 5],
        test_size=0.3,
        random_state=0)

    rg = RFR(n_jobs=1, random_state=0, n_estimators=5)  # random forest regressor

    rg.fit(train_data, train_target)
    pred = rg.predict(test_data)

    # Save the trained model
    joblib.dump(rg, "rf.pkl", compress=True)

    # Prediction accuracy
    print("result: ", rg.score(test_data, test_target))

    # Save the data
    data = dataset.iloc[:, 1:5].values
    target = dataset.iloc[:, 5].values
    np.save("data", data)
    np.save("target", target)
Example 16
def grid_search(X, y):
    '''
    Cross-validated grid search using a Ridge regressor and a random
    forest regressor.
    '''

    pars = {
        'alpha': [
            0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1, 0.08, 0.07, 0.06, 0.05, 0.04,
            0.03, 0.02
        ]
    }

    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)

    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {
        'max_depth': [5, 8, 10, 20, 50, 100],
        'min_samples_split': [2, 3, 5, 10, 20]
    }

    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars,
                      cv=5)
    gs.fit(X, y)  # this fit was missing; best_estimator_ requires a fitted search
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
Example 17
    def _fit(self, image, dot, tags, boxConstraints=[]):
        img = self.normalize(image)
        if type(boxConstraints) is dict:
            boxConstraints["boxFeatures"] = self.normalize(
                boxConstraints["boxFeatures"])
        numFeatures = img.shape[1]
        if self._method == "RandomForest":
            from sklearn.ensemble import RandomForestRegressor as RFR

            regressor = RFR(n_estimators=self._ntrees,
                            max_depth=self._maxdepth)
            regressor.fit(img, dot)

        elif self._method == "svrBoxed-gurobi":
            regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
            regressor.fit(
                img, dot, tags,
                self.getOldBoxConstraints(boxConstraints, numFeatures))
        elif self._method == "svrBoxed-gurobi":
            regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
            regressor.fit(
                img, dot, tags,
                self.getOldBoxConstraints(boxConstraints, numFeatures))

        return regressor
Example 18
 def __init__(self):
     # self._vectorizer = TfidfVectorizer(stop_words='english')
     self._regressor = RFR(max_features='sqrt',
                           max_depth=100,
                           bootstrap=False,
                           min_samples_leaf=1,
                           min_samples_split=2,
                           n_estimators=200)
Example 19
def rfr_cv(n_estimators, max_features, data, targets):
    # using https://github.com/fmfn/BayesianOptimization
    # the optimizer proposes floats, so cast and clamp to valid values
    estimator = RFR(
        n_estimators=int(n_estimators),
        max_features=min(max_features, 0.999),
    )
    cval = cross_val_score(estimator, data, targets, scoring='r2', cv=4)
    return cval.mean()
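A sketch of wiring rfr_cv into the library the comment cites; X, y and the bounds here are assumptions for illustration.

# Hedged sketch using https://github.com/fmfn/BayesianOptimization.
from functools import partial
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=partial(rfr_cv, data=X, targets=y),  # X, y assumed defined
    pbounds={'n_estimators': (10, 300), 'max_features': (0.1, 0.999)},
    random_state=1)
optimizer.maximize(init_points=3, n_iter=10)
print(optimizer.max)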
Example 20
class Algorithms(Enum):
    RandomForestRegressor = RFR()
    MLPRegressor = MLPR()
    KNeighborsRegressor = KNR()
    Ridge = RR()
    Lasso = LR()

    def __str__(self):
        return self.name
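One caveat: each enum member holds a single shared estimator instance, so calling fit mutates the enum's value. A hedged sketch that clones before fitting (X_train etc. are assumed to exist):

# Fit every algorithm in the registry on the same split.
from sklearn.base import clone

for algo in Algorithms:
    est = clone(algo.value)  # fresh copy; fitting algo.value directly would mutate the enum
    est.fit(X_train, y_train)
    print(algo, est.score(X_test, y_test))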
Example 21
def model(X_train,
          y_train,
          X_test=np.array([]),
          y_test=np.array([]),
          method="LR"):
    # X_train: model inputs for training
    # X_test: model inputs for testing
    # y_train: outputs for X_train
    # y_test: outputs for X_test
    # method: model type; the default is linear regression ("LR")

    if method == "LR":
        lr = LR()
    elif method == "Ridge":
        lr = Ridge()
    elif method == "Lasso":
        lr = Lasso()
    elif method == "MLPRegressor":
        lr = MLPRegressor()
    elif method == "SVR":
        lr = SVR()
    elif method == "KNR":
        lr = KNR()
    elif method == "RFR":
        lr = RFR()
    elif method == "GBR":
        lr = GBR()
    else:
        print("unknown method")
        return False

    lr = lr.fit(X_train, y_train[:, 0])
    y_mod_train = lr.predict(X_train)
    c_train = CCC(y_train, y_mod_train[:, np.newaxis])

    c_test = -1
    if len(y_test) > 0:
        y_mod_test = lr.predict(X_test)
        c_test = CCC(y_test, y_mod_test[:, np.newaxis])

    return (lr, c_train, c_test)
Example 22
def rfrcv(n_estimators, min_samples_split, max_features):
    val = cross_val_score(RFR(n_estimators=int(n_estimators),
                              min_samples_split=int(min_samples_split),
                              max_features=min(max_features, 0.999),
                              random_state=42),
                          X_train,
                          y_train,
                          cv=2).mean()
    return val
Example 23
 def __init__(self,
              n_estimators=100,
              criterion='friedman_mse',
              max_depth=None,
              min_samples_split=2,
              min_samples_leaf=1,
              min_weight_fraction_leaf=0.0,
              max_features='auto',
              max_leaf_nodes=None,
              min_impurity_decrease=0.0,
              min_impurity_split=None,
              bootstrap=True,
              oob_score=False,
              n_jobs=None,
              random_state=None,
              verbose=0,
              warm_start=False,
              ccp_alpha=0.0,
              max_samples=None):
      self.n_estimators = n_estimators
      self.criterion = criterion
      self.max_depth = max_depth
      self.min_samples_split = min_samples_split
      self.min_samples_leaf = min_samples_leaf
      self.min_weight_fraction_leaf = min_weight_fraction_leaf
      self.max_features = max_features
      self.max_leaf_nodes = max_leaf_nodes
      self.min_impurity_decrease = min_impurity_decrease
      self.min_impurity_split = min_impurity_split
      self.bootstrap = bootstrap
      self.oob_score = oob_score
      self.n_jobs = n_jobs
      self.random_state = random_state
      self.verbose = verbose
      self.warm_start = warm_start
      self.ccp_alpha = ccp_alpha
      self.max_samples = max_samples
      # mirror the constructor arguments into the wrapped regressor,
      # in signature order for readability
      self.model = RFR(
          n_estimators=self.n_estimators,
          criterion=self.criterion,
          max_depth=self.max_depth,
          min_samples_split=self.min_samples_split,
          min_samples_leaf=self.min_samples_leaf,
          min_weight_fraction_leaf=self.min_weight_fraction_leaf,
          max_features=self.max_features,
          max_leaf_nodes=self.max_leaf_nodes,
          min_impurity_decrease=self.min_impurity_decrease,
          min_impurity_split=self.min_impurity_split,
          bootstrap=self.bootstrap,
          oob_score=self.oob_score,
          n_jobs=self.n_jobs,
          random_state=self.random_state,
          verbose=self.verbose,
          warm_start=self.warm_start,
          ccp_alpha=self.ccp_alpha,
          max_samples=self.max_samples)
Example 24
 def split(model, data, t1, t2, path):
     for k in data:
         # print(len(data[k]))
         for row in data[k]:
             if model.predict(row[1:].reshape(1, -1)):
                 # print(row[1:].reshape(1, -1))
                 if len(t1) > 0:
                     # print(t1.shape)
                     t1 = np.r_[t1, np.array(row).reshape(1, -1)]
                 else:
                     t1 = np.array(row).reshape(1, -1)
             else:
                 if len(t2) > 0:
                     # print(t2.shape)
                     t2 = np.r_[t2, np.array(row).reshape(1, -1)]
                 else:
                     t2 = np.array(row).reshape(1, -1)
     print(len(t1) + len(t2))
     if len(t1) > 0:
         np.random.shuffle(t1)
          test1 = t1[:, 0]    # targets: first column
          train1 = t1[:, 1:]  # features: remaining columns
         params_high = {
             'n_estimators': 1000,
             'max_depth': 10,
             'min_samples_split': 2,
             'learning_rate': 0.01,
             'loss': 'huber'
         }
         # one_model = GBR(**params_high)
         one_model = RFR()
          one_model.fit(train1, test1)  # test1 is 1-D, so the original .T was a no-op
          # `year` is assumed to be defined at module level
          with open('./pkls/' + str(year) + path + '_1.pkl', 'wb') as f:
              pkl.dump(one_model, f)
     if len(t2) > 0:
         np.random.shuffle(t2)
          test2 = t2[:, 0]    # targets: first column
          train2 = t2[:, 1:]  # features: remaining columns
         # zero_model = GBR(**params_high)
         zero_model = RFR()
          zero_model.fit(train2, test2)
          with open('./pkls/' + str(year) + path + '_0.pkl', 'wb') as f:
              pkl.dump(zero_model, f)
Example 25
def __ensemble_test(type, X_train, X_test, y_train, y_test):
    if type.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif type.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif type.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif type.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError('unknown ensemble type: ' + type)
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
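The four aliases map to scikit-learn's ensemble regressors; the snippet presumably imports them along these lines (an assumption, not shown in the source):

# Assumed imports for __ensemble_test.
from sklearn.ensemble import (AdaBoostRegressor as ABR,
                              ExtraTreesRegressor as ETR,
                              GradientBoostingRegressor as GBR,
                              RandomForestRegressor as RFR)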
Example 26
def rand_for(df):
    df2 = df.filter(items=[
        'price', 'security_deposit', 'accomodates',
        'bedrooms', 'bathrooms', 'property_type',
        'room_type', 'latitude', 'longitude',
        'housing_type', 'price_bin', 'amount',
        'areas', 'Complement_of_Availability_Next_90_Days',
        'cleaning_fee'])  # a comma was missing after 'amount', silently fusing it with 'areas'
    print(df2.head(), len(df2))
    df3 = pd.DataFrame()
    # Random Forest ************************************************************************************************
    # for i in df2.price_bin.unique():
    #     t = df2[df2.price_bin==i]
    t = df2
    t = t.fillna(0)
    t = pd.get_dummies(t)
    print(t.columns, len(t))
    y = t.pop('price').values
    X = t.values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rf = RFR(n_estimators=500)
    rf.fit(X_train, y_train)
    rf_rmse = '%.2f' % np.sqrt(mse(y_test, rf.predict(X_test)))
    print('rmse:', rf_rmse)
    rf_score = '%.3f' % rf.score(X_test, y_test)
    print("Random Forest score:", rf_score)
    imp = rf.feature_importances_
    # `order` renamed from `ord`, which shadowed the builtin
    order = np.argsort(rf.feature_importances_)[::-1]
    _cols = t.columns.tolist()
    imp_cols = order[:6]
    feats = [_cols[i] for i in imp_cols]  # map the top column indices to names
    print(feats)
    x = sorted(imp, reverse=True)[:6]
    imp_feats = {feats[i]: '%.4f' % x[i] for i in range(len(feats))}
    print(imp_feats)
    tempdf = pd.DataFrame.from_dict(imp_feats, orient='index').T

    df3 = pd.concat([df3, tempdf], sort=True)  # DataFrame.append was removed in pandas 2.0
    print(df3)
    return (imp_cols, _cols, imp, imp_feats, rf_rmse, rf_score)
Example 27
    def random_forest(self):

        print('Random_Forest')

        rg = RFR(n_jobs=-1, n_estimators=100, random_state=100)
        rg.fit(self.X_train, self.y_train)

        importances = pd.DataFrame({'RF': rg.feature_importances_},
                                   index=self.X_train.columns)
        importances = _norm(importances)

        return rg, importances
Example 28
    def predict(self):
        regr_rf = RFR(max_depth=17, random_state=9, n_estimators=50, n_jobs=-1)
        regr_rf.fit(self.x_train, self.y_train)
        train_result = regr_rf.predict(self.x_train)
        test_result = regr_rf.predict(self.x_test)

        export_filename = 'RandomForestReg'
        if self.drop_feature_names:
            export_filename += '_without_' + '_'.join(self.drop_feature_names)

        BaseModel.export_prediction(test_result, export_filename)
        return (train_result, test_result)
Example 29
def fill_missing(df):
    # Put monthly income (月收入, column 4) first and drop the
    # number-of-dependents column.
    all_df = df.iloc[:, [4, 0, 1, 2, 3, 5, 6, 7, 8]]
    # .as_matrix() was removed in pandas 1.0; use .values instead
    known = all_df[all_df.月收入.notnull()].values
    unknown = all_df[all_df.月收入.isnull()].values
    X = known[:, 1:]
    Y = known[:, 0]
    rfr = RFR(random_state=0, n_estimators=200, max_depth=3)
    rfr.fit(X, Y)
    predict = rfr.predict(unknown[:, 1:]).round(0)
    df.loc[(df.月收入.isnull()), '月收入'] = predict
    return df
Example 30
def rfrcv(n_estimators, min_samples_split, max_features, max_depth):
    return cross_val_score(RFR(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               max_features=min(max_features, 0.999),
                               max_depth=int(max_depth),
                               random_state=2016,
                               n_jobs=6),
                           X,
                           y,
                           scoring=score,
                           n_jobs=3,
                           cv=3).mean()