def random_forest(train_set, test_set):
    '''
    Creates, trains and tests a random forest regressor, then writes
    results to terminal.

    Params:
        train_set: A list with training data.
        test_set: A list with test data.
    '''
    clf_energy = RFR(n_jobs=2, n_estimators=10)
    clf_happiness = RFR(n_jobs=2, n_estimators=10)

    # Fit the regressors on the Spotify statistics (columns 3:16), using the
    # mood ratings determined by another research project as targets.
    clf_energy.fit([row[3:16] for row in train_set],
                   [row[1] for row in train_set])
    clf_happiness.fit([row[3:16] for row in train_set],
                      [row[2] for row in train_set])

    result_energy = clf_energy.predict([row[3:16] for row in test_set])
    result_happiness = clf_happiness.predict([row[3:16] for row in test_set])

    # Compute the mean absolute difference between the predicted and actual
    # moods.
    energy_mean = 0.0
    happiness_mean = 0.0
    for i in range(len(test_set)):
        energy_mean += abs(float(result_energy[i]) - float(test_set[i][1]))
        happiness_mean += abs(float(result_happiness[i]) - float(test_set[i][2]))
    energy_mean /= len(test_set)
    happiness_mean /= len(test_set)

    print("Avg discrepancy - Energy: " + str(energy_mean))
    print("Avg discrepancy - Happiness: " + str(happiness_mean))
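# A minimal usage sketch for random_forest above, assuming RFR is bound at
# module level (from sklearn.ensemble import RandomForestRegressor as RFR).
# The row layout is inferred from the slicing: mood targets at indices 1 and 2,
# thirteen Spotify features in columns 3:16. The data below is synthetic.
import random

def _toy_row():
    # [track_id, energy, happiness, 13 audio features]
    return ['track_id'] + [random.random() for _ in range(15)]

rows = [_toy_row() for _ in range(100)]
random_forest(rows[:80], rows[80:])  # prints the two average discrepancies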
def rfr_fillna(df_all):
    '''
    Fills the missing values of the original table, using a random forest.

    Params:
        df_all: the original table that needs filling.
    Returns:
        df_adda (the new table), model (the imputation models),
        MinMax_1st (normalisation model 1), MinMax_2nd (normalisation model 2).
    '''
    # Partition the data and choose the dependent and independent variables
    # used for prediction.
    user_id = df_all.iloc[:, 0]
    X = df_all.iloc[:, 1:-1]
    Y = df_all.iloc[:, -1]
    X1 = X.copy()
    Y2 = X1.iloc[:, 43:]
    sex = X1.iloc[:, 0]
    X2 = X1.iloc[:, 1:43]
    # Min-max normalise the feature scales.
    MinMax_1st = MinMaxScaler().fit(X2)
    X2.iloc[:, :] = MinMax_1st.transform(X2)
    X2 = pd.concat([sex, X2], axis=1)
    # Select the best model for each column to impute.
    model = {}
    krange = range(4, 30)
    for k in tqdm(list(Y2)):
        X_train = X2[Y2[k].notnull()]
        X_test = X2[Y2[k].isnull()]
        Y_train = Y2[k][Y2[k].notnull()]
        score = []
        for i in krange:
            rfr = RFR(min_samples_split=i, n_jobs=-1)
            score_each = cvs(rfr, X_train, Y_train, cv=3, n_jobs=-1).mean()
            score.append(score_each)
        best_choose = list(krange)[np.argmax(score)]
        rfr = RFR(min_samples_split=best_choose, n_jobs=-1)
        rfr = rfr.fit(X_train, Y_train)
        model[k] = rfr
        # .loc avoids pandas' chained-assignment pitfall here.
        Y2.loc[Y2[k].isnull(), k] = rfr.predict(X_test)
    # Min-max normalise the bank-statement columns a second time.
    MinMax_2nd = MinMaxScaler().fit(Y2)
    Y2.iloc[:, :] = MinMax_2nd.transform(Y2)
    df_adda = pd.concat([X2, Y2], axis=1)
    df_adda = pd.concat([user_id, df_adda, Y], axis=1)
    return df_adda, model, MinMax_1st, MinMax_2nd
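# Hypothetical usage sketch for rfr_fillna above. It builds a tiny frame with
# the layout the function assumes (user id in column 0, sex plus 42 features
# next, bank-statement columns from position 43 of X, target last), and assumes
# the snippet's module-level imports: pd, np, tqdm, MinMaxScaler, RFR, and cvs
# as an alias for sklearn.model_selection.cross_val_score. Shapes are made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 60
cols = (['user_id', 'sex'] + ['f{}'.format(i) for i in range(42)]
        + ['bank{}'.format(i) for i in range(3)] + ['target'])
df_all = pd.DataFrame(rng.random((n, len(cols))), columns=cols)
for c in ['bank0', 'bank1', 'bank2']:
    df_all.loc[rng.choice(n, 5, replace=False), c] = np.nan  # holes to fill
df_adda, models, s1, s2 = rfr_fillna(df_all)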
def na_rf_interp(self, my_data, na_variables, features='all', rf_params=None):
    if not rf_params:
        rf_params = {}
    if isinstance(my_data, dict):
        my_data = pd.DataFrame.from_dict(my_data)
    # Parse the feature specification into a per-variable dict.
    if not isinstance(features, dict):
        features_ = {}
        for t in na_variables:
            if features == 'all':
                features_[t] = [x for x in my_data if x not in na_variables]
            elif isinstance(features, str):
                features_[t] = [features]
            else:
                features_[t] = features
        features = features_
    my_rfs = {}
    for f in na_variables:
        # A fresh regressor per variable, so fits do not overwrite each other.
        rf_ = RFR(**rf_params)
        # nans
        id_na = np.isnan(my_data[f])
        if id_na.sum() in [0, my_data[f].size]:
            continue  # nothing to interpolate
        rf_.fit(
            my_data.filter(features[f])[~id_na].values,
            my_data[f][~id_na].values,
        )
        my_data.loc[id_na, f] = rf_.predict(
            my_data.filter(features[f])[id_na].values)
        my_rfs[f] = rf_
    return my_data, my_rfs
def setup_random_forest(self):
    # Number of trees in the random forest
    n_estimators = [
        int(x) for x in np.linspace(start=20, stop=1000, num=10)
    ]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    rf = RFR()
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }
    return RandomizedSearchCV(estimator=rf,
                              param_distributions=random_grid,
                              n_iter=30,
                              cv=3,
                              verbose=0,
                              random_state=42,
                              n_jobs=12)
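# Hedged usage sketch for setup_random_forest above. `Searcher` is a
# hypothetical stand-in for the owning class (only the method matters), the
# regression data is synthetic, and the snippet's module-level imports (np,
# RFR, RandomizedSearchCV) plus a scikit-learn version that still accepts
# max_features='auto' are assumed.
from sklearn.datasets import make_regression

class Searcher:
    setup_random_forest = setup_random_forest

X, y = make_regression(n_samples=300, n_features=8, random_state=0)
search = Searcher().setup_random_forest()
search.fit(X, y)                 # 30 sampled configurations x 3 CV folds
print(search.best_params_)
best_rf = search.best_estimator_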
def _fit(self, img, dot, tags, boxConstraints=[]):
    numFeatures = img.shape[1]
    if self._method == "RandomForest":
        from sklearn.ensemble import RandomForestRegressor as RFR
        regressor = RFR(n_estimators=self._ntrees, max_depth=self._maxdepth)
        regressor.fit(img, dot)
    elif self._method == "svrBoxed-gurobi":
        regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
        regressor.fit(
            img, dot, tags,
            self.getOldBoxConstraints(boxConstraints, numFeatures))
    elif self._method == "BoxedRegressionGurobi":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitgurobi(img, dot, tags, boxConstraints)
    elif self._method == "BoxedRegressionCplex":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitcplex(img, dot, tags, boxConstraints)
    return regressor
def get_new_model(self):
    # Model-type names are hyphen-separated (e.g. "Random-Forest-Regressor"),
    # so the suffix test splits on "-".
    if (self.model_type.split("-")[-1] == "Regressor"):
        if (self.model_type == "Linear-Regressor"):
            from sklearn.linear_model import LinearRegression
            self.model = LinearRegression(**self.model_args)
        elif (self.model_type == "Support-Vector-Regressor"):
            from sklearn.svm import SVR
            self.model = SVR(**self.model_args)
        elif (self.model_type == "Decision-Tree-Regressor"):
            from sklearn.tree import DecisionTreeRegressor as DTR
            self.model = DTR(**self.model_args)
        elif (self.model_type == "Random-Forest-Regressor"):
            from sklearn.ensemble import RandomForestRegressor as RFR
            self.model = RFR(**self.model_args)
    else:
        if (self.model_type == "Logistic-Regression-Classifier"):
            from sklearn.linear_model import LogisticRegression
            self.model = LogisticRegression(**self.model_args)
        elif (self.model_type == "KNN-Classifier"):
            from sklearn.neighbors import KNeighborsClassifier as KNN
            self.model = KNN(**self.model_args)
        elif (self.model_type == "Support-Vector-Classifier"):
            from sklearn.svm import SVC
            self.model = SVC(**self.model_args)
        elif (self.model_type == "Naive-Bayes-Classifier"):
            from sklearn.naive_bayes import GaussianNB as GNB
            self.model = GNB(**self.model_args)
        elif (self.model_type == "Decision-Tree-Classifier"):
            from sklearn.tree import DecisionTreeClassifier as DTC
            self.model = DTC(**self.model_args)
        elif (self.model_type == "Random-Forest-Classifier"):
            from sklearn.ensemble import RandomForestClassifier as RFC
            self.model = RFC(**self.model_args)
def make_prediction(response, features, tr, ts):
    '''Fit a random forest on the training index `tr` and return the test
    rows of `response` with a 'Predicted' column appended.'''
    model = RFR(n_estimators=50, n_jobs=11)
    model.fit(features.loc[tr, :], response.loc[tr, 'RESPONSE'])
    results = response.loc[ts, :].copy()
    y_pr = model.predict(features.loc[ts, :])
    results['Predicted'] = y_pr
    return results
def get_feat_imps(): X_train, X_test, y_train, y_test = data_for_gridsearch() column_names = X_train.columns model = RFR(max_features='auto', max_depth=None, bootstrap=True, min_samples_leaf=5, min_samples_split=10, n_estimators=100) model = model.fit(X_train, y_train) model_params = model.get_params() feat_imps = model.feature_importances_ print('model_params', model_params) print('feat_imps', feat_imps) rmse_train, rmse_test, errors_for_plot = eval_model( model, X_train, y_train, X_test, y_test) print('RMSE train/test: ', rmse_train, rmse_test) return model_params, feat_imps, column_names
def fit_state(self, X, y_data, y_state):
    '''Fit one regressor per state: rows with y_state == 0 train clf_free,
    rows with y_state == 1 train clf_queue.'''
    self.clf_free = RFR(n_estimators=self.n_estimators,
                        criterion=self.criterion)
    self.clf_queue = RFR(n_estimators=self.n_estimators,
                         criterion=self.criterion)
    f_indices = y_state == 0
    q_indices = y_state == 1
    X_f = X[f_indices]
    y_f = y_data[f_indices]
    self.clf_free.fit(X_f, y_f)
    X_q = X[q_indices]
    y_q = y_data[q_indices]
    self.clf_queue.fit(X_q, y_q)
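# Minimal usage sketch for fit_state above. `StateModel` is a hypothetical
# shell exposing the two attributes the method reads; the data is synthetic,
# and criterion='squared_error' assumes a recent scikit-learn (older versions
# used 'mse').
import numpy as np

class StateModel:
    n_estimators = 50
    criterion = 'squared_error'
    fit_state = fit_state

rng = np.random.default_rng(0)
X = rng.random((200, 4))
y_data = rng.random(200)
y_state = rng.integers(0, 2, 200)   # 0 = free, 1 = queue
m = StateModel()
m.fit_state(X, y_data, y_state)
print(m.clf_free.predict(X[:3]), m.clf_queue.predict(X[:3]))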
def pcpower_pred_train(df_list, power_df, time_unit):
    X_np, y_np = pred_preprocess(df_list, power_df, time_unit, train=1)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(X_np)
    nrmse_best = 1000
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            # Keep the model with the lowest NRMSE seen so far. The 15-minute
            # unit keeps its unsuffixed filename; other units get a suffix.
            if time_unit == 15:
                model_path = 'dcpower/model/pred_rfr.pkl'
            else:
                model_path = 'dcpower/model/pred_rfr-{}.pkl'.format(time_unit)
            with open(model_path, 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
def lasso_vm2pc_train(df_list, power_df):
    # NOTE: despite the name, this trains a random forest regressor.
    df_list, power_df = data_preprocess(df_list, power_df)
    df_sum = vmsum2one(df_list)
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(minmax_list)
    X_minmax = minmax_scaler.transform(df_sum)
    y_np = power_df.values
    nrmse_best = 100
    ssplit = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    for train_index, test_index in ssplit.split(X_minmax, y_np):
        model = RFR()
        X_train, X_test = X_minmax[train_index, :], X_minmax[test_index, :]
        y_train, y_test = y_np[train_index], y_np[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        nrmse_tmp = nrmse(y_test, y_pred)
        if nrmse_tmp < nrmse_best:
            # Keep the best model so far; without updating nrmse_best here,
            # any later model under the threshold would overwrite it.
            with open('dcpower/model/rfr.pkl', 'wb') as mfile:
                pickle.dump(model, mfile)
            nrmse_best = nrmse_tmp
        print(model.feature_importances_, 'NRMSE:', nrmse_tmp)
def grid_search(
        data,
        estimator=RFR(n_estimators=40),
        param_grid={
            "max_depth": [2, 5, 10, 15],
            "min_samples_split": [20, 30, 40],
            "max_features": ['auto', 'sqrt', 'log2']
        },
        cv=5):
    """
    Build a model of the estimator's type with parameters chosen by
    cross-validated grid search. After cross validation, the best parameter
    combination is refit on the entire data set (train and test combined).
    Returns both the production-ready model and the grid search object.

    :param Data data: data object, requires (train/test)(Design/Target)
        attributes :py:class:`pandas.DataFrame`
    :param classifier/estimator estimator: base estimator to grid search
        :py:class:`sklearn.GridSearchCV`
    :param dict param_grid: parameter grid to search in grid search
    :param int cv: number of folds for cross validation
    :return: model grid data
    :rtype: tuple.(estimator, GridSearchCV, Data)
    """
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv)
    grid.fit(data.trainDesign, data.trainTarget)
    model = grid.best_estimator_.fit(
        pd.concat([data.trainDesign, data.testDesign]),
        pd.concat([data.trainTarget, data.testTarget]))
    return model, grid, data
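# Hypothetical usage sketch for grid_search above. `SimpleNamespace` stands in
# for the project's Data class (not shown in the snippet); the frames are
# synthetic, the module-level imports (pd, RFR, GridSearchCV) are assumed, and
# the default grid's max_features='auto' assumes a pre-1.3 scikit-learn.
from types import SimpleNamespace
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

def _frame(n):
    return pd.DataFrame(rng.random((n, 3)), columns=['a', 'b', 'c'])

data = SimpleNamespace(trainDesign=_frame(80),
                       trainTarget=pd.Series(rng.random(80)),
                       testDesign=_frame(20),
                       testTarget=pd.Series(rng.random(20)))
model, grid, _ = grid_search(data)
print(grid.best_params_)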
def __init__(self, init_states=None, init_errors=None, params_file=None):
    if params_file is None:
        # Number of trees in the random forest
        n_estimators = [
            int(x) for x in np.linspace(start=20, stop=1000, num=10)
        ]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        rf = RFR()
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap
        }
        self.rf_random = RandomizedSearchCV(
            estimator=rf,
            param_distributions=random_grid,
            n_iter=12,
            cv=3,
            verbose=1,
            random_state=42,
            n_jobs=12)
    # Fit the random search model
    if init_states is not None:
        self.train(init_states, init_errors)
def fit(data):
    print('loading dataset {}...'.format(data))
    X = np.load('../data/desc_{}.npy'.format(data))
    y = np.load('../data/labels_{}.npy'.format(data))
    print('scaling...')
    X = scale_descriptors(X)
    print('stripping...')
    X = strip_harmonics(X, n_h=30)
    print('separating...')
    X = sep_re_im(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print('shape of training data')
    print(X_train.shape)
    print('fitting model...')
    rfr = RFR(n_estimators=100, oob_score=True)
    rfr.fit(X_train, y_train)
    preds_train = rfr.predict(X_train)
    preds_test = rfr.predict(X_test)
    print('oob score')
    print(rfr.oob_score_)
    print('train and test scores')
    print(r2_score(y_train, preds_train))
    print(r2_score(y_test, preds_test))
    dump(rfr, 'rfr_{}.joblib'.format(data))
def init_data():
    # Load the data
    dataset = pd.read_excel('basutienn.xlsx')
    # Split into training data and test data
    train_data, test_data, train_target, test_target = train_test_split(
        dataset.iloc[:, 1:5],
        dataset.iloc[:, 5],
        test_size=0.3,
        random_state=0)
    rg = RFR(n_jobs=1, random_state=0, n_estimators=5)  # random forest
    rg.fit(train_data, train_target)
    pred = rg.predict(test_data)
    # Save the trained model
    joblib.dump(rg, "rf.pkl", compress=True)
    # Prediction accuracy
    print("result: ", rg.score(test_data, test_target))
    # Save the data
    data = dataset.iloc[:, 1:5].values
    target = dataset.iloc[:, 5].values
    np.save("data", data)
    np.save("target", target)
def grid_search(X, y):
    '''
    Cross-validated grid search using a Ridge regressor and a random forest
    regressor.
    '''
    pars = {
        'alpha': [
            0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1, 0.08, 0.07, 0.06, 0.05,
            0.04, 0.03, 0.02
        ]
    }
    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)
    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))
    pars = {
        'max_depth': [5, 8, 10, 20, 50, 100],
        'min_samples_split': [2, 3, 5, 10, 20]
    }
    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    # Fit before extracting the best estimator.
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
def _fit(self, image, dot, tags, boxConstraints=[]):
    img = self.normalize(image)
    if type(boxConstraints) is dict:
        boxConstraints["boxFeatures"] = self.normalize(
            boxConstraints["boxFeatures"])
    numFeatures = img.shape[1]
    if self._method == "RandomForest":
        from sklearn.ensemble import RandomForestRegressor as RFR
        regressor = RFR(n_estimators=self._ntrees,
                        max_depth=self._maxdepth)
        regressor.fit(img, dot)
    elif self._method == "svrBoxed-gurobi":
        regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
        regressor.fit(
            img, dot, tags,
            self.getOldBoxConstraints(boxConstraints, numFeatures))
    return regressor
def __init__(self): # self._vectorizer = TfidfVectorizer(stop_words='english') self._regressor = RFR(max_features='sqrt', max_depth=100, bootstrap=False, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
def rfr_cv(n_estimators, max_features, data, targets):
    # Using https://github.com/fmfn/BayesianOptimization, which proposes
    # float-valued parameters, so n_estimators must be cast to int.
    estimator = RFR(
        n_estimators=int(n_estimators),
        max_features=max_features,
    )
    cval = cross_val_score(estimator, data, targets, scoring='r2', cv=4)
    return cval.mean()
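# Hedged sketch of wiring rfr_cv into the bayes_opt package referenced above.
# The bounds and dataset are illustrative; functools.partial binds the data so
# the optimizer only sees the two hyperparameters it tunes.
from functools import partial

from bayes_opt import BayesianOptimization
from sklearn.datasets import make_regression

data, targets = make_regression(n_samples=200, n_features=10, random_state=0)
optimizer = BayesianOptimization(
    f=partial(rfr_cv, data=data, targets=targets),
    pbounds={'n_estimators': (10, 250), 'max_features': (0.1, 0.999)},
    random_state=1,
)
optimizer.maximize(init_points=2, n_iter=10)
print(optimizer.max)  # best r2 and the parameters that produced it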
class Algorithms(Enum): RandomForestRegressor = RFR() MLPRegressor = MLPR() KNeighborsRegressor = KNR() Ridge = RR() Lasso = LR() def __str__(self): return self.name
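# Brief usage note for the Algorithms enum above, assuming its aliases (RFR,
# MLPR, KNR, RR, LR) are imported as in the snippet. Each member's value is a
# single shared, unfitted estimator instance, so sklearn.base.clone is used
# here to get independent fits; the data is illustrative.
from sklearn.base import clone
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
for algo in Algorithms:
    est = clone(algo.value)       # fresh, unfitted copy of the member's value
    est.fit(X, y)
    print(algo, est.score(X, y))  # __str__ prints the member name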
def model(X_train,
          y_train,
          X_test=np.array([]),
          y_test=np.array([]),
          method="LR"):
    # X_train: model inputs for training
    # X_test: model inputs for testing
    # y_train: outputs for X_train
    # y_test: outputs for X_test
    # method: model type; the default is linear regression
    if method == "LR":
        lr = LR()
    elif method == "Ridge":
        lr = Ridge()
    elif method == "Lasso":
        lr = Lasso()
    elif method == "MLPRegressor":
        lr = MLPRegressor()
    elif method == "SVR":
        lr = SVR()
    elif method == "KNR":
        lr = KNR()
    elif method == "RFR":
        lr = RFR()
    elif method == "GBR":
        lr = GBR()
    else:
        print("unknown method")
        return False
    lr = lr.fit(X_train, y_train[:, 0])
    y_mod_train = lr.predict(X_train)
    c_train = CCC(y_train, y_mod_train[:, np.newaxis])
    c_test = -1
    if len(y_test) > 0:
        y_mod_test = lr.predict(X_test)
        c_test = CCC(y_test, y_mod_test[:, np.newaxis])
    return (lr, c_train, c_test)
def rfrcv(n_estimators, min_samples_split, max_features): val = cross_val_score(RFR(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split), max_features=min(max_features, 0.999), random_state=42), X_train, y_train, cv=2).mean() return val
def __init__(self, n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None): self.max_samples = max_samples self.max_leaf_nodes = max_leaf_nodes self.max_features = max_features self.bootstrap = bootstrap self.min_samples_split = min_samples_split self.random_state = random_state self.min_samples_leaf = min_samples_leaf self.ccp_alpha = ccp_alpha self.min_impurity_decrease = min_impurity_decrease self.criterion = criterion self.n_jobs = n_jobs self.max_depth = max_depth self.warm_start = warm_start self.oob_score = oob_score self.verbose = verbose self.n_estimators = n_estimators self.min_weight_fraction_leaf = min_weight_fraction_leaf self.min_impurity_split = min_impurity_split self.model = RFR( ccp_alpha=self.ccp_alpha, bootstrap=self.bootstrap, min_impurity_decrease=self.min_impurity_decrease, min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_split=self.min_impurity_split, max_depth=self.max_depth, min_samples_split=self.min_samples_split, max_leaf_nodes=self.max_leaf_nodes, n_estimators=self.n_estimators, min_samples_leaf=self.min_samples_leaf, max_features=self.max_features, oob_score=self.oob_score, max_samples=self.max_samples, verbose=self.verbose, warm_start=self.warm_start, n_jobs=self.n_jobs, criterion=self.criterion, random_state=self.random_state)
def split(model, data, t1, t2, path):
    # Route every row to t1 or t2 according to the classifier's prediction
    # on its feature part (column 0 holds the target).
    for k in data:
        for row in data[k]:
            if model.predict(row[1:].reshape(1, -1)):
                if len(t1) > 0:
                    t1 = np.r_[t1, np.array(row).reshape(1, -1)]
                else:
                    t1 = np.array(row).reshape(1, -1)
            else:
                if len(t2) > 0:
                    t2 = np.r_[t2, np.array(row).reshape(1, -1)]
                else:
                    t2 = np.array(row).reshape(1, -1)
    print(len(t1) + len(t2))
    if len(t1) > 0:
        np.random.shuffle(t1)
        y1 = t1[:, 0]    # targets
        X1 = t1[:, 1:]   # features
        one_model = RFR()
        one_model.fit(X1, y1)
        with open('./pkls/' + str(year) + path + '_1.pkl', 'wb') as f:
            pkl.dump(one_model, f)
    if len(t2) > 0:
        np.random.shuffle(t2)
        y2 = t2[:, 0]
        X2 = t2[:, 1:]
        zero_model = RFR()
        zero_model.fit(X2, y2)
        with open('./pkls/' + str(year) + path + '_0.pkl', 'wb') as f:
            pkl.dump(zero_model, f)
def __ensemble_test(kind, X_train, X_test, y_train, y_test):
    # `kind` rather than `type`, to avoid shadowing the builtin.
    if kind.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif kind.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif kind.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif kind.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError('unknown ensemble type: ' + kind)
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
def rand_for(df):
    df2 = df.filter(items=[
        'price', 'security_deposit', 'accomodates', 'bedrooms', 'bathrooms',
        'property_type', 'room_type', 'latitude', 'longitude', 'housing_type',
        'price_bin', 'amount', 'areas',
        'Complement_of_Availability_Next_90_Days', 'cleaning_fee'
    ])
    print(df2.head(), len(df2))
    df3 = pd.DataFrame()

    # Random Forest
    t = df2.fillna(0)
    t = pd.get_dummies(t)
    print(t.columns, len(t))
    y = t.pop('price').values
    X = t.values
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rf = RFR(n_estimators=500)
    rf.fit(X_train, y_train)
    rf_rmse = '%.2f' % np.sqrt(mse(y_test, rf.predict(X_test)))
    print('rmse:', rf_rmse)
    rf_score = '%.3f' % rf.score(X_test, y_test)
    print("Random Forest score:", rf_score)

    # Top six features by importance.
    imp = rf.feature_importances_
    order = np.argsort(imp)[::-1]
    _cols = t.columns.tolist()
    imp_cols = order[:6]
    feats = [_cols[i] for i in imp_cols]
    print(feats)
    imp_feats = {f: '%.4f' % imp[i] for f, i in zip(feats, imp_cols)}
    print(imp_feats)
    tempdf = pd.DataFrame.from_dict(imp_feats, orient='index').T
    df3 = pd.concat([df3, tempdf], sort=True)
    print(df3)
    return (imp_cols, _cols, imp, imp_feats, rf_rmse, rf_score)
def random_forest(self): print('Random_Forest') rg = RFR(n_jobs=-1, n_estimators=100, random_state=100) rg.fit(self.X_train, self.y_train) importances = pd.DataFrame({'RF': rg.feature_importances_}, index=self.X_train.columns) importances = _norm(importances) return rg, importances
def predict(self): regr_rf = RFR(max_depth=17, random_state=9, n_estimators=50, n_jobs=-1) regr_rf.fit(self.x_train, self.y_train) train_result = regr_rf.predict(self.x_train) test_result = regr_rf.predict(self.x_test) export_filename = 'RandomForestReg' if self.drop_feature_names: export_filename += '_without_' + '_'.join(self.drop_feature_names) BaseModel.export_prediction(test_result, export_filename) return (train_result, test_result)
def fill_missing(df):
    # Column 4 holds 月收入 (monthly income); the number-of-dependents column
    # is excluded from the predictors.
    all_df = df.iloc[:, [4, 0, 1, 2, 3, 5, 6, 7, 8]]
    known = all_df[all_df.月收入.notnull()].values
    unknown = all_df[all_df.月收入.isnull()].values
    X = known[:, 1:]
    Y = known[:, 0]
    rfr = RFR(random_state=0, n_estimators=200, max_depth=3)
    rfr.fit(X, Y)
    predict = rfr.predict(unknown[:, 1:]).round(0)
    df.loc[(df.月收入.isnull()), '月收入'] = predict
    return df
def rfrcv(n_estimators, min_samples_split, max_features, max_depth): return cross_val_score(RFR(n_estimators=int(n_estimators), min_samples_split=int(min_samples_split), max_features=min(max_features, 0.999), max_depth=int(max_depth), random_state=2016, n_jobs=6), X, y, scoring=score, n_jobs=3, cv=3).mean()