def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
    print "entering return best rf regressor function"
    if df.shape[0] < 10000:
        num_samples = df.shape[0]
    else:
        num_samples = int(df.shape[0] * 0.7)

    print "Sample dataframe"
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)

    # TODO: figure out a way to vary this somehow
    """
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(1, 15),
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    """
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(1, 15),
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True]}

    clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)

    print "starting hyperparameter search"
    clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist, num_iterations)

    print "sample data for fitting model"
    # train a new regressor on the entire dataset
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples=df.shape[0])

    clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
                                      max_depth=best_params["max_depth"],
                                      min_samples_leaf=best_params["min_samples_leaf"],
                                      min_samples_split=best_params["min_samples_split"],
                                      bootstrap=best_params["bootstrap"],
                                      max_features=best_params["max_features"])

    print "Fitting Random Forest Regressor"
    clf_final.fit(X, y)
    return clf_final, column_list_for_sampled
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir='coordination/', md=None):
    """
    Build a coordination predictor for a given element from compositional structure data
    of structures containing that element. Will return a model trained on all data,
    a mean_absolute_error score, and a table of true vs. predicted values.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]
    if len(df) < 4:
        print 'Not enough data for ' + element
        return None, None, None

    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})

    rfr.fit(X, y)
    return rfr, t, round(acc, 2)
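# Usage sketch for buildCoordinationTreeRegressor above -- a minimal example, not
# from the original source: it assumes a 'coordination/Fe.csv' file exists and the
# predictor column names below are made up for illustration.
predictors = ['fracFe', 'fracO', 'avgElectronegativity']  # hypothetical columns
model, table, mae = buildCoordinationTreeRegressor(predictors, 'Fe')
if model is not None:
    print table.head()
    print 'cross-validated MAE:', mae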
def set_missing_ages(df):
    # Pull the numeric features into a Random Forest Regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split passengers into known-age and unknown-age groups
    # (.values replaces the deprecated .as_matrix())
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # y is the target age
    y = known_age[:, 0]
    # X holds the feature values
    X = known_age[:, 1:]

    # Fit a RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])

    # Fill the original missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges

    return df, rfr
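# Usage sketch for set_missing_ages above -- a minimal example, not from the
# original source: it assumes a Titanic-style CSV with the five columns the
# function expects (Age, Fare, Parch, SibSp, Pclass); the path is hypothetical.
train_df = pd.read_csv('train.csv')
train_df, age_model = set_missing_ages(train_df)
print(train_df.Age.isnull().sum())  # 0 after imputation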
def main():
    fi = open('25-75_microcap_list.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = get_data(symbols, n=30, flag=1, blag=12)
    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)
    train = train.dropna(axis=0)
    test = test.dropna(axis=0)

    print 'Fitting\n'
    m = RandomForestRegressor(n_estimators=250, n_jobs=1)
    m.fit(train.ix[:, 6:], train.ix[:, 5])

    print 'Predicting\n'
    preds = m.predict(test.ix[:, 5:])

    result = test.ix[:, :4]
    result['Prediction'] = preds
    result = result.sort('Prediction', ascending=False)
    print result.head()
    result.to_csv('trade_result.csv', sep=',', index=False)
def buildTreeRegressor(predictorColumns, structurestable='structures.csv', targetcolumn='c_a', md=None):
    """
    Build a random forest-regressor model to predict some structure feature from compositional data.
    Will return the model trained on all data, a mean_absolute_error score, and a table of
    true vs. predicted values.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})

    rfr.fit(X, y)
    return rfr, t, round(acc, 2)
def get_preds(features, trees=3000, depth=19):
    # features is the number of latent features that I want the NMF to run on

    # Create dataframes
    df = get_nmf(k=features)
    df_full = add_yahoo_to_df(df)
    df_train = add_dummies(df_full)  # Why aren't you using df_full?
    df_test = get_nmf('data_wednesday', k=features)  # put in folder name where the json data is
    df_test_full = add_yahoo_to_df(df_test)
    df_test_full = add_dummies(df_test_full)

    # Create models
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)

    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Get X and y values
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11']))

    # Run models
    classifier_preds = rf_class.predict(X_classify)
    classifier_accuracy = accuracy_score(classifier_preds, y_classify)
    regressor_preds = rf_regress.predict(X_regress)
    regressor_mse = mean_squared_error(regressor_preds, y_regress)

    # I want to return the number of features, k, along with the accuracy of the classifier
    # and the MSE of the regressor. This will give me an idea of how well things are doing
    # based on the number of features.
    return [features, classifier_accuracy, regressor_mse]
def train_sklearn_forest(XAlltr, XAllcv, yAlltr, yAllcv, trees=20):
    errors = []
    models = []

    X = XAlltr
    Xcv = XAllcv

    print "training sklearn forest"

    for feature in range(np.shape(yAlltr)[1]):
        y = yAlltr[:, feature]
        ycv = yAllcv[:, feature]

        # train a random forest with different number of trees and plot error
        # print "training forest %d" % trees
        # NOTE: the regularized configuration below was immediately overwritten
        # in the original; only the plain forest is actually used
        # clf = RandomForestRegressor(n_estimators=trees, min_samples_leaf=30, max_depth=20)
        clf = RandomForestRegressor(n_estimators=trees)
        clf.fit(X, y)

        pred = clf.predict(X)
        err = pred_error(y, pred, feature)

        predcv = clf.predict(Xcv)
        errcv = pred_error(ycv, predcv, feature)

        print [trees, feature, err, errcv]
        errors.append((trees, feature, err, errcv))
        models.append(clf)

    return models, errors
def pred_pH(train, val, test, all_vars, loop):
    data = (val, test, train)

    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func=f_regression, k=1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()
    chosen = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if pH_lassoed_vars[x]:
            lass_only.append(all_vars[x])

    # random forest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])

    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # combination (the forest predictions appear twice, giving them double weight)
    models = ['pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds']
    name = 'pH_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'pH')
def pipeline():
    val = data[data.watch == 1]
    val_a_b = val[['item_id', 'store_code', 'a', 'b']]
    val_y = val.label
    val_x = val.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train = data[(data.watch != 1) & (data.watch != 0)]
    train_y = train.label
    a = list(train.a)
    b = list(train.b)
    train_weight = []
    for i in range(len(a)):
        train_weight.append(min(a[i], b[i]))
    train_weight = np.array(train_weight)
    train_x = train.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train_x.fillna(train_x.median(), inplace=True)
    val_x.fillna(val_x.median(), inplace=True)

    model = RandomForestRegressor(n_estimators=500, max_depth=5, max_features=0.6,
                                  n_jobs=-1, random_state=1024)
    # train
    model.fit(train_x, train_y, sample_weight=train_weight)

    # predict val set
    val_a_b['pred'] = model.predict(val_x)
    val_a_b['y'] = val_y
    cost = cal_cost(val_y.values, val_a_b.pred.values, val_a_b.a.values, val_a_b.b.values)
    val_a_b.to_csv('val_{0}.csv'.format(cost[1]), index=None)
def stepwise_best_features_per_cluster(X, Y, all_feature_metadata):
    best_features_per_cluster = {}
    for c in sorted(X['cluster'].unique()):
        seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c].ALSFRS_slope
        print "cluster:", c, "with size:", seg_X.shape, "with mean target:", seg_Y.mean(), "std:", seg_Y.std()
        seg_Y = seg_Y.fillna(seg_Y.mean())

        model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000)
        #model = LassoCV(cv=5)
        model = model.fit(seg_X, seg_Y)
        print "best we can do with all features:", np.sqrt(np.mean((model.predict(seg_X) - seg_Y) ** 2))
        print "using model:", model

        selected_fams = set()
        selected_derived = set()
        for i in range(6):
            score_per_family = {}
            t1 = time.time()
            for family, fm in all_feature_metadata.iteritems():
                if family not in selected_fams:
                    X_feature_fam = seg_X[list(selected_derived) + list(fm["derived_features"])]
                    model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000)
                    #model = LassoCV(cv=5)
                    model = model.fit(X_feature_fam, seg_Y)
                    score_per_family[family] = np.sqrt(np.mean((model.predict(X_feature_fam) - seg_Y) ** 2))
            t_lasso_cv = time.time() - t1
            best_fam = sorted(score_per_family.items(), key=operator.itemgetter(1))[0]
            print "adding best family:", best_fam, "time:", t_lasso_cv
            selected_fams.add(best_fam[0])
            selected_derived.update(all_feature_metadata[best_fam[0]]["derived_features"])
        best_features_per_cluster[c] = list(selected_fams)
    return best_features_per_cluster
def do_regression(df, j, i, k):
    # input is a pandas dataframe with columns as needed below
    # output is a regression object trained to the data in the input dataframe

    # convert dataframe info into a vector
    y = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k),
               'count'].astype(int).values
    x_1 = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k),
                 'humidity'].astype(int).values
    x_2 = df.loc[(df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k),
                 'temp'].astype(int).values
    # materialize the zip so this also works on Python 3, where zip is lazy
    x = list(zip(x_1, x_2))

    ## Create linear regression object
    #regr = linear_model.LinearRegression()
    # create random forest object, should include all parameters
    regr = RandomForestRegressor(n_estimators=100)
    #forest = DecisionTreeRegressor(max_depth = 4)

    ## Train the model using the training sets
    regr.fit(x, y)
    return regr
def fill_missing_age(df):
    # Pull the numeric features into a Random Forest Regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    #print age_df

    # Split passengers into known-age and unknown-age groups
    # (.values replaces the deprecated .as_matrix())
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # print "known_age......."
    # print known_age
    # print "unknown age ........"
    # print unknown_age

    # target age
    y = known_age[:, 0]
    # feature values
    x = known_age[:, 1:]

    # fit a RandomForestRegressor
    RFR = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    RFR.fit(x, y)

    # predict the unknown ages with the fitted model
    predictedAge = RFR.predict(unknown_age[:, 1:])

    # fill the original missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAge

    return df, RFR
def RFscore_one(x, y, id):
    folds = 3

    print "RFscore " + id

    r = range(len(x))
    np.random.shuffle(r)
    x = x[r]
    y = y[r]

    x = (x - np.mean(x)) / np.std(x)
    y = (y - np.mean(y)) / np.std(y)

    x = np.array(x, ndmin=2)
    y = np.array(y, ndmin=2)
    x = x.T
    y = y.T

    # compute_importances was dropped from sklearn; feature_importances_ is always available
    rf = RandomForestRegressor(n_estimators=50, verbose=0, n_jobs=1,
                               min_samples_split=10, random_state=1)
    fit = rf.fit(x, y)
    s = fit.score(x, y)

    cv = cross_validation.KFold(len(x), n_folds=folds, indices=False)
    score = 0
    median = dist(y)

    for traincv, testcv in cv:
        fit = rf.fit(x[traincv], y[traincv])
        score += fit.score(x[testcv], y[testcv])

    score /= folds
    score /= median
    return score
def cross_val(seq, ft):
    n_folds = 10
    X, y = load_train_data(seq, ft)
    print('%d-fold cross validation. Dataset: %d samples, %d features' % (n_folds, X.shape[0], X.shape[1]))
    kf = KFold(len(y), n_folds=n_folds)
    n_est = range(30, 110, 20)

    results = []
    for n_estimators in n_est:
        scores = []
        for i, (train, test) in enumerate(kf):
            rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=mp.cpu_count())
            # the (default) score for regression is the r2 coefficient of determination
            # (i.e., how much of the variance in y is explained by the model)
            # https://www.khanacademy.org/math/probability/regression/regression-correlation/v/r-squared-or-coefficient-of-determination
            rf.fit(X[train], y[train])
            if False:
                y_pred = rf.predict(X[test])
                score = mean_squared_error(y_pred, y[test])
            else:
                score = rf.score(X[test], y[test])
            scores.append(score)
        scores = np.array(scores)
        print("n_estimators=%d; accuracy (R^2 score): %0.2f (+/- %0.2f)" %
              (n_estimators, scores.mean(), scores.std() * 2))
        results.append([seq, ft, X.shape[0], n_estimators, scores.mean(), scores.std() * 2])
    return results
def fit(self, X, y, **kwargs):
    for key, value in kwargs.iteritems():
        if key in self.INITPARAMS.keys():
            self.INITPARAMS[key] = value
    model = RandomForestRegressor(**self.INITPARAMS)
    model.fit(X, y)
    self.model = model
def regression(X_train, y_train, X_test, y_test):
    """
    Train the regressor from Scikit-Learn.
    """
    # Random forest regressor w/ param optimization
    params = {'n_estimators': 1000, 'criterion': 'mse', 'max_depth': 20,
              'min_samples_split': 1,   # 'estimators': 400, depth: 20
              'min_samples_leaf': 1, 'max_features': 2,
              'bootstrap': True, 'oob_score': False,   # 'max_features': 'log2'
              'n_jobs': 32, 'random_state': 0, 'verbose': 0,
              'min_density': None, 'max_leaf_nodes': None}
    if config.DEBUG:
        params['verbose'] = 1

    regr = RandomForestRegressor(**params)

    # Train the model using the training sets
    regr.fit(X_train, y_train)
    return regr

    # NOTE: everything below is unreachable because of the early return above
    # Plot the results
    save_semeval_data.plot_results(regr, params, X_test, y_test, feature_names)
    if config.DEBUG:
        # Show the mean squared error
        print("Residual sum of squares: %.2f" % np.mean((regr.predict(X_test) - y_test) ** 2))
        # Explained variance score: 1 is perfect prediction
        print('Variance score: %.2f' % regr.score(X_test, y_test))
    return regr
def random_forest(X_train, y_train, y_test, X_test, num_trees=100):
    model = RandomForestRegressor(n_estimators=num_trees, oob_score=True)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # reuse the prediction instead of calling predict twice
    mean_squared_error = mse(y_test, prediction)
    r2 = model.score(X_test, y_test)
    return (mean_squared_error, r2)
class RandomForestModel(Model):
    """
    random forest model
    """
    def __init__(self, *argv, **args):
        super(RandomForestModel, self).__init__(*argv)
        self.rf = RandomForestRegressor(**args)

    def pretreat_feature(self):
        # pre-handle the feature data
        pass

    def train(self):
        # train the samples
        self.rf.fit(self.x, self.y)

    def assess(self):
        # assess the regression model
        error = 0.0
        for j in range(len(self.test_x)):
            pre_val = self.predict(self.test_x[j])
            error += (pre_val - self.test_y[j]) ** 2
        print 'Training Error: ', error

    def predict(self, x):
        # predict the output for x
        return self.rf.predict(x)

    def validate(self):
        # use cross-validation to choose the best meta-parameter
        pass
def main():
    train = pd.read_csv('../train.csv', parse_dates=['datetime'])
    train['hour'] = pd.DatetimeIndex(train['datetime']).hour
    train['weekday'] = pd.DatetimeIndex(train['datetime']).weekday
    train['isweekend'] = 0
    train.loc[(train['weekday'] == 5) | (train['weekday'] == 6), 'isweekend'] = 1

    test = pd.read_csv('../test.csv', parse_dates=['datetime'])
    test['hour'] = pd.DatetimeIndex(test['datetime']).hour
    test['weekday'] = pd.DatetimeIndex(test['datetime']).weekday
    test['isweekend'] = 0
    test.loc[(test['weekday'] == 5) | (test['weekday'] == 6), 'isweekend'] = 1

    results = pd.DataFrame(columns=['datetime', 'count'])
    for hour, test_subset in test.groupby(test['hour']):
        train_subset = train[train['hour'] == hour]
        model = RandomForestRegressor(n_estimators=100)
        model.fit(np.array(get_features(train_subset)), np.array(train_subset['count']))
        predictions = model.predict(np.array(get_features(test_subset)))
        dt = test_subset['datetime']
        predictions = pd.Series(predictions, index=dt.index)
        res = pd.concat([dt, predictions], axis=1)
        res.columns = ['datetime', 'count']
        results = pd.concat([results, res])

    results['count'] = results['count'].astype('int64')
    # sort_values replaces the deprecated DataFrame.sort
    results = results.sort_values('datetime')
    results.to_csv('../submissions/seventhSubmission.csv', index=False)
def main():
    fi = open('45-165caps.txt', 'r')
    symbols = []
    for i in fi:
        symbols.append(i.strip())
    #symbols = symbols[0:6]

    train, test = build_data(symbols, n=200, flag=1, blag=20)
    train = train.replace([np.inf, -np.inf], np.nan)
    test = test.replace([np.inf, -np.inf], np.nan)
    train = train.dropna(axis=0)
    test = test.dropna(axis=0)
    #print train.head().T
    #print test.head().T

    print 'Fitting\n'
    m = RandomForestRegressor(n_estimators=500, n_jobs=10)
    m.fit(train.ix[:, 5:], train.ix[:, 4])

    print 'Predicting\n'
    preds = m.predict(test.ix[:, 4:])

    result = test.ix[:, :4]
    result['Prediction'] = preds
    result = result.sort('Prediction', ascending=False)
    print result.head()
    result.to_csv('trade_result.csv', sep=',', index=False)
def randomforest(data, targets, num, fnum):
    """ 7:1205 """
    # compute_importances was dropped from sklearn; feature_importances_ is always available
    model = RandomForestRegressor(n_estimators=num, verbose=0, oob_score=True,
                                  n_jobs=10, criterion="mse", max_features=fnum)
    model.fit(data, targets)
    return model
def random_forest_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training features
    :param train_y: training target
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the code for validation curve
    :param l_curve: run the code for learning curve
    :param get_model: run the code
    :return: the predicted values, learning curve, validation curve
    """
    rf = RandomForestRegressor(n_estimators=20, criterion='mse', max_features='auto', max_depth=10)
    if get_model:
        print "Fitting RF..."
        rf.fit(train_x, np.log(train_y + 1))
        print rf.score(train_x, np.log(train_y + 1))
        rf_pred = np.exp(rf.predict(pred_x)) - 1.0
        Votes = rf_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_rf = np.concatenate((Id, Votes), axis=1)
        # create submission csv for Kaggle
        np.savetxt("submission_rf.csv", submission_rf, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if v_curve:
        train_y = np.log(train_y + 1.0)
        plot_validation_curve(RandomForestRegressor(), "Random Forest: Validation Curve(No: of trees)",
                              train_x, train_y, 'n_estimators', [5, 10, 20, 50, 100])
    if l_curve:
        train_y = np.log(train_y + 1.0)
        plot_learning_curve(RandomForestRegressor(), "Random Forest: Learning Curve",
                            train_x, train_y)
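# Call sketch for random_forest_regressor above -- a minimal example, not from the
# original source: train_x/pred_x are assumed numeric feature matrices and train_y
# nonnegative vote counts (the np.log(train_y + 1) transform needs train_y > -1,
# and counts are naturally >= 0); all names here are placeholders.
random_forest_regressor(train_x, train_y, pred_x, review_id,
                        v_curve=True, l_curve=False, get_model=True)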
def do_rf(filename):
    df, Y = create_merged_dataset(filename)
    rf = RandomForestRegressor(n_estimators=100)
    X = df.drop(['driver', 'trip'], 1)
    rf.fit(X, Y)
    probs = rf.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
def refit_from_scratch(self):
    """ Create a new model directly from the database, rather
        than rely on the one saved from last time."""
    # In the background fit a much larger random forest.
    self.threaded_fit = ThreadedFit()
    self.threaded_fit.signal_finished.connect(self.__init__)
    self.threaded_fit.start()

    temp_model = RandomForest(max_features="sqrt", n_jobs=-1)
    temp_enc = CountVectorizer()
    X = []  # binary matrix of the presence of tags
    Z = []  # additional numerical data
    Y = []  # target (to predict) values
    db_size = self.db.size()
    for data in self.db.yield_some(250):
        feedback = data["feedback"]
        tags = data["tags"]
        if feedback and tags:
            Y.append(feedback)
            X.append(" ".join(tags))
            Z.append(self.fmt_numerical(data))
    X = temp_enc.fit_transform(X)
    X = hstack((X, coo_matrix(Z)))
    self.allX = X
    pca = PCA(min(X.shape[0], 200))
    reduced_X = pca.fit_transform(X.todense())
    temp_model.fit(reduced_X, Y)

    self.pca = pca
    self.model = temp_model
    self.enc = temp_enc
def train_with_features(self, features):
    X = self.data_folder.truncate(self.A, features)
    rfc = RandomForestRegressor()
    rfc.fit(X, self.target)
    return rfc
def round2(X, y):
    # Set parameters
    min_score = {}
    for tree in [50, 100, 200, 500]:
        for feature in ['auto', 'log2']:
            model = RandomForestRegressor(n_estimators=tree, max_features=feature)
            n = len(y)

            # Perform 5-fold cross validation
            scores = []
            kf = KFold(n, n_folds=5, shuffle=True)

            # Calculate root mean squared error for train/test for each fold
            for train_idx, test_idx in kf:
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]
                model.fit(X_train, y_train)
                prediction = model.predict(X_test)
                rmse = np.sqrt(mean_squared_error(y_test, prediction))
                scores.append(rmse)
            if len(min_score) == 0:
                min_score['estimator'] = tree
                min_score['max_feature'] = feature
                min_score['scores'] = scores
            else:
                if np.mean(scores) < np.mean(min_score['scores']):
                    min_score['estimator'] = tree
                    min_score['max_feature'] = feature
                    min_score['scores'] = scores

            print "Estimator:", tree
            print "Max Features:", feature
            print scores
            print np.mean(scores)
    return min_score
def test_rrf_vs_sklearn_reg(self):
    """Test R vs. sklearn on boston housing dataset. """
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor

    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                        test_size=0.2, random_state=13)

    n_samples, n_features = X_train.shape
    mtry = int(np.floor(0.3 * n_features))
    # do 100 trees
    r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                            'mtry': mtry, 'corr.bias': False,
                            'sampsize': n_samples, 'random_state': 1234})
    r_rf.fit(X_train, y_train)
    y_pred = r_rf.predict(X_test)
    r_mse = mean_squared_error(y_test, y_pred)

    p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, bootstrap=False,
                                 max_features=mtry, random_state=1)
    p_rf.fit(X_train, y_train)
    y_pred = p_rf.predict(X_test)
    p_mse = mean_squared_error(y_test, y_pred)
    print('%.4f vs %.4f' % (r_mse, p_mse))
    # should be roughly the same (7.6 vs. 7.2)
    np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
def rf_regressor(self):
    # X and y were undefined in the original; assuming the instance stores
    # a sparse feature matrix and target as self.X / self.y
    X = self.X.toarray()  # Convert X from sparse to array
    y = self.y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test).round(2)
def train_year(train_fea, trees):
    values = train_fea['SaleYear'].values
    years = sorted(list(set(values)))
    rfs = []
    for i in range(0, len(years)):
        print 'train model %d' % (years[i])
        # compute_importances was dropped from sklearn; feature_importances_ is always available
        rf = RandomForestRegressor(n_estimators=trees, n_jobs=1)
        y = train_fea[train_fea['SaleYear'] == years[i]]
        y_fea = y.copy()
        del y_fea['SalePrice']
        rf.fit(y_fea, y["SalePrice"])
        rfs.append(rf)

    errors = None
    for i in range(1, len(years)):
        pairs = get_pairs(years, i)
        for p in pairs:
            print 'compare %d, %d' % (p[0], p[1])
            y1 = train_fea[train_fea['SaleYear'] == p[0]]
            y2 = train_fea[train_fea['SaleYear'] == p[1]]
            y1_fea, y2_fea = y1.copy(), y2.copy()
            del y1_fea['SalePrice']
            del y2_fea['SalePrice']
            rf = rfs[years.index(p[0])]
            y2_p = rf.predict(y2_fea)
            y2_r = np.array([v for v in y2['SalePrice']])
            error_rates = np.array(map(lambda x, y: math.fabs(x - y) / y, y2_p, y2_r))
            if errors is None:
                errors = pd.DataFrame({'dist': i, 'mean': error_rates.mean(),
                                       'var': error_rates.var(), 'std': error_rates.std()},
                                      index=[i])
            else:
                errors = errors.append(pd.DataFrame({'dist': i, 'mean': error_rates.mean(),
                                                     'var': error_rates.var(), 'std': error_rates.std()},
                                                    index=[i]))

    errors_list = []
    for i in range(1, len(years)):
        errors_list.append(errors.ix[i]['mean'].mean())
    return rfs, errors_list
def _fit(self, image, dot, tags, boxConstraints=[]):
    img = self.normalize(image)
    if type(boxConstraints) is dict:
        boxConstraints["boxFeatures"] = self.normalize(boxConstraints["boxFeatures"])
    numFeatures = img.shape[1]

    if self._method == "RandomForest":
        from sklearn.ensemble import RandomForestRegressor as RFR
        regressor = RFR(n_estimators=self._ntrees, max_depth=self._maxdepth)
        regressor.fit(img, dot)
    elif self._method == "svrBoxed-gurobi":
        regressor = RegressorGurobi(C=self._C, epsilon=self._epsilon)
        regressor.fit(img, dot, tags, self.getOldBoxConstraints(boxConstraints, numFeatures))
    elif self._method == "BoxedRegressionGurobi":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitgurobi(img, dot, tags, boxConstraints)
    elif self._method == "BoxedRegressionCplex":
        regressor = RegressorC(C=self._C, epsilon=self._epsilon)
        regressor.fitcplex(img, dot, tags, boxConstraints)
    return regressor
### Decision Tree Regression ###
################################
tree_regressor = DecisionTreeRegressor(criterion="mse")
tree_regressor.fit(X, y)

# Predict
tree_pred = tree_regressor.predict([[6.5]])
print('The predicted salary of a person at 6.5 Level with Decision Tree Regression is ', tree_pred)

################################
### Random Forest Regression ###
################################
forest_regressor = RandomForestRegressor(n_estimators=300, random_state=0)
forest_regressor.fit(X, y)

# Predict
forest_pred = forest_regressor.predict([[6.5]])
print('The predicted salary of a person at 6.5 Level with Random Forest Regression is ', forest_pred)

################################
### Visualizations ###
################################
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color="red")
#                  cv=n_folds, n_jobs=n_jobs, verbose=verbose_grid)
#gs = gs.fit(X_new, y)
#print(gs.scorer_)
#print('best score from grid search: %.3f' % gs.best_score_)
#print(gs.best_params_)
#best = gs.best_params_
#n_estimators_gs = best['n_estimators']
#max_depth_gs = best['max_depth']
#max_features_gs = best['max_features']

# run some cross validation
print('running cross validation to determine accuracy of model...')
scores = []
splits = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
for train, test in splits.split(X):
    tree.fit(X[train], y[train])
    predicted = tree.predict(X[test])
    score = mean_absolute_error(y[test], predicted)
    scores.append(score)
print(scores)

# determine which features to write to the file
n_estimators = n_estimators_def
max_depth = max_depth_def
max_features = max_features_def
score = np.mean(scores)

print('writing the data to file...')
params = (n_folds, n_estimators, max_depth, max_features, score)
write_hyperparams(params, hyperParamFile)

n_folds, n_estimators, max_depth, max_features, \
print('Initial number of examples:', len(histograms))
refined_histo, refined_label = train_and_refine(histograms, pesos, 'refined')

print('RANSAC')
ransac = RANSACRegressor(LinearRegression(), min_samples=100)
ransac.fit(histograms, labels)
labels_predicted = ransac.predict(histograms[-200:])
labels_test = labels[-200:]
evaluate(labels_test, labels_predicted, labels_test, labels_predicted)
plt.scatter(labels_test, labels_predicted)
plt.show()

print('Random Forest')
forest = RandomForestRegressor()
forest.fit(histograms[:800], labels[:800])
labels_predicted = forest.predict(histograms[-200:])
labels_test = labels[-200:]
evaluate(labels_test, labels_predicted, labels_test, labels_predicted)
plt.scatter(labels_test, labels_predicted)
plt.show()

print()
print('REMOVING OUTLIERS')
cont = len(refined_histo)
print('number of examples', cont)
train = (cont // 5 + 1) * 4
test = (cont // 5 + 1)
print(train, test)

print('Quadratic - without outliers')
quadratic = PolynomialFeatures(degree=2)
def validateRF():
    """
    run KFOLD method for regression
    """
    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 206
    y = 207

    #empty dataframe for model validation
    df = pd.DataFrame(columns=['tg', 'lon', 'lat', 'num_year',
                               'num_95pcs', 'corrn', 'rmse'])

    #looping through
    for tg in range(x, y):
        os.chdir(dir_in)

        #filter only .csv files
        tgNames = []
        for file in glob.glob("*.csv"):
            tgNames.append(file)

        tg_name = sorted(tgNames)[tg]
        print(tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        pred_surge['date'] = pd.DataFrame(list(map(time_stamp,
                                                   pred_surge['date'])),
                                          columns=['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            rf = RandomForestRegressor(n_estimators=50, random_state=101,
                                       min_samples_leaf=1)
            rf.fit(X_train, y_train)

            #predictions
            predictions = rf.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)),
            #                       pd.DataFrame(np.array(y_test))],
            #                      axis=1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis=0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                print()
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0] - 1] -
                     pred_surge['date'][0]).days / 365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc, 'avg_corr = ',
              np.mean(metric_corr), ' - avg_rmse (m) = ',
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years,
                               num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year',
                          'num_95pcs', 'corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 17 11:57:18 2020

@author: edith
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, [1]].values
y = dataset.iloc[:, [2]].values
t = np.array([6.5])
t = t.reshape(1, 1)

from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(criterion='mse', n_estimators=10, random_state=0)
RFR.fit(X, y)
y_pred = RFR.predict(t)

X_x = np.arange(min(X), max(X), 0.01)
X_x = X_x.reshape(len(X_x), 1)
plt.scatter(X, y, color='red')
plt.scatter(t, y_pred, color='black')
plt.plot(X_x, RFR.predict(X_x), color='blue')
    mse = mean_squared_error(y_test, bagging.predict(X_test))
    estimators[i] = step_factor * (i + 1)
    bagging_mse[i] = mse

# Estimate the Random Forest MSE over the full number
# of estimators, across a step size ("step_factor")
for i in range(0, axis_step):
    print("Random Forest Estimator: %d of %d..." % (
        step_factor * (i + 1), n_estimators)
    )
    rf = RandomForestRegressor(
        n_estimators=step_factor * (i + 1),
        n_jobs=n_jobs,
        random_state=random_state
    )
    rf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, rf.predict(X_test))
    estimators[i] = step_factor * (i + 1)
    rf_mse[i] = mse

# Estimate the AdaBoost MSE over the full number
# of estimators, across a step size ("step_factor")
for i in range(0, axis_step):
    print("Boosting Estimator: %d of %d..." % (
        step_factor * (i + 1), n_estimators)
    )
    boosting = AdaBoostRegressor(
        DecisionTreeRegressor(),
        n_estimators=step_factor * (i + 1),
        random_state=random_state,
        learning_rate=0.01
X = sc.fit_transform(X)

import keras
from keras.utils.np_utils import to_categorical
y_binary = to_categorical(y)

'''
model = DecisionTreeRegressor(max_depth=10)
cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
'''

model = RandomForestRegressor(max_depth=15, n_estimators=25, n_jobs=8)
model.fit(X, y_binary)

feats = {}
for feature, importance in zip(df[['start_treat', 'doxy', 'ilads', 'buhner', 'cowden', 'liposomal',
                                   'other_herbs', 'vitaminD', 'supp', 'oil', 'sugar-free', 'gluten-free',
                                   'dairy-free', 'bioresonance', 'antimicrobial', 'oxygen', 'cannabis_oil',
                                   'binaural', 'tobacco', 'alcohol', 'coffee', 'marijuana', 'other_stim',
                                   'num_antibiotics', 'method_antibiotics']], model.feature_importances_):
    feats[feature] = importance  # add the name/value pair

scores = cross_val_score(model, X, y_binary, cv=3, scoring='neg_mean_absolute_error')
np.mean(scores), np.std(scores)

# adding feature importances
MostImportant = model.feature_importances_
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)

# predicting
model.predict(X)
y_pred = model.predict(X)
plt.bar(range(train.shape[1]), importances[index], color="r", yerr=std[index], align="center")
plt.xticks(range(train.shape[1]), index, rotation='vertical')
plt.xlim([-1, train.shape[1]])
ax.set_xticklabels(ordered_labels)
plt.show()


# In[8]:

# Retrain the model on best settings
best_forest = RandomForestClassifier(n_estimators=500, criterion='entropy')
best_forest.fit(train, adoptionSpeed_train)
forest_predicted = best_forest.predict(test)
print(accuracy_score(adoptionSpeed_test, forest_predicted))


# In[11]:

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators=1000, random_state=42, criterion='mse')
# Train the model on training data
rf.fit(train, adoptionSpeed_train)
forest_regr = rf.predict(test)
# score the regressor's own predictions (the original mistakenly reused
# the classifier's forest_predicted here)
print(mean_squared_error(adoptionSpeed_test, forest_regr))
                 how='left')
train.fillna(0, inplace=True)

feats = [
    'weights_sum', 'weights_mean', 'order_weight_max', 'order_weight_count',
    'order_weight_sum', 'time_weight_max', 'time_weight_sum', 'days_sum',
    'days_max', 'days_min', 'days_count', 'mean_gap', 'weights',
    'product_user_reorder_ratio', 'product_reorder_ratio',
    'product_user_ratio', 'aisle_reorder_ratio', 'dept_reorder_ratio'
]
gc.collect()

print("running random forest..........")
rf = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1)
rf.fit(train[feats], train.y)
train['y_rf'] = rf.predict(train[feats])
gc.collect()

print("running xgboost..........")
model = XGBRegressor()
model.fit(train[feats], train.y)
train['y_xgb'] = model.predict(train[feats])
gc.collect()


def getProduct(row):
    l = int(np.ceil(row['average_product_per_order']))
    return ' '.join([str(x) for x in row['product_id'][:l]])
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 21 17:29:04 2018

@author: Ashlin
"""
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np

mydata = pd.read_csv("Position_Salaries.csv")
X = mydata.iloc[:, 1:2]
y = mydata.iloc[:, -1]

regressor = RandomForestRegressor(max_features='sqrt', n_estimators=300, criterion='mse', random_state=0)
regressor.fit(X, y)

plt.title("Regression")
plt.xlabel("X")
plt.ylabel("Predicted Value")
X_grid = np.arange(min(X.values), max(X.values), 0.01)
X_grid = X_grid.reshape(len(X_grid), 1)
plt.scatter(X, y, color='blue', label="Actual")
plt.plot(X_grid, regressor.predict(X_grid), color='red', label="RFR")
plt.legend()
plt.show()

# predict expects a 2D array, so wrap the single level in [[...]]
prediction = regressor.predict([[6.5]])
print("The predicted value for the Salary is %0.4f" % (prediction[0]))
class RandomTree(object):
    def __init__(self, train, test):
        '''
        prepare data
        :param train: training data
        :param test: testing data
        '''
        self.train = train
        self.test = test
        # prepare data
        for i in range(1, 7):
            self.train['RSSI_' + str(i)] = abs(self.train['RSCP_' + str(i)] - self.train['EcNo_' + str(i)])
            self.test['RSSI_' + str(i)] = abs(self.test['RSCP_' + str(i)] - self.test['EcNo_' + str(i)])
        self.total = self.test.append(self.train, ignore_index=True)
        self.tests = self.total.ix[:, [
            u'SRNCID', u'BestCellID', u'RSSI_1', u'RSSI_2', u'RSSI_3',
            u'RSSI_4', u'RSSI_5', u'RSSI_6'
        ]]
        self.regressor_y = self.total[['Longitude', 'Latitude']]
        self.classifier_y = self.total['GridID']
        self.regressor = RandomForestRegressor(random_state=0, n_estimators=10)
        self.classifier = RandomForestClassifier(n_estimators=10)
        self.classifier_train_x, self.classifier_test_x, self.classifier_train_y, self.classifier_test_y = \
            train_test_split(self.tests, self.classifier_y, test_size=0.2)
        self.regressor_train_x, self.regressor_test_x, self.regressor_train_y, self.regressor_test_y = \
            train_test_split(self.tests, self.regressor_y, test_size=0.2)
        # calculate center mark
        self.min_location = utm.from_latlon(self.total['Latitude'].min(), self.total['Longitude'].min())
        self.max_location = utm.from_latlon(self.total['Latitude'].max(), self.total['Longitude'].max())
        width = self.max_location[1] - self.min_location[1]
        height = self.max_location[0] - self.min_location[0]
        self.grid_x = math.ceil(width / 20)
        self.grid_y = math.ceil(height / 20)

    def findCenter(self, num):
        '''
        find the center
        :param num: grid id
        :return: center mark
        '''
        dr = math.ceil(num / self.grid_x)
        dc = num % self.grid_y
        c_x = self.min_location[1] + dc * 20 - 10
        c_y = self.min_location[0] + dr * 20 - 10
        return [c_x, c_y]

    def distance(self, lo1, la1, lo2, la2):
        '''
        calculate distance
        :param lo1: longitude1
        :param la1: latitude1
        :param lo2: longitude2
        :param la2: latitude2
        :return: distance
        '''
        dlon = lo2 - lo1
        dlat = la2 - la1
        return math.sqrt(dlon * dlon + dlat * dlat)

    def predict(self):
        '''
        train and predict
        :return: regressor_res and classifier_res
        '''
        self.classifier_train_x, self.classifier_test_x, self.classifier_train_y, self.classifier_test_y = \
            train_test_split(self.tests, self.classifier_y, test_size=0.2)
        self.regressor_train_x, self.regressor_test_x, self.regressor_train_y, self.regressor_test_y = \
            train_test_split(self.tests, self.regressor_y, test_size=0.2)
        self.regressor.fit(self.regressor_train_x, self.regressor_train_y)
        self.classifier.fit(self.classifier_train_x, self.classifier_train_y)
        regressor_res = self.regressor.predict(self.regressor_test_x)
        classifier_res = self.classifier.predict(self.classifier_test_x)
        r_score = self.regressor.score(self.regressor_test_x, self.regressor_test_y)
        c_score = self.classifier.score(self.classifier_test_x, self.classifier_test_y)
        print 'regressor score: ' + str(r_score)
        print 'classifer score: ' + str(c_score)
        return regressor_res, classifier_res, r_score, c_score

    def compare(self):
        '''
        compare
        :return:
        '''
        regressor_com = []
        classifier_com = []
        r_score = []
        c_score = []
        for i in range(0, 10):
            regressor_res, classifier_res, r, c = self.predict()
            r_score.append(r)
            c_score.append(c)
        self.regressor_test_y.index = range(0, len(self.regressor_test_y))
        regressor_res = pd.DataFrame(regressor_res, columns=['PLO', 'PLA'])
        c_ls = []
        for i in range(len(classifier_res)):
            # use the i-th prediction (the original indexed classifier_res[0],
            # which would give every row the same grid center)
            center = self.findCenter(classifier_res[i])
            c_ls.append(utm.to_latlon(center[1], center[0],
                                      self.min_location[2], self.min_location[3]))
        c_ls = pd.DataFrame(c_ls, columns=['PLA', 'PLO'])
        r_eval = pd.concat([self.regressor_test_y, regressor_res], axis=1)
        c_eval = pd.concat([self.regressor_test_y, c_ls], axis=1)
        for i in range(0, len(r_eval)):
            r_dis = self.distance(r_eval.loc[i, 'Longitude'], r_eval.loc[i, 'Latitude'],
                                  r_eval.loc[i, 'PLO'], r_eval.loc[i, 'PLA'])
            c_dis = self.distance(c_eval.loc[i, 'Longitude'], c_eval.loc[i, 'Latitude'],
                                  c_eval.loc[i, 'PLO'], c_eval.loc[i, 'PLA'])
            regressor_com.append(r_dis)
            classifier_com.append(c_dis)
        # r_score = np.average(r_score)
        # c_score = np.average(c_score)
        plt.plot(r_score, color='red')
        plt.xlabel('time')
        plt.ylabel('average')
        plt.show()
        plt.plot(c_score, color='blue')
        plt.xlabel('time')
        plt.ylabel('average')
        plt.show()
        regressor_com.sort()
        classifier_com.sort()
        plt.plot(regressor_com, color='red')
        plt.xlabel('index')
        plt.ylabel('distance')
        plt.show()
        plt.plot(classifier_com, color='blue')
        plt.xlabel('index')
        plt.ylabel('distance')
        plt.show()
class Solution(object):
    def __init__(self):
        self.dataframe_all = su.load()

    def setup_training(self):
        '''
        Fits a regression model to the training data.
        '''
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        for train_index, test_index in split.split(self.dataframe_all, self.dataframe_all["income_cat"]):
            self.strat_train_set = self.dataframe_all.loc[train_index]
            self.strat_test_set = self.dataframe_all.loc[test_index]
        self.dataframe_all = self.strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
        self.feature_labels = self.strat_train_set["median_house_value"].copy()

    def preprocess(self):
        self.prepared_data = prep.process_pipeline(self.dataframe_all)

    def predict_values(self):
        '''
        Makes predictions using a fit classifier based on F1 score.
        '''
        self.forest_reg = RandomForestRegressor(random_state=42)
        self.forest_reg.fit(self.prepared_data, self.feature_labels)
        price_predictions = self.forest_reg.predict(self.prepared_data)
        forest_mse = mean_squared_error(self.feature_labels, price_predictions)
        forest_rmse = np.sqrt(forest_mse)
        print(" Forest RMSE ", forest_rmse)
        forest_scores = cross_val_score(self.forest_reg, self.prepared_data,
                                        self.feature_labels,
                                        scoring="neg_mean_squared_error", cv=10)
        forest_rmse_scores = np.sqrt(-forest_scores)
        print("Scores:", forest_rmse_scores)
        print("Mean:", forest_rmse_scores.mean())
        print("Standard deviation:", forest_rmse_scores.std())

    def grid_search(self):
        param_grid = [
            # try 12 (3×4) combinations of hyperparameters
            {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
            # then try 6 (2×3) combinations with bootstrap set as False
            {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
        ]
        forest_reg = self.forest_reg
        # train across 5 folds, that's a total of (12+6)*5=90 rounds of training
        grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                                   scoring='neg_mean_squared_error',
                                   return_train_score=True)
        grid_search.fit(self.prepared_data, self.feature_labels)
        return grid_search.best_estimator_

    def test(self, final_model):
        self.final_model = final_model
        X_test = self.strat_test_set.drop("median_house_value", axis=1)
        y_test = self.strat_test_set["median_house_value"].copy()
        X_test_prepared = prep.process_pipeline(X_test)
        final_predictions = final_model.predict(X_test_prepared)

        some_data = X_test.iloc[:10]
        some_labels = y_test[:10]
        some_data_prepared = prep.process_pipeline(some_data)
        print("Predictions:", final_model.predict(some_data_prepared))
        print("Labels:", list(some_labels))

        final_mse = mean_squared_error(y_test, final_predictions)
        final_rmse = np.sqrt(final_mse)
        print(" Final RMSE ", final_rmse)
# to fit the model with and without some variables
# Reordering our dataset
X = X[[
    'Absolute Magnitude', 'Est Dia in M(average)',
    'Relative Velocity km per sec', 'Miss Dist.(kilometers)',
    'Minimum Orbit Intersection', 'Eccentricity', 'Semi Major Axis',
    'Inclination', 'Asc Node Longitude', 'Perihelion Distance',
    'Perihelion Arg', 'Perihelion Time', 'Mean Anomaly'
]]

# Using Random Forest feature importance to select the most important features
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=1, max_depth=10)
model.fit(X, y)

features = X.columns
importances = model.feature_importances_
indices = np.argsort(importances)[-1:-4:-1]

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# We can clearly see in the feature importance graph that there are just three
# variables which contribute more than 96% to the target; all the other
# variables contribute less than 1% each, so we will keep only those three:
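# A short follow-up sketch of that last step, keeping only the three top-ranked
# columns; 'indices' and 'features' are the arrays computed above, and the
# variable names below are ours, not the original author's.
top_features = [features[i] for i in indices]
X_reduced = X[top_features]
print(X_reduced.columns.tolist())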
age = 43
f'Hello {name.upper()}, you are {age}'
# 1.1 we can even store the PATH of a directory and use {PATH} to read files
# 1.2 notes on read_csv: use low_memory and parse_dates every time.
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False, parse_dates=['saledate'])

# 2. a trick to display long column lists
df_raw.tail().transpose()

# 3. Fix stationarity problem: log transformation
df_raw.SalePrice = np.log(df_raw.SalePrice)

# 4. a fast way to initiate an ML model
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)

# 5. Fastai tools
# 5.1 strip datetimes and assign them to different columns
add_datepart(df_raw, 'saledate')

# 6. Clean data
# 6.1 Access date parts (you can access any part after the .dt)
df_raw.saledate.dt
# 6.2 Access categorical parts (here it returns all categories)
df_raw.UsageBand.cat.categories
# 6.2.1 We can reorder the categories (to make the order of the corresponding numerics more meaningful)
df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)
Y_train_pred = plr.predict(X_train)
Y_test_pred = plr.predict(X_test)

print(plr.score(X_test, Y_test))

# Already good. Our model predicts the cost of treatment of patients well.
# I think we could limit ourselves to creating two or three polynomial features,
# but the data set is so small that we went the easy way.
# And finally try RandomForestRegressor. I've never used this algorithm in regression analysis.

# In[ ]:

forest = RandomForestRegressor(n_estimators=100,
                               criterion='mse',
                               random_state=1,
                               n_jobs=-1)
forest.fit(x_train, y_train)
forest_train_pred = forest.predict(x_train)
forest_test_pred = forest.predict(x_test)

print('MSE train data: %.3f, MSE test data: %.3f' % (
    mean_squared_error(y_train, forest_train_pred),
    mean_squared_error(y_test, forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
    r2_score(y_train, forest_train_pred),
    r2_score(y_test, forest_test_pred)))

# In[ ]:

pl.figure(figsize=(10, 6))
class RandomForest:
    def __init__(self, criterion, max_features, max_depth,
                 min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, bootstrap, max_leaf_nodes,
                 min_impurity_decrease, random_state=None, n_jobs=1):
        self.n_estimators = self.get_max_iter()
        self.criterion = criterion
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.bootstrap = bootstrap
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.estimator = None
        self.time_limit = None

    @staticmethod
    def get_max_iter():
        return 100

    def get_current_iter(self):
        return self.estimator.n_estimators

    def fit(self, X, y, sample_weight=None):
        from sklearn.ensemble import RandomForestRegressor

        if self.estimator is None:
            self.n_estimators = int(self.n_estimators)
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

            if self.max_features not in ("sqrt", "log2", "auto"):
                max_features = int(X.shape[1] ** float(self.max_features))
            else:
                max_features = self.max_features

            self.bootstrap = check_for_bool(self.bootstrap)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_impurity_decrease = float(self.min_impurity_decrease)

            # initial fit with warm_start=True, so that later calls can grow
            # the ensemble incrementally instead of retraining from scratch
            self.estimator = RandomForestRegressor(
                n_estimators=self.get_max_iter(),
                criterion=self.criterion,
                max_features=max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                bootstrap=self.bootstrap,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)
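# Usage sketch for the RandomForest wrapper above -- a minimal example, not from
# the original source. It assumes the check_none/check_for_bool helpers accept
# string-encoded values (as the parsing in fit() suggests); the hyperparameter
# values and X_train/y_train/X_test are illustrative placeholders.
rf = RandomForest(criterion='mse', max_features=0.5, max_depth='None',
                  min_samples_split=2, min_samples_leaf=1,
                  min_weight_fraction_leaf=0.0, bootstrap='True',
                  max_leaf_nodes='None', min_impurity_decrease=0.0,
                  random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)    # first call builds the warm_start estimator
preds = rf.predict(X_test)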
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)
print("Train Lasso r2-score is {}".format(r2))
print("Train Lasso RMSE is {}".format(rmse))

# lasso feature selection
sel = SelectFromModel(lasso)
sel.fit(X_train_chi_sel, y_train)
selected_feat = X_train_chi_sel[:, sel.get_support()]
X_train_selected = sel.transform(X_train_chi_sel)
X_test_selected = sel.transform(test_chi_sel[:, :-1])
print("datasets transformed to {} features...".format(X_train_selected.shape[1]))

# Random Forest
clf = RandomForestRegressor(random_state=0)
clf.fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)
r2 = metrics.r2_score(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
print("\nRANDOM FOREST")
print('selected features by lasso: {}'.format(X_train_selected.shape[1]))
print("Test Random forest r2-score is {}".format(r2))
print("Test Random forest RMSE is {}".format(rmse))

y_pred = clf.predict(X_train_selected)
r2 = metrics.r2_score(y_train, y_pred)
rmse = metrics.mean_squared_error(y_train, y_pred, squared=False)
print("Train Random forest r2-score is {}".format(r2))
param1 = {'n_estimators': [100, 500, 50]}
model1 = GridSearchCV(estimator=rf, param_grid=param1, scoring='neg_mean_squared_error', cv=5)
model1.fit(windspeed_trainX, windspeed_trainY)
model1.best = model1.best_params_
print('model1 best param:', model1.best_params_)
print('model1 best score:', model1.best_score_)

param2 = {'max_depth': [5, 10, 15], 'min_samples_split': [10, 5, 2]}
model2 = GridSearchCV(estimator=RandomForestRegressor(random_state=10, n_estimators=450),
                      param_grid=param2, scoring='neg_mean_squared_error', cv=5)
model2.fit(windspeed_trainX, windspeed_trainY)
model2.best = model2.best_params_
print('model2 best param:', model2.best_params_)
print('model2 best score:', model2.best_score_)

# predict with the best parameters
speed_model = RandomForestRegressor(n_estimators=450, random_state=10,
                                    max_depth=10, min_samples_split=5)
speed_model.fit(windspeed_trainX, windspeed_trainY)
windspeed_testY = speed_model.predict(windspeed_testX)
data.loc[data.windspeed == 0, 'windspeed'] = windspeed_testY

# feature distributions after filling in the missing wind speeds
fig, axes = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.3, hspace=0.5)
sn.distplot(data['temp'], ax=axes[0, 0])
sn.distplot(data['atemp'], ax=axes[0, 1])
sn.distplot(data['humidity'], ax=axes[1, 0])
sn.distplot(data['windspeed'], ax=axes[1, 1])
axes[0, 0].set(xlabel='temp', title='temperature distribution')
axes[0, 1].set(xlabel='atemp', title='feels-like temperature distribution')
axes[1, 0].set(xlabel='humidity', title='humidity distribution')
axes[1, 1].set(xlabel='windspeed', title='wind speed distribution')
plt.savefig('修正后分布分析.png')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
locs, labels = plt.xticks()
plt.title("Confusion matrix (Decision tree)", fontsize=15)

# Random forest
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]
train_features, val_features, train_target, val_target = train_test_split(
    features, target, random_state=0)
forest_model = RandomForestRegressor(n_estimators=100, random_state=0)
forest_model.fit(train_features, train_target)
melb_preds = forest_model.predict(val_features)
print('MAE_random_forrest:')
# compare predictions against the validation target (the original compared
# melb_preds with itself, which is always 0)
MAE_RF = mean_absolute_error(val_target, melb_preds)
print(MAE_RF)

# random forest - cross validation
heart_features = [
    'Age', 'Gender', 'Chest_Pain', 'Resting_BP', 'Cholesterol', 'Fasting_BS',
    'RECG', 'Max_Heart_Rate', 'Exercise_Ang', 'ST_Depression', 'ST_Segmen',
    'Major_Vessels', 'Thalassemia'
]
features = heart_data[heart_features]
my_pipeline = Pipeline(
    steps=[('preprocessor', SimpleImputer()),
           ('model',
png = graph.create_png()
graph.write_png("decision_tree.png")
im = Image.open('decision_tree.png')
#im.show()

# split train and test sets, build a decision tree and measure its accuracy
data_train, data_test, target_train, target_test = \
    train_test_split(housing.data, housing.target, test_size=0.1, random_state=42)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
accuracy_dtr = dtr.score(data_test, target_test)
print('Decision tree accuracy:', accuracy_dtr)

# random forest
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
accuracy_rfr = rfr.score(data_test, target_test)
print('Random forest accuracy:', accuracy_rfr)

# choose suitable parameters
tree_param_grid = {
    'min_samples_split': list((3, 6)),
    'n_estimators': list((50, 100))
}
# 'min_samples_split': list((3, 6, 9)): which of 3, 6, 9 works best for
# min_samples_split? cv=5 means 5-fold cross-validation
grid = GridSearchCV(RandomForestRegressor(), param_grid=tree_param_grid, cv=5)
grid.fit(data_train, target_train)
cv_results_ = grid.cv_results_
best_params_ = grid.best_params_
best_score_ = grid.best_score_
print(cv_results_)
def Random_forest_regress(X_train, X_test, y_train, y_test, CARE_df, n_estimators, name):
    regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=43,
                                      min_samples_leaf=2, max_features="sqrt",
                                      max_depth=12, bootstrap=True)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    logit_roc_auc = roc_auc_score(np.where(y_test > 0, 1, 0), y_pred)

    plt.figure()
    plt.plot(y_test, y_pred, 'o')
    plt.xlabel("Test Set")
    plt.ylabel("Prediction - binary")
    plt.title(f'{name} - results')
    plt.show()
    # plt.savefig(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}-results-RandomForest.jpeg')

    logit_roc_auc = roc_auc_score(np.where(y_test > 0, 1, 0), y_pred)
    fpr, tpr, thresholds = roc_curve(np.where(y_test > 0, 1, 0), y_pred)

    # # export to excel
    # df = pd.DataFrame(data={'fpr': fpr, 'tpr': tpr, 'threshold': thresholds})
    # df.to_excel(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}_randomForestValues.xlsx', index=False)

    CARE_predict = regressor.predict(CARE_df)
    accuracy = []
    specificity = []
    sensitivity = []
    pred_yes = []
    percent_yes = []
    for threshold in thresholds:
        tn, fp, fn, tp = confusion_matrix(np.where(y_test > 0, 1, 0),
                                          np.where(y_pred > threshold, 1, 0).reshape(-1)).ravel()
        accuracy_score = (tn + tp) / (tn + fp + fn + tp)
        specificity_score = tn / (tn + fp)
        sensitivity_score = tp / (tp + fn)
        accuracy.append(accuracy_score)
        specificity.append(specificity_score)
        sensitivity.append(sensitivity_score)
        pred_yes.append(sum(np.where(CARE_predict > threshold, 1, 0)))
        percent_yes.append((sum(np.where(CARE_predict > threshold, 1, 0))) / len(CARE_predict))

    df = pd.DataFrame(data={'thresholds': thresholds, 'specificity': specificity,
                            'sensitivity': sensitivity, 'pred_yes': pred_yes,
                            'percent_yes': percent_yes})
    df.to_excel(f'/home/michal/MYOR Dropbox/R&D/Allergies Product Development/Prediction/Algorithm_Beta/18_01_2021_CARE_results/{name}_CARE_values_forest_1.xlsx', index=False)

    index_80 = np.argwhere(np.array(sensitivity) > 0.8)[0][0]
    index_65 = np.argwhere(np.array(sensitivity) > 0.65)[0][0]

    plt.figure()
    plt.plot(fpr, tpr, label='AUC = %0.2f' % logit_roc_auc)
    plt.plot(sensitivity, specificity, label="recall vs. specificity")
    plt.plot(sensitivity[np.argmax(accuracy)], specificity[np.argmax(accuracy)], 'o')
    plt.text(sensitivity[np.argmax(accuracy)] - 0.1, specificity[np.argmax(accuracy)] - 0.1,
             f'Threshold for max\naccuracy={round(thresholds[np.argmax(accuracy)], 2)}')
    plt.plot(sensitivity[index_80], specificity[index_80], 'o')
    plt.text(sensitivity[index_80] - 0.1, specificity[index_80] - 0.1,
             f'recall={round(sensitivity[index_80], 2)}, spec={round(specificity[index_80], 2)}\n Threshold={round(thresholds[index_80], 2)}')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Random Forest - {name}\n #of trees={n_estimators}, Max accuracy={round(max(accuracy), 2)}')
    plt.legend(loc="lower right")
    plt.show()
def routes():
    chosenroute = request.form.get('chosenroute')
    chosenorigin = request.form.get('chosenorigin')
    chosendestination = request.form.get('chosendestination')
    chosenday = request.form.get('chosenday')
    chosentime = request.form.get('chosentime')
    chosentemp = request.form.get('chosentemp')
    chosenhumid = request.form.get('chosenhumid')
    chosenpres = request.form.get('chosenpres')

    # Run the prediction model
    dataframe = pd.read_csv('cleangps.csv')
    array = dataframe.values
    X = array[:, 0:7]
    Y = array[:, 7]
    test_size = 0.33
    seed = 7
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Fit the model on 67% of the data (33% is held out)
    model = RandomForestRegressor()
    model.fit(X_train, Y_train)

    # Save the model to disk
    filename = 'finalized_model.sav'
    pickle.dump(model, open(filename, 'wb'))

    # Extract the numeric parts of the form inputs
    chosend = float(re.search(r'\d+', chosenday).group())
    chosent = float(re.search(r'\d+', chosentime).group())
    chosenro = float(re.search(r'\d+', chosenroute).group())
    chosenro1 = str(re.search(r'\d+', chosenroute).group())
    chosenorig = float(re.search(r'\d+', chosenorigin).group())
    chosendest = float(re.search(r'\d+', chosendestination).group())

    # Collect the rows matching the chosen route, time and day
    data = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosent and X[i][3] == chosend:
            data.append(X[i])

    # Load the model from disk and predict the average time between adjacent stops
    loaded_model = pickle.load(open(filename, 'rb'))
    result = loaded_model.predict(data)
    seconds = sum(result) // len(result)

    # Count the number of stops between origin and destination
    df = pd.read_csv('stops.csv')
    arr = df.values
    matching_stops = []  # renamed from `list` to avoid shadowing the builtin
    for j in range(len(arr)):
        if arr[j][0] == chosenorig or arr[j][0] == chosendest:
            matching_stops.append(arr[j])
    route_stops = []
    for i in range(len(matching_stops)):
        if matching_stops[i][4] == chosenro1:
            route_stops.append(matching_stops[i])
    nums = abs(route_stops[0][6] - route_stops[1][6])
    second = seconds * nums
    times = str(datetime.timedelta(seconds=second))

    # Alternative: predict the time between origin and destination directly
    # stops_idx = []
    # for i in range(len(data)):
    #     if data[i][1] == chosenorig:
    #         stops_idx.append(i)
    #     if data[i][1] == chosendest:
    #         stops_idx.append(i)
    # result = loaded_model.predict(data[stops_idx[0]:stops_idx[1]])
    # time = sum(result) // 60

    # Repeat the prediction for one hour later and one hour earlier
    chosentime1 = chosent + 1
    chosentime2 = chosent - 1

    data1 = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosentime1 and X[i][3] == chosend:
            data1.append(X[i])
    result1 = loaded_model.predict(data1)
    second1 = (sum(result1) // len(result1)) * nums
    time1 = str(datetime.timedelta(seconds=second1))

    # One hour earlier (the original filtered on chosentime1 here too, which looks
    # like a copy-paste slip; chosentime2 matches the stated intent)
    data2 = []
    for i in range(0, len(X)):
        if X[i][0] == chosenro and X[i][2] == chosentime2 and X[i][3] == chosend:
            data2.append(X[i])
    result2 = loaded_model.predict(data2)
    second2 = (sum(result2) // len(result2)) * nums
    time2 = str(datetime.timedelta(seconds=second2))

    return render_template("display.html", chosenroute=chosenroute,
                           chosenorigin=chosenorigin,
                           chosendestination=chosendestination, chosent=chosent,
                           chosentime1=chosentime1, chosentime2=chosentime2,
                           times=times, time1=time1, time2=time2)
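# A minimal sketch (not from the original app): retraining and pickling the forest
# on every request is expensive. A common pattern is to train once (offline or at
# startup) and reuse the fitted model inside the route. The names below are
# illustrative assumptions, not the original app's API.
import pickle

with open('finalized_model.sav', 'rb') as f:  # trained and saved ahead of time
    MODEL = pickle.load(f)

def predict_segment_time(rows):
    """Predict the average per-segment travel time for pre-filtered feature rows."""
    preds = MODEL.predict(rows)
    return sum(preds) / len(preds)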
#import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load data
datas = pd.read_csv("maaslar.csv")
x = datas.iloc[:, 1:2]
y = datas.iloc[:, 2:]

from sklearn.ensemble import RandomForestRegressor

# n_estimators: how many decision trees will be created
rfr = RandomForestRegressor(n_estimators=10, random_state=0)
rfr.fit(x, y.values.ravel())  # ravel() flattens the single-column target

# predict() expects a 2-D array with one row per sample
print(rfr.predict([[6.6]]))
print(rfr.predict([[11]]))

z = x + 0.5
k = x - 0.5
plt.scatter(x, y, color="red")
plt.plot(x, rfr.predict(x), color="blue")
plt.plot(x, rfr.predict(z), color="green")
plt.plot(x, rfr.predict(k), color="yellow")
plt.show()

from sklearn.metrics import r2_score
print(r2_score(y, rfr.predict(x)))
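# A minimal sketch (not from the original): plotting on a dense grid makes the
# piecewise-constant shape of tree-ensemble predictions visible, instead of
# connecting predictions only at the training points (which the z/k offsets above
# only approximate).
import numpy as np

x_grid = np.arange(x.values.min(), x.values.max(), 0.01).reshape(-1, 1)
plt.scatter(x, y, color="red")
plt.plot(x_grid, rfr.predict(x_grid), color="blue")
plt.show()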
# Compare actual vs. predicted values
df_output = pd.DataFrame({'Actual': test_y, 'Predicted': pred})
df_output

# Calculate mean absolute percentage error (MAPE): each absolute error is divided
# by its actual value before averaging (the original divided the scalar MAE by
# the whole test_y vector, which is not MAPE)
mape = 100 * np.mean(np.abs(test_y - pred) / np.abs(test_y))

# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy of Linear Regression:', round(accuracy, 2), '%.')

###################################################################################################################
# random forest model
###################################################################################################################
model = RandomForestRegressor()
model.fit(train_x, train_y)

# Get the predictions on the test data
pred = model.predict(test_x)

# Calculate mean absolute percentage error (MAPE)
mape = 100 * np.mean(np.abs(test_y - pred) / np.abs(test_y))

# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy of Random Forest Regressor:', round(accuracy, 2), '%.')

###################################################################################################################
# XGBoost Model
###################################################################################################################
XGBModel = xgb.XGBRegressor()
XGBModel.fit(train_x, train_y, verbose=False)
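# A minimal sketch (not from the original): scikit-learn 0.24+ ships this metric
# as mean_absolute_percentage_error (returned as a fraction), which avoids
# hand-rolling the formula.
from sklearn.metrics import mean_absolute_percentage_error

mape = 100 * mean_absolute_percentage_error(test_y, pred)
print('Accuracy of Random Forest Regressor:', round(100 - mape, 2), '%.')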
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
regr.fit(X, y)
print(regr.feature_importances_)
print(regr.predict([[0, 0, 0, 0]]))

# Predict on a few random 4-feature samples
for i in range(20):
    sample = list(np.round(np.random.random(4), 2))
    print(sample, ' ', np.round(regr.predict([sample]), 2))
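# A minimal sketch (not from the original): ranking the features by importance
# makes the two informative features from make_regression stand out explicitly.
order = np.argsort(regr.feature_importances_)[::-1]
for idx in order:
    print(f'feature {idx}: importance {regr.feature_importances_[idx]:.3f}')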
            'RAD': RAD,
            'TAX': TAX,
            'PTRATIO': PTRATIO,
            'B': B,
            'LSTAT': LSTAT}
    features = pd.DataFrame(data, index=[0])
    return features

df = user_input_features()

# Main Panel

# Print specified input parameters
st.header('Specified Input parameters')
st.write(df)
st.write('---')

# Build Regression Model
model = RandomForestRegressor()
model.fit(X, Y)

# Apply Model to Make Prediction
prediction = model.predict(df)

st.header('Prediction of MEDV')
st.write(prediction)
st.write('---')
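# A minimal sketch (not from the original app): Streamlit reruns the whole script
# on every widget interaction, so the forest above is retrained each time.
# Caching the fitted model avoids that; st.cache_resource assumes Streamlit >= 1.18.
@st.cache_resource
def get_model():
    m = RandomForestRegressor()
    m.fit(X, Y)
    return m

model = get_model()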
# Split data into x and y
x = ds.iloc[:, :-1].values
y = ds.iloc[:, 1].values

# Divide the dataset into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1 / 4, random_state=0)

# Fitting Random Forest Regression to the dataset
# (note: the model is fit on the full data, so the split above only feeds the
# scatter plot below and y_pred is not a true held-out evaluation)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(x, y)

# Predicting test results
y_pred = regressor.predict(x_test)

# Visualising the Random Forest Regression results (higher resolution)
X_grid = np.arange(min(x), max(x), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(x_train, y_train, color='green')
plt.scatter(x_test, y_test, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Random Forest Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
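# A minimal sketch (not from the original): for an honest test score, fit on the
# training split only and evaluate on the held-out quarter.
regressor_cv = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_cv.fit(x_train, y_train)
print('Held-out R^2:', regressor_cv.score(x_test, y_test))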
df_train_feature, df_test_feature, df_train_goal, df_test_goal = train_test_split(
    x, y, test_size=0.33, random_state=42)

# Regression model candidates
#clf = GradientBoostingRegressor(learning_rate=0.01, n_estimators=300)
#clf = LinearRegression()
clf = RandomForestRegressor(n_estimators=400)
#clf = Ridge(alpha=0.03)
#clf = Lasso()
#clf = SVR()

# Neural network alternative: two hidden layers, 28 and 16 neurons
# model = Sequential()
# model.add(Dense(28, activation='relu', input_dim=10))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1))
# model.compile(optimizer='adam', loss='mse')
# model.fit(df_train_feature, df_train_goal, epochs=1000, batch_size=4, verbose=2)

clf.fit(df_train_feature, df_train_goal)

# Validation data set (columns: AQI index, pollutant levels, max/min temperature)
x_value = df_value[['AQI指数', 'PM2.5', 'PM10', 'So2', 'No2', 'Co', 'O3', '最高气温', '最低气温']]
#x_value = df_value[['AQI指数', 'So2', 'No2', 'O3', '最低气温']]
y_value = df_value['急诊人次']  # emergency-visit counts

# Predict on the validation set
prediction = clf.predict(x_value)
prediction = pd.DataFrame(prediction)

# Reset the validation-set index so it aligns with the predictions
y_value = y_value.reset_index()
y_value = y_value['急诊人次']

# Evaluation metric
r2 = r2_score(y_value, prediction)
print(r2)

df_wucha = pd.concat([prediction, y_value], axis=1)
df_wucha.columns = ['预测急诊人次', '实际急诊人次']  # predicted vs. actual emergency visits
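# A minimal sketch (not from the original): the commented-out candidates above can
# be compared on equal footing with cross-validation before committing to one.
# Assumes Ridge and Lasso are imported as in the commented-out lines.
from sklearn.model_selection import cross_val_score

candidates = {
    'RandomForest': RandomForestRegressor(n_estimators=400),
    'Ridge': Ridge(alpha=0.03),
    'Lasso': Lasso(),
}
for label, est in candidates.items():
    scores = cross_val_score(est, df_train_feature, df_train_goal, cv=5, scoring='r2')
    print(label, scores.mean())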
# COMMAND ----------

X_train, X_test, y_train, y_test = train_test_split(
    d[['SoPOR', 'SoPDel']], d['SOBD_CSEVAD_Difference'],
    test_size=0.33, random_state=42)

# COMMAND ----------

# Create the random forest regression object
regr = RandomForestRegressor(max_depth=10)

# Train the model using the training sets
regr.fit(X_train, y_train)

# COMMAND ----------

# Make predictions for the full dataset (not only the held-out test set)
y_pred = regr.predict(d[['SoPOR', 'SoPDel']])

# COMMAND ----------

import pyspark
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml.feature import *

# COMMAND ----------
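# COMMAND ----------

# A minimal sketch (not from the original notebook): scoring on the held-out test
# split gives a fairer error estimate than predicting on the full frame.
from sklearn.metrics import mean_absolute_error

test_pred = regr.predict(X_test)
print('Test MAE:', mean_absolute_error(y_test, test_pred))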
#----------------------------------------------------------------------------------------------------
# Array creation for med1: sliding 10-step windows as features, next value as target
x1 = list()
y1 = list()
for i in range(len(med1) - 10):
    x = np.array(med1.loc[i:i + 9])
    y = np.array(med1.loc[i + 10])
    x1.append(x)
    y1.append(y)
x1 = np.array(x1).reshape(-1, 10)  # -1 infers the row count instead of hard-coding 292
y1 = np.array(y1).reshape(-1)

# Random forest, ~85% score (note: score() is computed on the training data,
# so this is an in-sample R^2, not a held-out estimate)
reg = RandomForestRegressor(max_depth=10, random_state=10)
reg.fit(x1, y1)
print(reg.score(x1, y1))
p = reg.predict(x1)

# Save model1
filename = 'model1.sav'
pickle.dump(reg, open(filename, 'wb'))
model1 = pickle.load(open(filename, 'rb'))

#----------------------------------------------------------------------------------------------------
# Array creation for med2
x2 = list()
y2 = list()
for i in range(len(med2) - 10):
    x = np.array(med2.loc[i:i + 9])
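# A minimal sketch (not from the original): the per-series window loops above can
# be factored into one helper so med1, med2, ... are processed identically.
import numpy as np

def make_windows(series, window=10):
    """Turn a 1-D series into (n, window) feature rows and next-step targets."""
    values = np.asarray(series)
    X = np.array([values[i:i + window] for i in range(len(values) - window)])
    y = values[window:]
    return X, y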