def main():
    """Rank micro-cap symbols by random-forest prediction and save the ranking.

    Reads one ticker symbol per line from ``25-75_microcap_list.txt``, builds
    the train/test frames with ``get_data``, drops every row containing an
    infinite or missing value, fits a ``RandomForestRegressor`` and writes the
    first four test columns plus a ``Prediction`` column, sorted by descending
    prediction, to ``trade_result.csv``.
    """
    # Context manager so the file handle is released even if reading fails.
    with open('25-75_microcap_list.txt', 'r') as fi:
        symbols = [line.strip() for line in fi]
    #symbols = symbols[0:6]

    train, test = get_data(symbols, n = 30, flag = 1, blag = 12)

    # Infinities are mapped to NaN first so that a single dropna removes both.
    train = train.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    test = test.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=250, n_jobs=1)
    # Column 5 is the target; columns 6 and onwards are the features.
    m.fit(train.ix[:,6:], train.ix[:,5])

    print('Predicting\n')
    # NOTE(review): test features start one column earlier than in train;
    # presumably the test frame has no target column -- confirm in get_data.
    preds = m.predict(test.ix[:,5:])

    # Copy the slice so adding a column does not write into a view of `test`.
    result = test.ix[:,:4].copy()
    result['Prediction'] = preds
    result = result.sort('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep = ',', index = False)
def set_missing_ages(df):
    """Fill missing ``Age`` values in *df* with random-forest predictions.

    A ``RandomForestRegressor`` is trained on the rows whose ``Age`` is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and then
    used to predict ``Age`` for the rows where it is null.

    Returns:
        ``(df, rfr)`` -- the same frame (modified in place) and the fitted
        regressor.
    """
    # Take the existing numeric features that are fed to the regressor.
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split the passengers into those with a known age and those without.
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # y is the target age, X the feature values.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Only impute when something is missing: predicting on a matrix with zero
    # rows is rejected by scikit-learn's input validation.
    if len(unknown_age) > 0:
        predictedAges = rfr.predict(unknown_age[:, 1::])
        # Write the predictions back into the originally missing cells.
        df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges

    return df, rfr
def get_preds(features, trees=3000, depth=19):
    """Train a classifier and a regressor forest and score them on one day.

    Args:
        features: number of latent features (``k``) the NMF is run with.
        trees: ``n_estimators`` for both forests.
        depth: ``max_depth`` for both forests.

    Returns:
        ``[features, classifier_accuracy, regressor_mse]`` so that callers can
        compare model quality across different numbers of latent features.
    """
    # Create dataframes
    df = get_nmf(k=features)
    df_full = add_yahoo_to_df(df)
    # NOTE(review): df_train is never used below -- both models are fit on
    # df_full. The call is kept in case add_dummies has side effects; confirm.
    df_train = add_dummies(df_full)

    df_test = get_nmf('data_wednesday', k=features)  # folder name where the json data is
    df_test_full = add_yahoo_to_df(df_test)
    df_test_full = add_dummies(df_test_full)

    # Create models
    X_model_class, y_model_class = get_classifier_data(df_full)
    rf_class = RandomForestClassifier(n_estimators=trees, max_depth=depth)
    rf_class.fit(X_model_class, y_model_class)

    # This assignment must be live code: the regressor fit below needs it.
    X_model_regress, y_model_regress = get_regressor_data(df_full)
    rf_regress = RandomForestRegressor(n_estimators=trees, max_depth=depth)
    rf_regress.fit(X_model_regress, y_model_regress)

    # Get X and y values for the evaluation day
    X_classify, y_classify = get_classifier_data(pd.DataFrame(df_test_full.ix['2016-04-11']))
    X_regress, y_regress = get_regressor_data(pd.DataFrame(df_test_full.ix['2016-04-11']))

    # Run models
    classifier_preds = rf_class.predict(X_classify)
    classifier_accuracy = accuracy_score(classifier_preds, y_classify)

    regressor_preds = rf_regress.predict(X_regress)
    regressor_mse = mean_squared_error(regressor_preds, y_regress)

    return [features, classifier_accuracy, regressor_mse]
def train_year(train_fea, trees):
    """Fit one forest per sale year and measure cross-year prediction error.

    For every distinct ``SaleYear`` a ``RandomForestRegressor`` is trained on
    that year's rows (all columns except ``SalePrice`` as features). Then, for
    every year distance ``i`` and every pair returned by ``get_pairs(years, i)``,
    the model of the first year predicts the second year's prices and the
    relative absolute error is summarised.

    Args:
        train_fea: DataFrame containing ``SaleYear``, ``SalePrice`` and the
            feature columns.
        trees: number of trees per forest.

    Returns:
        ``(rfs, errors_list)`` -- ``rfs[k]`` is the model for the k-th smallest
        year; ``errors_list[i - 1]`` is the mean of the per-pair mean relative
        errors at distance ``i``.
    """
    years = sorted(set(train_fea['SaleYear'].values))

    def _features_and_price(year):
        # Rows of one year, split into feature frame and target series.
        rows = train_fea[train_fea['SaleYear'] == year]
        return rows.drop('SalePrice', axis=1), rows['SalePrice']

    rfs = []
    for year in years:
        print('train model %d' % (year))
        rf = RandomForestRegressor(n_estimators=trees, n_jobs=1, compute_importances = True)
        year_fea, year_price = _features_and_price(year)
        rf.fit(year_fea, year_price)
        rfs.append(rf)

    # Collect one summary row per pair and concatenate once at the end.
    summaries = []
    for i in range(1, len(years)):
        for p in get_pairs(years, i):
            print('compare %d, %d' % (p[0], p[1]))
            y2_fea, y2_price = _features_and_price(p[1])
            rf = rfs[years.index(p[0])]
            y2_p = rf.predict(y2_fea)
            y2_r = y2_price.values
            # Relative absolute error per sale, vectorised.
            error_rates = np.abs(y2_p - y2_r) / y2_r
            summaries.append(pd.DataFrame({'dist':i,
                                           'mean':error_rates.mean(),
                                           'var':error_rates.var(),
                                           'std':error_rates.std()}, index=[i]))
    errors = pd.concat(summaries) if summaries else None

    errors_list = [errors.ix[i]['mean'].mean() for i in range(1, len(years))]
    return rfs, errors_list
def cross_validate(features_target):
    """Run 10-fold cross validation of a random forest and report the result.

    Args:
        features_target: indexable whose element 0 is the feature array and
            element 1 the target array.

    For every fold the forest is refitted on the training mask, the held-out
    rows are predicted, and ``(actual, predicted)`` pairs are accumulated.
    ``importance(rf)`` is called after each fold and ``combined_auc`` receives
    all pairs at the end. Nothing is returned.
    """
    features = features_target[0]
    target = features_target[1]

    rf = RandomForestRegressor(
        n_estimators=100,
        verbose=2,
        n_jobs=1,
        min_samples_split=10,
        compute_importances=True,
        random_state=1
    )
    # indices=False makes the folds boolean masks rather than index arrays.
    cv = cross_validation.KFold(len(features), n_folds=10, indices=False)

    results = []
    for fold_no, (traincv, testcv) in enumerate(cv, 1):
        print("Running fold " + str(fold_no))
        fitted = rf.fit(features[traincv], target[traincv])
        predictions = fitted.predict(features[testcv]).flatten()
        actual = target[testcv]
        results.extend((actual[j], predictions[j])
                       for j in range(len(predictions)))
        importance(rf)

    combined_auc(results)
def pred_pH(train, val, test, all_vars, loop):
    """Fit forest, lasso and ridge models for ``pH`` and write blended predictions.

    Args:
        train, val, test: DataFrames; each receives the prediction columns
            ``pH_for_prds``, ``pH_las_prds`` and ``pH_rdg_prds`` in place.
        all_vars: list of candidate feature column names.
        loop: identifier appended to the output name ``'pH_prds'``.

    The forest uses the union of the lasso-selected variables and the
    variables kept by a univariate ``SelectKBest``; lasso and ridge use all
    variables. The final combination is delegated to ``write_preds``.
    """
    data = (val, test, train)

    # variable selection
    pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1200)
    univ_selector.fit(train[all_vars], train['pH'])
    pvals = univ_selector.get_support()

    # A variable is kept when either selector flags it.
    chosen = [var for x, var in enumerate(all_vars)
              if pH_lassoed_vars[x] | pvals[x]]

    # random forest
    neigh = RandomForestRegressor(n_estimators=100)
    neigh.fit(train.ix[:, chosen], train['pH'])
    for dset in data:
        dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen])

    # lasso
    lass = Lasso(alpha=.000000275, positive=True)
    lass.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_las_prds'] = lass.predict(dset[all_vars])

    # ridge
    pH_ridge = RidgeCV(np.array([.6]), normalize=True)
    pH_ridge.fit(train[all_vars], train['pH'])
    for dset in data:
        dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars])

    # combination
    # NOTE(review): 'pH_for_prds' is listed twice; presumably this gives the
    # forest double weight inside write_preds -- confirm.
    models= [ 'pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds' ]
    name = 'pH_prds' + str(loop)
    write_preds(models, name, train, val, test, 'pH')
def pipeline():
    """Train a weighted random forest on ``data`` and dump validation output.

    Uses the module-level ``data`` frame: rows with ``watch == 1`` form the
    validation set, rows whose ``watch`` is neither 0 nor 1 form the training
    set. Each training row is weighted by ``min(a, b)``. Missing feature
    values are filled with the column median of the respective set. The
    validation predictions are written to ``val_<cost>.csv`` where ``<cost>``
    is element 1 of the value returned by ``cal_cost``.
    """
    non_feature_cols = ['label','watch','item_id','store_code','a','b']

    val = data[data.watch==1]
    # Copy: columns are added below and must not write into a view of `data`.
    val_a_b = val[['item_id','store_code','a','b']].copy()
    val_y = val.label
    val_x = val.drop(non_feature_cols,axis=1)

    train = data[(data.watch!=1)&(data.watch!=0)]
    train_y = train.label
    # Per-row weight is the smaller of a and b (vectorised element-wise min).
    train_weight = np.minimum(train.a.values, train.b.values)
    train_x = train.drop(non_feature_cols,axis=1)

    train_x.fillna(train_x.median(),inplace=True)
    val_x.fillna(val_x.median(),inplace=True)

    model = RandomForestRegressor(n_estimators=500,max_depth=5,max_features=0.6,n_jobs=-1,random_state=1024)
    #train
    model.fit(train_x,train_y, sample_weight=train_weight)

    #predict val set
    val_a_b['pred'] = model.predict(val_x)
    val_a_b['y'] = val_y
    cost = cal_cost(val_y.values,val_a_b.pred.values,val_a_b.a.values,val_a_b.b.values)
    val_a_b.to_csv('val_{0}.csv'.format(cost[1]),index=None)
def backward_best_features_per_cluster(X, Y, all_feature_metadata): best_features_per_cluster = {} for c in sorted(X['cluster'].unique()): seg_X, seg_Y = X[X['cluster'] == c], Y[Y['cluster'] == c].ALSFRS_slope print "cluster:", c, "with size:", seg_X.shape, "with mean target:", seg_Y.mean(), "std:", seg_Y.std() seg_Y = seg_Y.fillna(seg_Y.mean()) model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000).fit(seg_X, seg_Y) print "best we can do with all features:", np.sqrt(np.mean((model.predict(seg_X) - seg_Y) ** 2)) selected_fams = set(all_feature_metadata.keys()) selected_derived = set([]) for fam in selected_fams: selected_derived.update([der for der in all_feature_metadata[fam]['derived_features']]) while len(selected_fams) > 6: score_per_family = {} t1 = time.time() for family, fm in all_feature_metadata.iteritems(): if family in selected_fams: X_feature_fam = seg_X[list(selected_derived - set(fm["derived_features"]))] model = RandomForestRegressor(min_samples_leaf=60, random_state=0, n_estimators=1000).fit( X_feature_fam, seg_Y) score_per_family[family] = np.sqrt(np.mean((model.predict(X_feature_fam) - seg_Y) ** 2)) t_lasso_cv = time.time() - t1 worst_fam = sorted(score_per_family.items(), key=operator.itemgetter(1), reverse=True)[0] print "removing worst family:", worst_fam, "time:", t_lasso_cv selected_fams.remove(worst_fam[0]) selected_derived = set([]) for fam in selected_fams: selected_derived.update([der for der in all_feature_metadata[fam]['derived_features']]) best_features_per_cluster[c] = list(selected_fams) return best_features_per_cluster
def do_regression(df, j, i, k):
    """Fit a random forest predicting ``count`` from humidity and temperature.

    Only the rows with ``workingday == j``, ``Hour == i`` and
    ``Year == 2011 + k`` are used.

    Args:
        df: DataFrame with ``workingday``, ``Hour``, ``Year``, ``count``,
            ``humidity`` and ``temp`` columns.
        j: working-day flag to select.
        i: hour to select.
        k: year offset relative to 2011.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # Build the row filter once instead of re-evaluating it per column.
    selected = (df['workingday'] == j) & (df['Hour'] == i) & (df['Year'] == 2011 + k)

    y = df.loc[selected, 'count'].astype(int).values
    x_1 = df.loc[selected, 'humidity'].astype(int).values
    x_2 = df.loc[selected, 'temp'].astype(int).values
    # Materialise the pairs: a bare zip is a one-shot iterator on Python 3.
    x = list(zip(x_1, x_2))

    ## Create linear regression object
    #regr = linear_model.LinearRegression()

    # create random forest object
    regr = RandomForestRegressor(n_estimators= 100)
    #forest = DecisionTreeRegressor(max_depth = 4)

    ## Train the model using the training sets
    regr.fit(x, y)
    return regr
def fill_missing_age(df):
    """Fill missing ``Age`` values in *df* with random-forest predictions.

    A ``RandomForestRegressor`` is trained on the rows with a known ``Age``
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and is then
    used to predict ``Age`` where it is null.

    Returns:
        ``(df, RFR)`` -- the same frame (modified in place) and the fitted
        regressor.
    """
    # Take the existing numeric features that are fed to the regressor.
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]

    # Split the passengers into known-age and unknown-age parts.
    known_age = age_df[age_df.Age.notnull()].as_matrix()
    unknown_age = age_df[age_df.Age.isnull()].as_matrix()

    # Target age
    y=known_age[:,0]
    # Feature values
    x=known_age[:,1:]

    RFR=RandomForestRegressor(random_state=0,n_estimators=2000,n_jobs=-1)
    RFR.fit(x,y)

    # Only impute when something is missing: predicting on a matrix with zero
    # rows is rejected by scikit-learn's input validation.
    if len(unknown_age) > 0:
        predictedAge= RFR.predict(unknown_age[:,1::])
        # Fill the originally missing cells with the predictions.
        df.loc[(df.Age.isnull()),'Age']=predictedAge

    return df,RFR
def RFscore_one(x,y,id):
    """Return a normalised 3-fold cross-validated forest score of y given x.

    Both arrays are shuffled with the same random permutation, standardised,
    and reshaped to column vectors. The mean of the per-fold ``score`` values
    is divided by ``dist(y)`` and returned.

    Args:
        x: 1-D numpy array used as the single feature.
        y: 1-D numpy array used as the target.
        id: label printed in the progress message.
    """
    folds=3
    print("RFscore " + id)

    # One permutation for both arrays keeps the (x, y) pairs aligned.
    order = np.random.permutation(len(x))
    x = x[order]
    y = y[order]

    x = (x - np.mean(x)) / np.std(x)
    y = (y - np.mean(y)) / np.std(y)

    # Column vectors: one sample per row.
    x = np.array(x, ndmin=2).T
    y = np.array(y, ndmin=2).T

    rf = RandomForestRegressor(n_estimators=50, verbose=0,n_jobs=1,min_samples_split=10,compute_importances=True,random_state=1)

    cv = cross_validation.KFold(len(x), n_folds=folds, indices=False)
    score = 0
    median = dist(y)
    for traincv, testcv in cv:
        fit = rf.fit(x[traincv], y[traincv])
        score += fit.score(x[testcv], y[testcv])

    score /= folds
    score /= median
    return score
def cross_val(seq, ft):
    """Cross-validate forests of several sizes and collect their R^2 scores.

    Args:
        seq, ft: passed through to ``load_train_data`` and copied into every
            result row.

    Returns:
        A list with one row per forest size (30, 50, 70, 90 trees):
        ``[seq, ft, n_samples, n_estimators, mean_score, 2 * std_score]``.
    """
    n_folds = 10
    X, y = load_train_data(seq, ft)

    print('%d-fold cross validation. Dataset: %d samples, %d features' % (n_folds, X.shape[0], X.shape[1]))

    kf = KFold(len(y), n_folds=n_folds)
    n_jobs = mp.cpu_count()  # loop-invariant, query once

    results = []
    for n_estimators in range(30, 110, 20):
        scores = []
        for train, test in kf:
            rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=n_jobs)
            rf.fit(X[train], y[train])
            # Regressor.score is the R^2 coefficient of determination, i.e.
            # how much of the variance in y the model explains.
            # https://www.khanacademy.org/math/probability/regression/regression-correlation/v/r-squared-or-coefficient-of-determination
            scores.append(rf.score(X[test], y[test]))
        scores = np.array(scores)

        print("n_estimators=%d; accuracy (R^2 score): %0.2f (+/- %0.2f)" % (n_estimators, scores.mean(), scores.std() * 2))
        results.append([seq, ft, X.shape[0], n_estimators, scores.mean(), scores.std()*2])

    return results
def fit(self, X, y, **kwargs):
    """Fit a fresh ``RandomForestRegressor`` and store it on ``self.model``.

    Keyword arguments whose name is already a key of ``self.INITPARAMS``
    overwrite that entry (the dict is updated in place, so the override
    persists); unknown keywords are ignored. ``self.model`` is only assigned
    after fitting succeeded.
    """
    init_params = self.INITPARAMS
    for name, value in kwargs.iteritems():
        if name in init_params:
            init_params[name] = value

    forest = RandomForestRegressor(**init_params)
    forest.fit(X, y)
    self.model = forest
def _add_time_features(frame):
    """Add ``hour``, ``weekday`` and ``isweekend`` columns derived from ``datetime``."""
    stamps = pd.DatetimeIndex(frame['datetime'])
    frame['hour'] = stamps.hour
    frame['weekday'] = stamps.weekday
    frame['isweekend'] = 0
    # weekday 5 and 6 are Saturday and Sunday.
    frame.loc[(frame['weekday']==5) | (frame['weekday']==6), 'isweekend'] = 1
    return frame


def main():
    """Train one forest per hour of day and write the submission file.

    Reads ``../train.csv`` and ``../test.csv``, derives the time features,
    fits a separate ``RandomForestRegressor`` for every hour present in the
    test set (features come from ``get_features``), and writes the integer
    predictions sorted by ``datetime`` to
    ``../submissions/seventhSubmission.csv``.
    """
    train = _add_time_features(pd.read_csv('../train.csv', parse_dates=['datetime']))
    test = _add_time_features(pd.read_csv('../test.csv', parse_dates=['datetime']))

    # Collect the per-hour frames and concatenate once; concatenating inside
    # the loop copies the accumulated result on every iteration.
    pieces = [pd.DataFrame(columns=['datetime', 'count'])]

    for hour, test_subset in test.groupby(test['hour']):
        train_subset = train[train['hour'] == hour]
        model = RandomForestRegressor(n_estimators=100)
        model.fit(np.array(get_features(train_subset)), np.array(train_subset['count']))
        predictions = model.predict(np.array(get_features(test_subset)))

        dt = test_subset['datetime']
        # Re-index the predictions so they align with their timestamps.
        predictions = pd.Series(predictions, index=dt.index)
        res = pd.concat([dt, predictions], axis=1)
        res.columns=['datetime', 'count']
        pieces.append(res)

    results = pd.concat(pieces)
    results['count'] = results['count'].astype('int64')
    results = results.sort('datetime')
    results.to_csv('../submissions/seventhSubmission.csv', index=False)
def regression(X_train, y_train, X_test, y_test):
    """Train the random forest regressor from Scikit-Learn and return it.

    Args:
        X_train, y_train: training features and targets.
        X_test, y_test: accepted for interface compatibility; not used.
            The plotting and debug-metric code that consumed them sat after
            an unconditional ``return`` (and referenced an undefined
            ``feature_names``), so it never ran and has been removed.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # Random forest regressor w/ param optimization
    params = {'n_estimators':1000, 'criterion':'mse', 'max_depth':20, 'min_samples_split':1, #'estimators':400, depth:20
              'min_samples_leaf':1, 'max_features':2, 'bootstrap':True, 'oob_score':False, #'max_features':'log2'
              'n_jobs':32, 'random_state':0, 'verbose':0, 'min_density':None, 'max_leaf_nodes':None}
    if config.DEBUG:
        params['verbose'] = 1

    regr = RandomForestRegressor(**params)

    # Train the model using the training sets
    regr.fit(X_train, y_train)
    return regr
def randomforest(data, targets, num, fnum):
    """Fit and return a random forest regressor.

    Args:
        data: training features.
        targets: training targets.
        num: number of trees (``n_estimators``).
        fnum: ``max_features`` considered per split.

    (Original note kept verbatim: "7:1205".)
    """
    forest_options = dict(
        n_estimators=num,
        verbose=0,
        oob_score=True,
        compute_importances=True,
        n_jobs=10,
        criterion="mse",
        max_features=fnum,
    )
    model = RandomForestRegressor(**forest_options)
    model.fit(data, targets)
    return model
class RandomForestModel(Model): """ random forest model """ def __init__(self, *argv, **args): super(RandomForestModel, self).__init__(*argv) self.rf = RandomForestRegressor(**args) def pretreat_feature(self): # pre-handle about the feature data pass def train(self): # train the samples self.rf.fit(self.x, self.y) def assess(self): # assess the regression model error = 0.0 for j in range(len(self.test_x)): pre_val = self.predict(self.test_x[j]) error += (pre_val - self.test_y[j]) ** 2 print 'Training Error: ', error def predict(self, x): # predic the output of the x return self.rf.predict(x) def validate(self): # use cross-validation to choose the best meta-parameter pass
def buildForest(self, X_train, y_train):
    """Fit and return a 100-tree random forest with out-of-bag scoring.

    Each split considers ``sqrt(n_features)`` features and trees are limited
    to depth 25.
    """
    NUM_TREES = 100
    NUM_JOBS = 1
    FEATURES_IN_EACH_TREE = "sqrt"
    rf = RandomForestRegressor(n_estimators=NUM_TREES, verbose=1, n_jobs=NUM_JOBS,
                               max_features=FEATURES_IN_EACH_TREE, oob_score=True, max_depth=25)
    # Plain fit: the transformed matrix that fit_transform also computed was
    # discarded, and forests no longer provide fit_transform in current
    # scikit-learn releases.
    rf.fit(X_train, y_train)
    return rf
def main():
    """Rank symbols by random-forest prediction and save the ranking.

    Reads one ticker symbol per line from ``45-165caps.txt``, builds the
    train/test frames with ``build_data``, drops every row containing an
    infinite or missing value, fits a ``RandomForestRegressor`` and writes the
    first four test columns plus a ``Prediction`` column, sorted by descending
    prediction, to ``trade_result.csv``.
    """
    # Context manager so the file handle is released even if reading fails.
    with open('45-165caps.txt', 'r') as fi:
        symbols = [line.strip() for line in fi]
    #symbols = symbols[0:6]

    train, test = build_data(symbols, n = 200, flag = 1, blag = 20)

    # Infinities are mapped to NaN first so that a single dropna removes both.
    train = train.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
    test = test.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    #print train.head().T
    #print test.head().T

    print('Fitting\n')
    m = RandomForestRegressor(n_estimators=500, n_jobs=10)
    # Column 4 is the target; columns 5 and onwards are the features.
    m.fit(train.ix[:,5:], train.ix[:,4])

    print('Predicting\n')
    # NOTE(review): test features start one column earlier than in train;
    # presumably the test frame has no target column -- confirm in build_data.
    preds = m.predict(test.ix[:,4:])

    # Copy the slice so adding a column does not write into a view of `test`.
    result = test.ix[:,:4].copy()
    result['Prediction'] = preds
    result = result.sort('Prediction', ascending=False)
    print(result.head())
    result.to_csv('trade_result.csv', sep = ',', index = False)
def _create_random_forest(self, current_param=None):
    """Fit a random forest on ``self.Xtr`` / ``self.Ytr``.

    Args:
        current_param: optional dict of estimator parameters that override
            the entries of ``self.params`` for this call only.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # None instead of a shared mutable ``{}`` default.
    overrides = {} if current_param is None else current_param
    combined_param = dict(self.params, **overrides)
    clf = RandomForestRegressor()
    clf.set_params(**combined_param)
    clf = clf.fit(self.Xtr, self.Ytr)
    return clf
def rf_regressor(self):
    """Fit a random forest on an 80/20 split and return its test R^2 score.

    Reads ``X`` (sparse matrix) and ``y`` from the enclosing/module scope and
    returns the score rounded to two decimals.

    NOTE(review): ``X`` and ``y`` are not attributes of ``self`` here;
    presumably they are module-level globals -- confirm, or switch to
    ``self.X`` / ``self.y`` if that was the intent.
    """
    # Bind the dense copy to a new name: assigning to ``X`` itself would make
    # ``X`` local to this function and the read on the right-hand side would
    # raise UnboundLocalError.
    dense_X = X.toarray()  # Convert X from sparse to array
    X_train, X_test, y_train, y_test = train_test_split(dense_X, y, test_size=.2)
    model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test).round(2)
def do_rf(filename):
    """Fit a forest on the merged dataset and predict its first 200 rows.

    The dataset comes from ``create_merged_dataset(filename)``; the ``driver``
    and ``trip`` columns are excluded from the features.

    Returns:
        DataFrame with ``driver``, ``trip`` and ``probs`` for the first 200
        rows.
    """
    df, Y = create_merged_dataset(filename)
    rf = RandomForestRegressor(n_estimators=100)
    X = df.drop(['driver', 'trip'], 1)
    rf.fit(X, Y)

    head_predictions = rf.predict(X[:200])
    report = {
        'driver': df['driver'][:200],
        'trip': df['trip'][:200],
        'probs': head_predictions,
    }
    return pd.DataFrame(report)
def random_learning(labels, train, test):
    """Predict *test* targets with a forest trained on log-transformed labels.

    The labels are transformed with ``log1p`` before fitting and the
    predictions are mapped back with ``expm1``.
    """
    log_labels = np.log1p(labels)
    forest = RandomForestRegressor(n_estimators=50, n_jobs=3)
    fitted = forest.fit(train, log_labels)
    log_predictions = fitted.predict(test)
    return np.expm1(log_predictions)
def main():
    """Cross-validate a random forest on the ACT12 training set.

    Reads the training CSV via ``read_data``, takes the first element of each
    row as target, restricts the columns with ``filter_cols`` and runs a
    5-fold cross validation, printing the held-out score of each fold and the
    mean over all folds.
    """
    # read in data, parse into training and target sets
    cols, train = read_data("../TrainingSet/ACT12_competition_training.csv", 1)
    target = np.array([x[0] for x in train])

    train = filter_cols(train, cols, "../selected/selected_12.txt")
    # print("Train: ", len(train), " cols:", len(train[0]))
    train = np.array(train)

    # In this case we'll use a random forest, but this could be any regressor
    cfr = RandomForestRegressor(n_estimators=500, max_features=(len(train[0]) // 3), n_jobs=8)

    # Simple K-Fold cross validation. 5 folds.
    cv = cross_validation.KFold(len(train), k=5, indices=False)

    # iterate through the training and test cross validation segments and
    # run the regressor on each one, aggregating the results into a list
    results = []
    for traincv, testcv in cv:
        ft = cfr.fit(train[traincv], target[traincv])
        # Predict and score on the held-out rows; evaluating on the rows the
        # model was fitted on would only report the training fit.
        pred = ft.predict(train[testcv])
        print(pred[:10])
        score = ft.score(train[testcv], target[testcv])
        results.append(score)
        print("\tFold %d: %f" % (len(results), score))

    # print out the mean of the cross-validated results
    print("Results: " + str(np.array(results).mean()))
def train_with_features(self, features):
    """Fit a default random forest on the given feature subset.

    The training matrix is ``self.data_folder.truncate(self.A, features)`` and
    the target is ``self.target``. Returns the fitted regressor.
    """
    reduced = self.data_folder.truncate(self.A, features)
    forest = RandomForestRegressor()
    forest.fit(reduced, self.target)
    return forest
def test_rrf_vs_sklearn_reg(self):
    """Compare the R forest wrapper with sklearn on the Boston housing data.

    Both models use 100 trees, no bootstrap/replacement and the same number
    of features per split; their test MSEs must agree to zero decimals.
    """
    from sklearn.datasets import load_boston
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor

    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, test_size=0.2, random_state=13)

    n_samples, n_features = X_train.shape
    mtry = int(np.floor(0.3 * n_features))

    # R side. A dict is required because 'corr.bias' is not a valid keyword.
    r_params = {
        'ntree': 100,
        'nodesize': 1,
        'replace': 0,
        'mtry': mtry,
        'corr.bias': False,
        'sampsize': n_samples,
        'random_state': 1234,
    }
    r_rf = RRFEstimatorR(**r_params)
    r_rf.fit(X_train, y_train)
    r_mse = mean_squared_error(y_test, r_rf.predict(X_test))

    # sklearn side with the matching configuration.
    p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1,
                                 bootstrap=False, max_features=mtry,
                                 random_state=1)
    p_rf.fit(X_train, y_train)
    p_mse = mean_squared_error(y_test, p_rf.predict(X_test))

    print('%.4f vs %.4f' % (r_mse, p_mse))
    # should be roughly the same (7.6 vs. 7.2)
    np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
def refit_from_scratch(self):
    """Create a new model directly from the database.

    Does not rely on the model saved from last time. A ``ThreadedFit`` is
    started in the background (its ``signal_finished`` is connected to
    ``self.__init__``) while a quick model is fitted here on up to 250
    database entries that have both feedback and tags.

    Sets ``self.allX``, ``self.pca``, ``self.model`` and ``self.enc``.
    """
    # In the background fit a much larger random forest.
    self.threaded_fit = ThreadedFit()
    self.threaded_fit.signal_finished.connect(self.__init__)
    self.threaded_fit.start()

    temp_model = RandomForest(max_features="sqrt", n_jobs=-1)
    temp_enc = CountVectorizer()

    tag_docs = []      # one space-joined tag string per entry
    numeric_rows = []  # additional numerical data
    targets = []       # target (to predict) values

    db_size = self.db.size()  # value unused; call kept as in the original
    for record in self.db.yield_some(250):
        feedback = record["feedback"]
        tags = record[ "tags" ]
        if not (feedback and tags):
            continue
        targets.append( feedback )
        tag_docs.append(" ".join(tags))
        numeric_rows.append(self.fmt_numerical(record))

    X = temp_enc.fit_transform(tag_docs)
    X = hstack((X, coo_matrix(numeric_rows)))
    self.allX = X

    # Reduce to at most 200 components (bounded by the number of rows).
    pca = PCA(min(X.shape[0], 200))
    reduced_X = pca.fit_transform(X.todense())
    temp_model.fit(reduced_X, targets)

    self.pca = pca
    self.model = temp_model
    self.enc = temp_enc
def get_kernel(train_data, test_data, label):
    """Estimate a random-forest kernel over the train and test samples.

    A 1000-tree forest is fitted on the training data, then
    ``get_partial_kernel`` is evaluated ``M = 100`` times on the concatenated
    train+test samples and the results are averaged.

    Returns:
        Array of shape ``(n_train + n_test, n_train + n_test)``.
    """
    #Define forest (n_estimators = number of trees)
    forest = RandomForestRegressor(n_estimators=1000, warm_start = True)
    forest = forest.fit(train_data, label)

    dataset = np.concatenate((train_data, test_data), axis=0)
    SAMPLE_SIZE = len(dataset)
    M = 100

    # Accumulate a running sum instead of storing all M partial kernels:
    # this needs one N x N buffer rather than M of them.
    kernel_sum = np.zeros([SAMPLE_SIZE, SAMPLE_SIZE])
    for m in range(M):
        print("Building partial kernel: {}".format(m))
        kernel_sum += get_partial_kernel(forest, dataset)

    #Average the samples to compute the kernel
    return kernel_sum / M
def round2(X, y): # Set parameters min_score = {} for tree in [50, 100, 200, 500]: for feature in ['auto', 'log2']: model = RandomForestRegressor(n_estimators=tree, max_features=feature) n = len(y) # Perform 5-fold cross validation scores = [] kf = KFold(n, n_folds=5, shuffle=True) # Calculate root mean squared error for train/test for each fold for train_idx, test_idx in kf: X_train, X_test = X[train_idx], X[test_idx] y_train, y_test = y[train_idx], y[test_idx] model.fit(X_train, y_train) prediction = model.predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, prediction)) scores.append(rmse) if len(min_score) == 0: min_score['estimator'] = tree min_score['max_feature'] = feature min_score['scores'] = scores else: if np.mean(scores) < np.mean(min_score['scores']): min_score['estimator'] = tree min_score['max_feature'] = feature min_score['scores'] = scores print "Estimator:", tree print "Max Features:", feature print scores print np.mean(scores) return min_score
def build_random_forest_regressor(X_test, X_train_full, y_train_full):
    """Fit an 800-tree forest on the full training set and predict *X_test*.

    Returns the array of predictions for ``X_test``.
    """
    print("Building random forest regressor...")
    forest = RandomForestRegressor(n_estimators=800)
    forest.fit(X_train_full, y_train_full)
    return forest.predict(X_test)
""" from sklearn.preprocessing import StandardScaler sc_x = StandardScaler() X = sc_x.fit_transform(X) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, shuffle=True) random_forest = RandomForestRegressor(n_estimators=100) random_forest.fit(X_train, y_train) rand_pred = random_forest.predict(X_test) print('train score for random_forest:', random_forest.score(X_train, y_train)) print('test score for random_forest:', random_forest.score(X_test, y_test)) y_pred = random_forest.predict(X_test) from sklearn.model_selection import cross_val_score clf = RandomForestRegressor() scores = cross_val_score(clf, X_test, y_test, cv=5) scores.mean() #mse in $ mse = mean_absolute_error(y_test, y_pred)
imp_features_model.feature_importances_

# Plot the five largest feature importances as a horizontal bar chart
# for easier inspection.
feat_importances = pd.Series(imp_features_model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')

# Hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.ensemble import RandomForestRegressor
RFRModel = RandomForestRegressor()

# Hyper-parameter grid for the randomized search.
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 12 evenly spaced values from 100 to 1200
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
VotingRegressor) from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from mlprodict.onnxrt import OnnxInference from onnxruntime import InferenceSession from skl2onnx import to_onnx from skl2onnx.tutorial import measure_time N = 11000 X, y = make_regression(N, n_features=10) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.01) print("Train shape", X_train.shape) print("Test shape", X_test.shape) reg1 = GradientBoostingRegressor(random_state=1) reg2 = RandomForestRegressor(random_state=1) reg3 = LinearRegression() ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)]) ereg.fit(X_train, y_train) ################################# # Measure the processing time # +++++++++++++++++++++++++++ # # We use function :func:`skl2onnx.tutorial.measure_time`. # The page about `assume_finite <https://scikit-learn.org/ # stable/modules/generated/sklearn.config_context.html>`_ # may be useful if you need to optimize the prediction. # We measure the processing time per observation whether # or not an observation belongs to a batch or is a single one.
# Last column is the target, everything before it is a feature.
orig_train_X = orig_train[:,:-1]
orig_train_y = orig_train[:,-1]
test_X = test[:,:-1]
test_y = test[:,-1]
print('--------')

# Hold out 20% of the original training rows for validation.
train, val = train_test_split(orig_train, test_size = 0.2)
print('train shape : ',train.shape, 'val shape : ',val.shape)

# Use the split that was just made: fit on `train`, validate on `val`.
# Fitting on all of orig_train and validating on `test` would select the
# model using the test set.
train_X = train[:,:-1]
train_y = train[:,-1]
val_y = val[:,-1]
val_X = val[:,:-1]

# build model
model_lr = LinearRegression()
model_svr = SVR()
model_rfr = RandomForestRegressor()

# train model
model_lr = model_lr.fit(train_X, train_y)
model_svr = model_svr.fit(train_X, train_y)
model_rfr = model_rfr.fit(train_X, train_y)
# exit(1)

# validate model
predict_lr = model_lr.predict(val_X)
predict_svr = model_svr.predict(val_X)
predict_rfr = model_rfr.predict(val_X)

# model selection based on validation MSE score
print('LR val MSE score : ', mean_squared_error(val_y, predict_lr),
      'SVR val MSE score : ', mean_squared_error(val_y, predict_svr),
      'RFR val MSE score : ', mean_squared_error(val_y, predict_rfr))
# Target is the last column of the dataset.
y = crop_pred_dataset.iloc[:, -1].values

# One-hot encode the categorical column at index 3; pass the rest through.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder = 'passthrough')
x = ct.fit_transform(x).toarray()

# Split into training and testing sets (80/20, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Regression model
# regressor = LinearRegression()
# regressor.fit(x_train, y_train)
# print(regressor.score(x_test,y_test ))
randomregressor = RandomForestRegressor(n_estimators=10, random_state=0)
randomregressor.fit(x_train, y_train)
print(randomregressor.score(x_test,y_test)*100)

# Every crop starts with a value of 0.0; `lis` keeps the crop names in
# this order.
dic = dict.fromkeys([
    'Bajra', 'Banana', 'Barley', 'Bean', 'Black pepper', 'Blackgram',
    'Bottle Gourd', 'Brinjal', 'Cabbage', 'Cardamom', 'Carrot',
    'Castor seed', 'Cauliflower', 'Chillies', 'Colocosia', 'Coriander',
    'Cotton', 'Cowpea', 'Drum Stick', 'Garlic', 'Ginger', 'Gram', 'Grapes',
    'Groundnut', 'Gaur seed', 'Horse-gram', 'Jowar', 'Jute', 'Khesari',
    'Lady Finger', 'Lentil', 'Linseed', 'Maize', 'Mesta', 'Moong', 'Moth',
    'Onion', 'Orange', 'Papaya', 'Peas', 'Pineapple', 'Potato', 'Raddish',
    'Ragi', 'Rice', 'Safflower', 'Sannhamp', 'Sesamum', 'Soyabean',
    'Sugarcane', 'Sunflower', 'Sweet potato', 'Tapioca', 'Tomato',
    'Turmeric', 'Urad', 'Varagu', 'Wheat',
], 0.0)
lis=list(dic)
print('Enter City Name:')
# Split the data into training and testing sets (67/33, fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(Final_PUBG, target, test_size=0.33, random_state=0)

# Standardise: statistics are learned on the training set only and then
# applied to the test set.
STD = StandardScaler()
X_train = STD.fit_transform(X_train)
X_test = STD.transform(X_test)

# Train the model with the chosen hyper-parameters.
bp = {
    'criterion': 'mse',
    'max_depth': 10,
    'min_samples_leaf': 3,
    'min_samples_split': 3,
    'n_estimators': 60
}
forest = RandomForestRegressor(verbose=3, n_jobs=2, **bp)
forest.fit(X_train, Y_train)

Y_pred = forest.predict(X_test)

# Explained variance score: 1 is perfect prediction
test_score = forest.score(X_test, Y_test)
print('Score: %.2f' % test_score)
print(mean_absolute_error(Y_test, Y_pred))
# Split combined_df into train and test sets (80/20), then reset their indices.
train, test = train_test_split(combined_df, test_size=0.2)
for part in (train, test):
    part.reset_index(inplace = True, drop = True)

# Build the feature matrices and target vectors as numpy arrays.
target_col = 'Next Year Stock Return'
y_train = np.asarray(train[target_col])
X_train = np.asarray(train.drop(columns = [target_col]))
y_test = np.asarray(test[target_col])
X_test = np.asarray(test.drop(columns = [target_col]))

# Random forest regression model
model = RandomForestRegressor(random_state = 0)
model.fit(X_train, y_train)

# Predicted vs. actual on the training set.
y_train_predict = model.predict(X_train)
plt.scatter(y_train, y_train_predict)
plt.show()

# MSE of the training set
train_residuals = np.subtract(y_train, y_train_predict)
mse_train = np.mean(np.square(train_residuals))
print("Training set Mean Squared Error: {}".format(mse_train))

# Predicted vs. actual on the test set.
y_test_predict = model.predict(X_test)
plt.scatter(y_test, y_test_predict)
plt.show()
r2_store, mse_store, mcc_store, f1_store = [], [], [], [] # Empty lists for storing results mse_bins_store = [] # Monte Carlo cross validation (MCCV) loop for rrr in range(50): # Resample validation set (uniform distribution) train_indices, test_indices = resreg.uniform_test_split(X, y, bins=bins, bin_test_size=70, verbose=False, random_state=rrr) X_train, y_train = X[train_indices,:], y[train_indices] X_test, y_test = X[test_indices,:], y[test_indices] # Unpack hyperparameters, resample training data, and fit regressors reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \ RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr) if strategy=='RO': cl, ch, sample_method = param relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch) X_train, y_train = resreg.random_oversample(X_train, y_train, relevance, relevance_threshold=0.5, over=sample_method, random_state=rrr) reg.fit(X_train, y_train) elif strategy=='SMOTER': cl, ch, sample_method, k = param relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch) X_train, y_train = resreg.smoter(X_train, y_train, relevance, relevance_threshold=0.5, k=k, over=sample_method, random_state=rrr)
'C:/Users/wybek/Documents/school/Master/Information Retrieval/project2/data/STR_features.csv', index_col=0) #Features from our data #features_file = pd.read_csv('./data/Our_LTR.csv', index_col=0) #features_file = pd.read_csv('./data/Our_STR.csv', index_col=0) features_file = features_file.set_index('table_id') #load the qrels dictionary with open( 'C:/Users/wybek/Documents/school/Master/Information Retrieval/project2/data/qrels_dict.pickle', 'rb') as handle: qrels = pickle.load(handle) regression_model = RandomForestRegressor(n_estimators=1000, max_leaf_nodes=4) #get a list from 1 to 60 which will be split up for k fold cross validation queries = features_file['query_id'].drop_duplicates() queries = queries.tolist() # First count the number of relevant tables in corpus for each query. num_of_relevant_tables = {} for query_index, query_string in enumerate(queries): qrels_one_query = qrels[query_index + 1] num_of_relevant_tables[query_string] = 0 for t in qrels_one_query: if (qrels_one_query[t] > 0): num_of_relevant_tables[query_string] += 1 kf = KFold(n_splits=5, random_state=2, shuffle=True)
def main():
    """Fit and compare four regressors on the data set stored in ``FILE``.

    Column 0 of the CSV is the target and columns ``1:-2`` are the continuous
    covariates; the ``soil`` and ``landuse`` columns are one-hot encoded and
    appended to them. A Gaussian process, a random forest, an SVR and a
    Bayesian ridge model are each fitted on the training split and scored on
    the held-out split; R2, MSE, RMSE, explained variance and (GP only) the
    negative log probability are printed.

    All configuration comes from module-level constants (``FILE``, ``SEED``,
    ``TEST_PROPORTION``, ``SUBSET``, ``SUBSET_PROPORTION``, ``LENSCALE``,
    ``SCALAR``, ``SPARSE_GP``, ``FIX_INDUCING``, ``N_INDUCING``).
    """
    df = pd.read_csv(FILE)

    # Target and continuous covariates
    y, X = df.iloc[:, 0].values, df.iloc[:, 1:-2].values
    Dcon = X.shape[1]

    # One-hot encode the categorical columns
    soil = pd.get_dummies(df.soil)
    landuse = pd.get_dummies(df.landuse, drop_first=True)  # Binary
    Dcat = soil.shape[1] + landuse.shape[1]

    # Stack continuous and categorical features, then split
    X = np.hstack((X, soil, landuse))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_PROPORTION, random_state=SEED)
    N, D = X_train.shape

    # NOTE: optional sub-sampling of the training set
    if SUBSET:
        rand = np.random.RandomState(SEED)
        inds = rand.choice(N, int(N * SUBSET_PROPORTION), replace=False)
        X_train, y_train = X_train[inds], y_train[inds]

    # Shared scaler instance, re-fitted inside each pipeline below
    ss = StandardScaler()

    # Kernel: RBF over the continuous dims + linear over the one-hot dims
    # + white noise.  ARD must be the logical negation of SCALAR: the bitwise
    # ``~`` on a Python bool yields -2 / -1, which are both truthy and would
    # switch ARD on unconditionally.
    use_ard = not SCALAR
    lenscale = LENSCALE if SCALAR else LENSCALE * np.ones(Dcon)
    kern = (
        RBF(Dcon, lengthscales=lenscale, active_dims=np.arange(Dcon),
            ARD=use_ard)
        + Linear(Dcat, active_dims=np.arange(Dcon, Dcon + Dcat), ARD=use_ard)
        + White(D)
    )

    # GP
    if SPARSE_GP:
        gp = SparseGP(kern, fix_inducing=FIX_INDUCING, n_inducing=N_INDUCING)
    else:
        gp = GP(kern)

    # Random Forest
    rf = RandomForestRegressor(n_estimators=10, random_state=SEED)

    # SVM
    # http://scikit-learn.org/stable/modules/svm.html
    clf = SVR(C=1.0, epsilon=0.2, kernel='rbf')

    # Linear model
    br = BayesianRidge()

    models = {'GP': gp, 'RandomForest': rf, 'SVM': clf, 'BayesianRidge': br}

    for name, mod in models.items():
        print("Fitting {}...".format(name))

        # The forest is fitted on raw features; everything else is scaled first
        if name == 'RandomForest':
            model = mod
        else:
            model = make_pipeline(ss, mod)

        # Train
        model.fit(X_train, y_train)

        # Predict (the GP additionally returns a predictive spread)
        if name == 'GP':
            Ey, Sy = model.predict(X_test)
        else:
            Ey = model.predict(X_test)

        # Validate
        r2 = r2_score(y_test, Ey)
        mse = mean_squared_error(y_test, Ey)
        rmse = np.sqrt(mse)
        evs = explained_variance_score(y_test, Ey)
        if name == 'GP':
            nlp = negative_log_proba(y_test, Ey, Sy)
        else:
            nlp = np.inf  # no predictive distribution for the other models

        print("{} Results:".format(name))
        print("R2 = {}\nMSE = {}\nRMSE = {}\nEVS = {}\nNLP = {}".format(
            r2, mse, rmse, evs, nlp))

        if name == 'GP':
            print("Kernel parameters:")
            print(gp.kernel)
# Standardise the features.  The scaler is fitted on the training data only
# and then *applied* to the test data, so both sets share the same mean and
# variance; re-fitting on the test set would scale the two sets differently.
data_train_scaled = scaler.fit_transform(data_train[valid_feature])
data_test_scaled = scaler.transform(data_test[valid_feature])

# Dimensionality reduction (currently disabled; only the estimator is built)
pca = PCA(n_components=15)
# data_train_pca = pca.fit_transform(data_train_scaled)  # reduced training data
# data_test_pca = pca.fit_transform(data_test_scaled)

# Model evaluation
print("========= Modeling =========")

# Candidate models for cross-validation
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
    RandomForestRegressor(n_estimators=400),
    GradientBoostingRegressor(),
    SVR(),
    LinearSVR(),
    ElasticNet(alpha=0.001, max_iter=10000),
    SGDRegressor(max_iter=1000, tol=1e-3),
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
    ExtraTreesRegressor(),
]
# for model in models:
#     evalUtil.model_rmse_log(model,data_train_scaled,target_train)

# Hyper-parameter tuning
# model = RandomForestRegressor()
# Start a Weights & Biases run for this experiment.
# NOTE(review): the notes mention "dataset_1, robust scaler" while the code
# below loads dataset_8 and uses StandardScaler -- confirm the run metadata.
wandb.init(
    project="Airbnb Tuning",
    name='RF 8',
    notes='dataset_1, robust scaler',
)

data = pd.read_csv(
    'C:/Users/delim/Desktop/AI in A&F Indiv Assignment/Practical Assessment/Prediction/dataset_8.csv'
)

# Features are every column except the target; hold out 30% for testing.
Y = data['price']
X = data.loc[:, data.columns != 'price']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=7)

# Standardise using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a default random forest and predict the held-out prices.
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Diagnostic plots logged to the run.
wandb.sklearn.plot_regressor(model, X_train, X_test, y_train, y_test)
wandb.sklearn.plot_outlier_candidates(model, X_train, y_train)
wandb.sklearn.plot_residuals(model, X_train, y_train)

# Held-out error metrics.
MSE = mean_squared_error(y_test, y_pred)
RMSE = sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
wandb.log({"RMSE": RMSE, "MSE": MSE, "MAE": MAE})
def _errors_in_original_units(target_scaler, y_true, y_pred):
    """Return ``(MSE, MAE)`` computed after undoing the target scaling."""
    # Present both arrays as 2-D columns, the shape the scaler was fitted on,
    # whatever shape the estimator's ``predict`` returned.
    true_orig = target_scaler.inverse_transform(
        np.asarray(y_true).reshape(-1, 1))
    pred_orig = target_scaler.inverse_transform(
        np.asarray(y_pred).reshape(-1, 1))
    return (mean_squared_error(true_orig, pred_orig),
            mean_absolute_error(true_orig, pred_orig))


def main():
    """Train a series of regressors on the Boston data and print their scores.

    The data returned by ``loadData()`` is split 75/25, features and target
    are standardised (scalers fitted on the training split), and the
    following models are fitted in order: linear regression, SGD regression,
    three SVR kernels, two k-NN variants, a decision tree, and three tree
    ensembles.  For each model the R^2 on the scaled test set and the MSE/MAE
    in original target units are printed.
    """
    boston = loadData()
    X = boston.data
    Y = boston.target
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=33)

    print('The max target value is', np.max(boston.target))
    print('The min target value is ', np.min(boston.target))
    print('The average target value is ', np.mean(boston.target))

    # Standardise features and target
    ss_X = StandardScaler()
    ss_Y = StandardScaler()
    X_train = ss_X.fit_transform(X_train)
    X_test = ss_X.transform(X_test)
    Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
    Y_test = ss_Y.transform(Y_test.reshape(-1, 1))

    separator = '-----------------------------------------------------------------------'

    def report_linear(label, model, y_pred):
        """Print the long-form report used for the two linear models."""
        print(separator)
        print('The value of default measurement of %s is ' % label,
              model.score(X_test, Y_test))
        print('The value of R-squared of %s is ' % label,
              r2_score(Y_test, y_pred))
        mse, mae = _errors_in_original_units(ss_Y, Y_test, y_pred)
        print('The mean squared error of %s is ' % label, mse)
        print('The mean absolute error of %s is ' % label, mae)

    def report(label, model, y_pred):
        """Print R^2 (scaled units) and MSE/MAE (original units)."""
        print(separator)
        print('R-square value of %s is:' % label, model.score(X_test, Y_test))
        mse, mae = _errors_in_original_units(ss_Y, Y_test, y_pred)
        print('The MSE of %s is:' % label, mse)
        print('The MAE of %s is:' % label, mae)

    # Linear regression and SGD regression
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    lr_Y_predict = lr.predict(X_test)

    sgdr = SGDRegressor()
    sgdr.fit(X_train, Y_train.ravel())
    sgdr_Y_predict = sgdr.predict(X_test)

    # The estimator's own ``score`` is printed next to ``r2_score`` so the
    # two can be compared.
    report_linear('LinerRegression', lr, lr_Y_predict)
    report_linear('SGDRession', sgdr, sgdr_Y_predict)

    # Support vector regression with three kernels
    liner_svr = SVR(kernel='linear')
    liner_svr.fit(X_train, Y_train.ravel())
    liner_svr_y_predict = liner_svr.predict(X_test)

    poly_svr = SVR(kernel='poly')
    poly_svr.fit(X_train, Y_train.ravel())
    poly_svr_y_predict = poly_svr.predict(X_test)

    rbf_svr = SVR(kernel="rbf")
    rbf_svr.fit(X_train, Y_train.ravel())
    rbf_svr_y_predict = rbf_svr.predict(X_test)

    report('linear SVR', liner_svr, liner_svr_y_predict)
    report('poly SVR', poly_svr, poly_svr_y_predict)
    report('rbf SVR', rbf_svr, rbf_svr_y_predict)

    # k-nearest neighbours: plain average vs. distance-weighted average
    uni_knr = KNeighborsRegressor(weights='uniform')
    uni_knr.fit(X_train, Y_train.ravel())
    uni_knr_y_predict = uni_knr.predict(X_test)

    dis_knr = KNeighborsRegressor(weights='distance')
    dis_knr.fit(X_train, Y_train.ravel())
    dis_knr_y_predict = dis_knr.predict(X_test)

    report('uniform-weighted KNR', uni_knr, uni_knr_y_predict)
    report('distance-weighted KNR', dis_knr, dis_knr_y_predict)

    # Single regression tree
    dtr = DecisionTreeRegressor()
    dtr.fit(X_train, Y_train.ravel())
    dtr_y_predict = dtr.predict(X_test)

    report('DecisionTreeRegressor', dtr, dtr_y_predict)

    # Three tree ensembles (fitted in this order before any is reported)
    rfr = RandomForestRegressor()
    rfr.fit(X_train, Y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train.ravel())
    etr_y_predict = etr.predict(X_test)

    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, Y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)

    report('RandomForestRegressor', rfr, rfr_y_predict)
    report('ExtraTreesRegressor', etr, etr_y_predict)

    # Feature importances, ascending.  Each (importance, name) pair is sorted
    # as a unit; ``np.sort(..., axis=0)`` on the zipped array would sort the
    # two columns independently (and as strings), detaching every importance
    # from its feature name.
    ranked = sorted(zip(etr.feature_importances_, boston.feature_names),
                    key=lambda pair: pair[0])
    print([(float(weight), str(feature)) for weight, feature in ranked])

    report('GradientBoostingRegressor', gbr, gbr_y_predict)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
import pandas as pd

# Load the Titanic data; 'Survived' is removed from X and used as the target.
X = pd.read_csv('data/titanic.csv')
y = X.pop('Survived')

# Replace every missing Age with the mean age.
X['Age'].fillna(X.Age.mean(), inplace=True)

# Keep only the columns whose dtype is not "object".
numeric_variables = list(X.dtypes[ X.dtypes != "object"].index)

# Fit a forest on those columns with out-of-bag scoring enabled.
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X[numeric_variables], y)

# Bare attribute access: the value is discarded unless this runs in an
# interactive session that echoes expressions.
model.oob_score_

# Score the out-of-bag predictions against the true labels with ROC AUC.
y_oob = model.oob_prediction_
print "c-stat: ", roc_auc_score(y, y_oob)

# Drop these three columns from the feature frame.
X.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)


def clean_cabin(x):
    """Return the first element of *x*, or the string 'None' on TypeError.

    ``x[0]`` raises TypeError for values that cannot be indexed; those are
    mapped to the literal string ``'None'``.
    """
    try:
        return x[0]
    except TypeError:
        return 'None'
accuracy_ANN = 1 - result
print("Accuracy : {}".format(accuracy_ANN))

# Training-loss curve of the neural network
epochs_hist.history.keys()
plt.plot(epochs_hist.history['loss'])
plt.title('Model Loss Progress During Training')
plt.xlabel('Epoch')
plt.ylabel('Training Loss')
# A stray trailing backslash used to join this call with the import below.
plt.legend(['Training Loss'])

# Decision tree baseline, scored on the test split
from sklearn.tree import DecisionTreeRegressor
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_model.fit(X_train, y_train)
accuracy_DecisionTree = DecisionTree_model.score(X_test, y_test)
accuracy_DecisionTree

# Random forest
from sklearn.ensemble import RandomForestRegressor
RandomForest_model = RandomForestRegressor(n_estimators=100, max_depth=10)
RandomForest_model.fit(X_train, y_train)
# ``_train`` was an undefined name; ``y_train`` is the target that matches
# ``X_train``.
# NOTE(review): this scores on the TRAINING split, unlike the decision tree
# above, which uses the test split -- confirm which one is intended.
accuracy_RandomForest = RandomForest_model.score(X_train, y_train)
accuracy_RandomForest

# TODO: read about regression metrics (mean absolute error, mean squared
# error, RMS error) -- see also the camera photos of the notes.

# NOTE(review): ``LinearREgression_model`` and ``scaler_y`` are defined outside
# this excerpt; the spelling is kept as found.
y_predict = LinearREgression_model.predict(X_test)
plt.plot(y_test, y_predict, '>', color='r')

# Same scatter in the original (unscaled) target units
y_predict_orig = scaler_y.inverse_transform(y_predict)
y_test_orig = scaler_y.inverse_transform(y_test)
plt.plot(y_test_orig, y_predict_orig, '>', color='r')

# Number of features and number of test samples
k = X_test.shape[1]
n = len(X_test)
n

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
def reconstructRF():
    """
    Reconstruct surge at tide gauges with a random forest regressor.

    For each selected predictor file in ``dir_in`` the function
    standardises the lagged predictors (adding squared and cubed wind
    terms), merges them with the observed surge of the same file name in
    ``surge_path``, reduces the overlap period with PCA (95% variance),
    trains a 50-tree random forest on it, and then predicts surge for
    every complete predictor row.  The prediction together with a 95%
    interval taken from the spread of the individual trees is written
    to ``dir_out`` as a csv named after the tide gauge.

    The function takes no arguments and returns nothing; it changes the
    working directory while it runs.
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    def pred_ints(model, X_pca_recon, percentile=95):
        """
        function to construct prediction interval
        taking into account the result of each
        regression tree
        """
        #one column per tree, one row per sample
        tree_preds = np.vstack([tree.predict(X_pca_recon)
                                for tree in model.estimators_]).T
        tail = (100 - percentile) / 2.
        err_down = np.percentile(tree_preds, tail, axis=1)
        err_up = np.percentile(tree_preds, 100 - tail, axis=1)
        return err_down, err_up

    def standardize(frame):
        """z-score every column of frame except the leading date column"""
        dat = frame.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        return pd.concat([frame['date'], dat_standardized], axis=1)

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    #positions (within os.listdir) of the tide gauges to process
    first_tg = 453
    last_tg = 454

    #looping through
    for tg in range(first_tg, last_tg):
        os.chdir(dir_in)
        #NOTE(review): os.listdir() returns entries in arbitrary order, so
        #the file picked for a given position may differ between machines
        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        is_wind = [name.startswith('wnd') for name in pred.columns]
        wndTerms = pred.columns[is_wind]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        pred_standardized = standardize(pred)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows (the first occurrence is kept)
        surge = surge[~surge['ymd'].duplicated()].reset_index(drop=True)

        #adjust surge time format to match that of pred
        surge_time = pd.DataFrame(
            [str(datetime.strptime(day, '%Y-%m-%d')) for day in surge['ymd']],
            columns=['date'])
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        pred_surge = pred_surge.dropna(axis=0, how='any')
        pred_surge = pred_surge.reset_index(drop=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            #'don''t' is two adjacent literals and printed "dont"
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue

        pred_surge['date'] = pd.DataFrame(
            [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
             for stamp in pred_surge['date']],
            columns=['date'])

        #prepare data for training/testing
        #(first column is the date, last column is the surge)
        X = pred_surge.iloc[:, 1:-1]
        surge_obs = pred_surge['surge'].reset_index(drop=True)

        #apply PCA - keep the components explaining 95% of the variance
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components

        #%%
        #surge reconstruction - every predictor row without nans
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        pred_standardized = standardize(pred_for_recon)
        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        #NOTE(review): the scaler and PCA are re-fitted on the reconstruction
        #rows, so the model is trained in one PC basis and applied in
        #another; consider reusing the transforms fitted on the training
        #period - left unchanged here because it alters the results
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1,
                                   random_state=29)
        #1-d target: a column-vector frame only triggers a conversion warning
        rf.fit(X_pca, surge_obs)

        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile=95)

        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon)

        #share (in %) of predictions that fall inside their own interval
        inside = (err_down <= truth) & (truth <= err_up)
        print(float(inside.sum())*100/len(truth), '\n')

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'],
                               pd.DataFrame([truth, err_down, err_up]).T],
                              axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',
                             'pred_int_upper', 'lon', 'lat']

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='neg_mean_squared_error')).mean()) regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)]) print('r2 = %.2f' % cross_val_score( regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean()) print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='neg_mean_squared_error')).mean()) print('=== Random Forest ===') regr = RandomForestRegressor(max_depth=2, random_state=0) print('r2 = %.2f' % cross_val_score( regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean()) print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='neg_mean_squared_error')).mean()) regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)]) print('r2 = %.2f' % cross_val_score( regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean()) print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:],
# n_iter_search = 50
# regr1 = RandomizedSearchCV(regr1, param_distributions=param_dist,
#                            n_iter=n_iter_search, cv=3,n_jobs=-1)
# regr1.fit(x_train, y_train)
# print(regr1.best_params_)
# output - {'n_estimators': 200, 'min_samples_split': 3,'max_depth=40'}

# In[5]:

from sklearn.ensemble import RandomForestRegressor

# Forest configured with the values reported by the search above.
model_rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=40,
    min_samples_split=3,
    n_jobs=-1,
)
model_rf.fit(x_train, y_train)
# import joblib
# joblib.dump(model_rf,"D:/python pycharm/Imarticus _Project/model_rf.pkl")

# Predictions on both splits.
y_pred_train_rf = model_rf.predict(x_train)
y_pred_test_rf = model_rf.predict(x_test)

from sklearn.metrics import mean_squared_error

# Root of the mean squared error on each split.
mse_train_rf = mean_squared_error(y_train, y_pred_train_rf)
mse_test_rf = mean_squared_error(y_test, y_pred_test_rf)
rmse_train_rf = np.sqrt(mse_train_rf)
rmse_test_rf = np.sqrt(mse_test_rf)

# NOTE(review): the label says RMSLE, but the value is the RMSE of the
# targets as given; that is only an RMSLE if y is already log-scaled -- confirm.
print("RMSLE value of Training Data is {a}".format(a=rmse_train_rf))
def correct_with_regression(df_load,
                            dikt_errors,
                            prefix=None,
                            prefix_plot=None,
                            bool_plot_corrections=None,
                            bool_plot_trash=None):
    """
    Learn a predictor for each bad site to predict the irrelevant values
    from the values of the other sites that do not have irrelevant values.

    The result is cached under ``prefix`` (``df_corrected_load.csv`` and
    ``trash_sites.pkl``).  If both files can be read they are returned as
    is; on any failure the correction is recomputed from ``df_load`` and
    the cache is rewritten.

    Parameters
    ----------
    df_load : pandas.DataFrame
        One column per site.
    dikt_errors : dict
        Maps an error type to a dict ``{site: [(row_position, date), ...]}``
        listing the flagged samples of each site.
    prefix : str
        Directory holding the cache files.
    prefix_plot : str
        Base directory handed to the plotting helpers.
    bool_plot_corrections : bool
        Plot the corrected values of every site that is kept.
    bool_plot_trash : bool
        Plot the sites that are discarded.

    Returns
    -------
    (pandas.DataFrame, list)
        The corrected load (discarded sites removed) and the list of
        ``(site, reason)`` tuples for the discarded sites.
    """
    print('correct_with_regression - ', end='')
    fname_load = os.path.join(prefix, 'df_corrected_load.csv')
    fname_trash = os.path.join(prefix, 'trash_sites.pkl')
    try:
        df_corrected_load = pd.read_csv(
            fname_load,
            index_col=0,
            #header = [0],
        )
        df_corrected_load.index = pd.to_datetime(df_corrected_load.index)
        # NOTE(review): pickle.load executes arbitrary code; only safe
        # because the file is this function's own cache.
        with open(fname_trash, 'rb') as f:
            trash_sites = pickle.load(f)
        print('Loaded df_corrected_load and trash_sites')
    except Exception as e:
        # Deliberately broad: any problem with the cache triggers a recompute
        print('\n{0}'.format(colored(e, 'red')))
        print('df_corrected_load not loaded')
        bad_sites = sorted(
            {site for sites in dikt_errors.values() for site in sites})
        df_corrected_load = df_load.copy()
        trash_sites = []
        # Covariates: the sites without any flagged value
        X = df_load[sorted(set(df_load.columns) - set(bad_sites))]
        assert not pd.isnull(X).sum().sum()
        for ii, site in enumerate(bad_sites):
            print('\r{0:6} / {1:6} - '.format(ii, len(bad_sites)), end='')
            y = df_load[site]
            site_samples = [
                (error_type, pos, date) for error_type in dikt_errors
                for pos, date in dikt_errors[error_type].get(site, [])
            ]
            # date -> error type (a later error type wins for a repeated date)
            flags = {date: error_type for error_type, _, date in site_samples}
            ind_unknown = sorted(pos for _, pos, _ in site_samples)
            dates_unknown = sorted(date for _, _, date in site_samples)
            # Set membership keeps the scan below linear in the series length
            unknown_positions = set(ind_unknown)
            ind_known = [
                pos for pos in range(y.shape[0])
                if pos not in unknown_positions
            ]  # Indices corresponding to sane observations
            assert not pd.isnull(y.iloc[ind_known]).sum()
            if len(ind_known) == 0:
                trash_sites.append((site, 'dates_known empty'))
                df_corrected_load = df_corrected_load.drop(site, axis=1)
                print('{0:6} -> drop because dates known empty'.format(site))
                continue
            shuffled_ind_known = ind_known.copy()
            np.random.shuffle(shuffled_ind_known)
            cut = int(0.9 * len(shuffled_ind_known))
            # Divide the sane observations into a training and a test sets
            ind_train = sorted(shuffled_ind_known[:cut])
            ind_test = sorted(shuffled_ind_known[cut:])
            # Train
            y_train = y.iloc[ind_train]
            X_train = X.iloc[ind_train]
            # Validation
            y_test = y.iloc[ind_test]
            X_test = X.iloc[ind_test]
            # Pred
            X_pred = X.iloc[ind_unknown]
            # Normalization covariates (statistics from the training rows)
            X_mean = X_train.mean(axis=0)
            X_std = X_train.std(axis=0)
            X_train = (X_train - X_mean) / X_std
            X_test = (X_test - X_mean) / X_std
            X_pred = (X_pred - X_mean) / X_std
            # Normalization target
            y_mean = y_train.mean(axis=0)
            y_std = y_train.std(axis=0)
            y_train = (y_train - y_mean) / y_std
            assert np.allclose(X_train.sum(), 0)
            assert np.allclose(y_train.sum(), 0)

            regressor = 'rf'  # 'rf' # 'xgb' # 'spams'
            # Assess the quality of a predictor from the other sane sites
            # We do not have a criterion to decide which algorithm is the
            # most appropriate and have used alternatively spams or random
            # forests.
            if regressor in ('rf', 'xgb'):
                if regressor == 'rf':
                    model = RandomForestRegressor()
                else:
                    model = xgb.XGBRegressor()
                model.fit(X_train, y_train)
                y_hat_train = model.predict(X_train)
                y_hat_test = model.predict(X_test)
                y_hat_pred = model.predict(X_pred)
            elif regressor == 'spams':
                hprm = {
                    'loss': 'square',
                    'numThreads': -1,
                    'verbose': False,
                    'lambda1': 0.03 * X_train.shape[0],
                    'lambda2': 0.1,  # For elastic_net
                    'it0': 10,  # nb_iter between two dual gap computations
                    'max_it': int(
                        1e4
                    ),  # (optional, maximum number of iterations, 100 by default)
                    'L0': 0.1,  # (optional, initial parameter L in fista, 0.1 by default, should be small enough)
                    'regul': 'l2',
                    'tol': 1e-4,
                    'intercept': False,  #(optional, do not regularize last row of W, false by default)
                    'compute_gram': True,
                    'return_optim_info': True
                }
                beta0 = np.zeros(
                    (X_train.shape[1], 1),
                    dtype=np.float64,
                    order="F",
                )
                beta_cen, optim_info = spams.fistaFlat(
                    np.asfortranarray(y_train, dtype=np.float64).reshape(
                        (-1, 1)),
                    np.asfortranarray(X_train, dtype=np.float64),
                    beta0,
                    **hprm,
                )
                beta = beta_cen[:, 0]
                y_hat_train = X_train @ beta
                y_hat_test = X_test @ beta
                y_hat_pred = X_pred @ beta

            # Back to the original scale of the target
            y_train = y_train * y_std + y_mean
            y_hat_train = y_hat_train * y_std + y_mean
            y_hat_test = y_hat_test * y_std + y_mean
            y_hat_pred = y_hat_pred * y_std + y_mean

            rr_train = 1 - (
                (y_train - y_hat_train)**2).mean() / y_train.std()**2
            rr_test = 1 - ((y_test - y_hat_test)**2).mean() / y_test.std()**2

            if not (
                    rr_train > 0.9 and rr_test > 0.5
            ):  # If the performances are not good enough on the training and the test sets, drop the site
                trash_sites.append((
                    site,
                    'rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'.format(
                        rr_train=rr_train,
                        rr_test=rr_test,
                    )))
                df_corrected_load = df_corrected_load.drop(site, axis=1)
                print(
                    '{0:6} -> drop because prediction not good enough - rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'
                    .format(
                        site,
                        rr_train=rr_train,
                        rr_test=rr_test,
                    ))
                continue

            if bool_plot_corrections:
                plot_tools.plot_corrections(
                    y,
                    dates_unknown,
                    y_hat_pred,
                    os.path.join(
                        prefix_plot,
                        'corrections',
                    ),
                    regressor,
                    rr_test,
                    flags,
                )
            print(
                '{0:6} -> {1:5} values corrected - rr_train = {rr_train:.2} - rr_test = {rr_test:.2}'
                .format(
                    site,
                    len(ind_unknown),
                    rr_train=rr_train,
                    rr_test=rr_test,
                ))
            # Single positional assignment on the frame itself: the chained
            # ``df[site].iloc[...] = ...`` form may write to a temporary copy
            # and leave the frame untouched.
            site_col = df_corrected_load.columns.get_loc(site)
            df_corrected_load.iloc[ind_unknown, site_col] = np.asarray(
                y_hat_pred)

        # Refresh the cache
        df_corrected_load.to_csv(fname_load)
        with open(fname_trash, 'wb') as f:
            pickle.dump(trash_sites, f)

    if bool_plot_trash:
        plot_tools.plot_trash(
            trash_sites,
            df_load,
            os.path.join(
                prefix_plot,
                'trash_sites',
            ),
        )  # Plot the sites that are discarded
    print(
        'done - df_corrected_load.shape = {0} - len(trash_sites) = {1}\n{2}'.
        format(df_corrected_load.shape, len(trash_sites),
               '#' * tools.NB_SIGNS), )
    return df_corrected_load, trash_sites
"""
Train an RF regressor

In the following exercises you'll predict bike rental demand in the Capital Bikeshare program in Washington, D.C using historical weather data from the Bike Sharing Demand dataset available through Kaggle. For this purpose, you will be using the random forests algorithm. As a first step, you'll define a random forests regressor and fit it to the training set.

The dataset is processed for you and split into 80% train and 20% test. The features matrix X_train and the array y_train are available in your workspace.
"""

# Bring the random forest estimator into scope
from sklearn.ensemble import RandomForestRegressor

# Build a 25-tree forest with a fixed seed
rf = RandomForestRegressor(random_state=2, n_estimators=25)

# Train the forest on the training split
rf.fit(X_train, y_train)

"""
Evaluate the RF regressor

You'll now evaluate the test set RMSE of the random forests regressor rf that you trained in the previous exercise.

The dataset is processed for you and split into 80% train and 20% test. The features matrix X_test, as well as the array y_test are available in your workspace. In addition, we have also loaded the model rf that you trained in the previous exercise.
"""

# Alias the mean squared error metric as MSE
from sklearn.metrics import mean_squared_error as MSE
import numpy as np from sklearn.ensemble import RandomForestRegressor from scripts.learn import machineLearning from scripts.learn.machineLearning import tuneNValue # ______________________________________________________ # HOW LONG WILL MY CLASSIFIER TAKE TO RUN ON MY MACHINE? # ______________________________________________________ # # Create a basic classifier classifier = RandomForestRegressor(random_state=43) classifierName = "randomForest" # No spaces, this will be a file name # Add the names of all data files you want to use to this list jsonFileNames = [ 'chicago_weather_sentiment_clean_grouped.json', 'denver_weather_sentiment_clean_grouped.json', 'detroit_weather_sentiment_clean_grouped.json', 'houston_weather_sentiment_clean_grouped.json', 'manhattan_weather_sentiment_clean_grouped.json', 'phoenix_weather_sentiment_clean_grouped.json', 'sanFrancisco_weather_sentiment_clean_grouped.json', 'seattle_weather_sentiment_clean_grouped.json', ] # We want to see how long it will take to train our classifier # This will make a file called <classifierName>_number_data_points.csv def tuneRandomForestNValues(): nValues = [
def _miss_forest(self, Ximp, mask):
    """Impute the missing entries of ``Ximp`` with the missForest algorithm.

    Missing numerical cells are first filled with the stored column means and
    missing categorical cells with the stored encoded column modes. Then, in
    order of missing count, each column with missing cells is predicted by a
    random forest fitted on the rows where that column is observed. The sweep
    repeats while a stopping criterion keeps decreasing and
    ``self.iter_count_ < self.max_iter``.

    Parameters
    ----------
    Ximp : numpy.ndarray or pandas.DataFrame
        2-D data matrix. A DataFrame is replaced by its ``.values``; an
        ndarray is modified in place.
    mask : numpy.ndarray of bool
        Same shape as ``Ximp``; ``True`` marks a missing cell.

    Returns
    -------
    numpy.ndarray
        The matrix after the last executed sweep.

    Notes
    -----
    Sets ``self.iter_count_`` to the number of sweeps executed.
    """

    def shared_forest_params():
        # Hyper-parameters common to the regressor and the classifier. Read
        # lazily so they are only touched when a forest is really built.
        return dict(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
            warm_start=self.warm_start)

    # Count missing per column
    if isinstance(Ximp, pd.DataFrame):
        Ximp = Ximp.values
    col_missing_count = mask.sum(axis=0)

    # Get col and row indices for missing
    missing_rows, missing_cols = np.where(mask)

    # The index arrays may be empty, so "is this feature type present" must
    # be decided by size. An ``is not None`` test is always true for them.
    has_numeric = bool(self.num_idx.size)
    has_categorical = bool(self.cat_idx.size)

    rf_regressor = rf_classifier = n_catmissing = None

    if has_numeric:
        # Only keep indices for numerical vars
        keep_idx_num = np.in1d(missing_cols, self.num_idx)
        missing_num_rows = missing_rows[keep_idx_num]
        missing_num_cols = missing_cols[keep_idx_num]

        # Make initial guess for missing values
        col_means = np.full(Ximp.shape[1], fill_value=np.nan)
        col_means[self.num_idx] = deepcopy(
            self.statistics_.get('col_means'))
        Ximp[missing_num_rows, missing_num_cols] = np.take(
            col_means, missing_num_cols)

        # Reg criterion: a plain string applies to both forests, otherwise
        # element 0 is the regression criterion.
        reg_criterion = self.criterion \
            if isinstance(self.criterion, str) else self.criterion[0]

        # Instantiate regression model
        rf_regressor = RandomForestRegressor(
            criterion=reg_criterion, **shared_forest_params())

    # If needed, repeat for categorical variables
    if has_categorical:
        # Calculate total number of missing categorical values (used later)
        n_catmissing = np.sum(mask[:, self.cat_idx])

        # Only keep indices for categorical vars
        keep_idx_cat = np.in1d(missing_cols, self.cat_idx)
        missing_cat_rows = missing_rows[keep_idx_cat]
        missing_cat_cols = missing_cols[keep_idx_cat]

        # Make initial guess for missing values
        col_modes = np.full(Ximp.shape[1], fill_value=np.nan)
        col_modes[self.cat_idx] = self.encoded_col_modes[self.cat_idx]
        Ximp[missing_cat_rows, missing_cat_cols] = np.take(
            col_modes, missing_cat_cols)

        # Classification criterion: element 1 when a pair was supplied.
        clf_criterion = self.criterion \
            if isinstance(self.criterion, str) else self.criterion[1]

        # Instantiate classification model
        rf_classifier = RandomForestClassifier(
            criterion=clf_criterion,
            class_weight=self.class_weight,
            **shared_forest_params())

    # 2. misscount_idx: sorted indices of cols in X based on missing count
    misscount_idx = np.argsort(col_missing_count)
    # Reverse order if decreasing is set to True
    if self.decreasing is True:
        misscount_idx = misscount_idx[::-1]

    # 3. While new_gammas < old_gammas & self.iter_count_ < max_iter loop:
    self.iter_count_ = 0
    gamma_new = 0
    gamma_old = np.inf
    gamma_newcat = 0
    gamma_oldcat = np.inf
    col_index = np.arange(Ximp.shape[1])

    while (gamma_new < gamma_old or gamma_newcat < gamma_oldcat) and \
            self.iter_count_ < self.max_iter:

        # 4. store previously imputed matrix
        Ximp_old = deepcopy(Ximp)
        if self.iter_count_ != 0:
            gamma_old = gamma_new
            gamma_oldcat = gamma_newcat

        # 5. loop
        for s in misscount_idx:
            # Column indices other than the one being imputed
            s_prime = np.delete(col_index, s)

            # Get indices of rows where 's' is observed and missing
            obs_rows = np.where(~mask[:, s])[0]
            mis_rows = np.where(mask[:, s])[0]

            # If no missing, then skip
            if len(mis_rows) == 0:
                continue

            # Get observed values of 's'
            yobs = Ximp[obs_rows, s]

            # Get 'X' for both observed and missing 's' column
            xobs = Ximp[np.ix_(obs_rows, s_prime)]
            xmis = Ximp[np.ix_(mis_rows, s_prime)]

            # 6. Fit a random forest over observed and predict the missing
            if has_categorical and s in self.cat_idx:
                yobs = yobs.astype('int32')
                rf_classifier.fit(X=xobs, y=yobs)
                # 7. predict ymis(s) using xmis(x)
                ymis = rf_classifier.predict(xmis)
            else:
                yobs = yobs.astype('float32')
                rf_regressor.fit(X=xobs, y=yobs)
                # 7. predict ymis(s) using xmis(x)
                ymis = rf_regressor.predict(xmis)
            # 8. update imputed matrix using predicted matrix ymis(s)
            Ximp[mis_rows, s] = ymis

        # 9. Update gamma (stopping criterion)
        # Categorical: share of originally-missing categorical cells that
        # changed. Skipped when there is nothing to normalise by, which
        # previously divided by None (or by zero).
        if has_categorical and n_catmissing:
            gamma_newcat = np.sum(
                (Ximp[:, self.cat_idx] != Ximp_old[:, self.cat_idx])
            ) / n_catmissing
        # Numerical: squared change relative to the squared magnitude.
        if has_numeric:
            gamma_new = np.sum(
                (Ximp[:, self.num_idx] - Ximp_old[:, self.num_idx]) ** 2
            ) / np.sum((Ximp[:, self.num_idx]) ** 2)

        logger.debug(f"MissForest Coverage Iteration: {self.iter_count_}")
        self.iter_count_ += 1

    # NOTE(review): this returns the matrix from the last sweep, i.e. the one
    # whose criterion stopped improving. Confirm that returning `Ximp` rather
    # than `Ximp_old` is intended.
    return Ximp
poly_reg.fit(X_poly, Y_train) lin_reg_2 = LinearRegression() lin_reg_2.fit(X_poly, Y_train)''' from sklearn.model_selection import cross_val_score from sklearn.svm import SVR model2 = SVR(kernel='rbf') from sklearn.metrics import mean_squared_error from sklearn.metrics import r2_score from sklearn.tree import DecisionTreeRegressor model3 = DecisionTreeRegressor(random_state=50) from sklearn.ensemble import RandomForestRegressor model4 = RandomForestRegressor(n_estimators=300, random_state=50) from xgboost import XGBRegressor model5 = XGBRegressor() from sklearn.neighbors import KNeighborsRegressor model6 = KNeighborsRegressor(n_neighbors=15) from sklearn.linear_model import Ridge, Lasso model7 = Ridge() model8 = Lasso() from vecstack import stacking
columns=[KOI_DISPOSITION, KOI_PDISPOSITION], drop_first=True) #------------------------------------------------------------------------------ #------------------------------------------------------------------------------ # #Test and train data splitting y = data[KOI_SCORE] x = data.drop([KOI_SCORE], axis=1) x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=9) # #Definition of the models to include in the analysis ad optimization process models = { "RandomForest": RandomForestRegressor(n_estimators=200), "Gradient Boosting": GradientBoostingRegressor(), #"K Neighbors": KNeighborsRegressor(), #"Decision Tree": DecisionTreeRegressor(), "Neural Network": MLPRegressor((20, 20, 20), max_iter=1000, random_state=1) } train_accuracies = pd.DataFrame(index=models.keys(), columns=[AVERAGE, PCT_STANDARD_DEVIATION]) test_accuracies = pd.DataFrame(index=models.keys(), columns=[AVERAGE, PCT_STANDARD_DEVIATION]) # #------------------------------------------------------------------------------ for model_name, model in models.items(): cv = cross_validate(model, x, y, cv=50, n_jobs=-1) avg_train_accuracy = (np.mean(
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values) #target column i.e price range # RF using Normalisation | accuracy = 91.45863 # n_estimators = 50, log2, acc = 91.46870 # n_estimators = 50, log2, acc = 91.48331 - considering time from sklearn.ensemble import RandomForestRegressor x = st.slider('Choose number of estimators for Random Forest Algorithm', min_value=10, max_value=150) regressorRF_Norm = RandomForestRegressor(n_estimators=x, random_state=0, max_features="log2", oob_score=True) if st.button('Train Random Forest model'): regressorRF_Norm.fit(train_normalized, y) y_pred_rf_Norm = regressorRF_Norm.predict(test_normalized) st.subheader('Predictions are:') st.write(y_pred_rf_Norm) # #out_norm = pd.DataFrame(y_pred_rf_Norm,columns=['air_pollution_index']) #out_norm.to_csv('submission_norm_t.csv',sep=',') # ends here
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a small random forest.

    A 10-tree ``RandomForestRegressor`` seeded with ``random_state=0`` is
    fitted on ``(X_train, y_train)`` and scored on ``(X_valid, y_valid)``.
    """
    # `fit` returns the estimator itself, so it can be chained directly.
    forest = RandomForestRegressor(
        n_estimators=10, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))
# Hold out 30% of the samples for testing; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=1, test_size=0.3)

# AI models
#  - Linear regression
#  - RandomForest (decision trees)
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

rf_reg = RandomForestRegressor()
lin_reg = LinearRegression()
rf_reg.fit(x_train, y_train)
lin_reg.fit(x_train, y_train)

# Predictions of both models on the held-out set.
test_pred_lin = lin_reg.predict(x_test)
test_pred_rf = rf_reg.predict(x_test)

# R² --> 0% --- 100%
mse_lin = metrics.mean_squared_error(y_test, test_pred_lin)
r2_lin = metrics.r2_score(y_test, test_pred_lin)
print(f"R² da Regressão Linear: {r2_lin}")
print(f"MSE da Regressão Linear: {mse_lin}")
# Feature columns are selected by position; the target is `y2`.
X = train.iloc[:, [2, 4, 5, 6, 9, 10]]
Y = y2
#Y = y1

# Hold out 20% of the rows for validation; the split is seeded.
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
scoring = 'neg_mean_squared_error'

# Spot Check Algorithms
models = [
    ('LR', LinearRegression()),
    ('RF', RandomForestRegressor()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVM', SVR()),
]

# evaluate each model in turn
results = []
names = []
for name, model in models:
    # KFold only uses `random_state` when shuffle=True, and current
    # scikit-learn raises ValueError if it is passed without shuffling.
    # Dropping it keeps the same contiguous, unshuffled 10 folds.
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold,
                                                 scoring=scoring)
def evaluateModel(self, technique, train, test):
    """Fit the regressor named by ``technique`` and report its test metrics.

    Parameters
    ----------
    technique : str
        One of ``'Linear Regression'``, ``'Kernel Ridge'``,
        ``'Ridge Regression'``, ``'Decision Tree'``, ``'Random Forest'`` or
        ``'Gaussian Process'``.
    train, test : 2-D array
        Rows are samples; the last column is the target and the remaining
        columns are features. The first two feature columns are each divided
        by their own maximum within the respective set.

    Returns
    -------
    list
        ``[self.split * 100, r2_score_value]``.

    Raises
    ------
    ValueError
        If ``technique`` is not one of the supported names.
    """
    if technique == 'Linear Regression':
        # Create linear regression object
        regr = linear_model.LinearRegression()
    elif technique == 'Kernel Ridge':
        regr = KernelRidge(alpha=1.0)
    elif technique == 'Ridge Regression':
        regr = linear_model.Ridge(alpha=.7)
    elif technique == 'Decision Tree':
        regr = tree.DecisionTreeRegressor(max_depth=10, min_samples_leaf=1)
    elif technique == "Random Forest":
        regr = RandomForestRegressor()
    elif technique == 'Gaussian Process':
        kernel = RationalQuadratic(length_scale=1.0, alpha=100)
        regr = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                         alpha=1)
    else:
        # Previously fell through and failed later with an unbound `regr`.
        raise ValueError('Unknown regression technique: %r' % (technique,))

    print('Total dataset size: ', len(self.data_array))
    print('Train points', len(train))
    print('Test points', len(test))

    # Work on float copies: slicing returns views, so scaling in place would
    # rewrite the caller's arrays (and truncate the result on integer arrays).
    train_data_X = train[:, :-1].astype(float)
    test_data_X = test[:, :-1].astype(float)
    train_data_Y = train[:, -1]
    test_data_Y = test[:, -1]

    # Scale the first two feature columns by their per-set maximum.
    # NOTE(review): the test set is scaled by its own maxima, not the
    # training maxima - confirm this is intended.
    for features in (train_data_X, test_data_X):
        for col in (0, 1):
            features[:, col] = features[:, col] / np.max(features[:, col])

    # Train the model using the training sets
    regr.fit(train_data_X, train_data_Y)

    # Make predictions using the testing set
    test_data_Y_predictions = regr.predict(test_data_X)
    print(
        "________________--------------------_________________------------------------"
    )
    print(test_data_Y_predictions)

    print('--------Started-----------')
    for actual, pred in zip(test_data_Y, test_data_Y_predictions):
        print('Actual ' + str(actual) + ', predict ' + str(pred))
    print('--------Ended-----------')

    mean_sq_error = mean_squared_error(test_data_Y, test_data_Y_predictions)
    r2_score_value = r2_score(test_data_Y, test_data_Y_predictions)

    # The mean squared error
    print("Mean squared error: %.2f" % mean_sq_error)
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score_value)

    # RMSE divided by the mean of the actual and predicted values together.
    test_concat = np.concatenate([test_data_Y, test_data_Y_predictions])
    normalized_mean_squared_error = sqrt(mean_sq_error) / (
        sum(test_concat) / len(test_concat))
    print(normalized_mean_squared_error)

    return [self.split * 100, r2_score_value]
def validateRF():
    """
    run KFOLD method for regression

    For each selected tide-gauge CSV in ``dir_in``: load the lagged
    predictors, add squared and cubed wind terms, standardize them, merge
    with the daily surge series on ``date``, reduce with PCA (95% variance)
    and score a random forest with 10-fold cross-validation. The running
    summary table is written to ``dir_out`` under the gauge's file name.

    Returns ``"file already analyzed!"`` as soon as a gauge already has an
    output file; otherwise returns ``None``. Changes the process working
    directory as a side effect.
    """
    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraRFValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    summary_cols = ['tg', 'lon', 'lat', 'num_year',
                    'num_95pcs', 'corrn', 'rmse']

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    # index range (into the sorted file list) of the gauges to process
    x = 113
    y = 114

    #empty dataframe for model validation
    df = pd.DataFrame(columns=summary_cols)

    #filter only .csv files (listed once; dir_in is the cwd here)
    tg_files = sorted(glob.glob("*.csv"))

    #looping through
    for tg in range(x, y):
        tg_name = tg_files[tg]
        print(tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            print("this tide gauge is already taken care of")
            # NOTE(review): this ends the whole run, not just this gauge;
            # kept because callers may rely on the returned string.
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        wndTerms = pred.columns[
            [col.startswith('wnd') for col in pred.columns]]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data (every column after the first one)
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized],
                                      axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        surge_time = pd.DataFrame(
            [str(datetime.strptime(day, '%Y-%m-%d')) for day in surge['ymd']],
            columns=['date'])
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2],
                              on='date', how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            # the original adjacent literals 'don' 't' printed "dont"
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue

        pred_surge['date'] = pd.DataFrame(
            [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
             for stamp in pred_surge['date']],
            columns=['date'])

        #prepare data for training/testing
        # first column is the date, last column is the merged-in surge
        X = pred_surge.iloc[:, 1:-1]
        target = pd.DataFrame(pred_surge['surge'])
        target = target.reset_index()
        target.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        # KFold only uses `random_state` when shuffle=True, and current
        # scikit-learn raises ValueError if it is given without shuffling.
        # Omitting it keeps the same contiguous, unshuffled folds.
        kf = KFold(n_splits=10)

        metric_corr = []
        metric_rmse = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train = target['surge'][train_index]
            y_test = target['surge'][test_index]

            #train regression model
            rf = RandomForestRegressor(n_estimators=50, random_state=101,
                                       min_samples_leaf=1)
            rf.fit(X_train, y_train)

            #predictions
            predictions = rf.predict(X_test)

            #evaluation matrix - check p value
            # computed once per fold: index 0 is r, index 1 is the p-value
            fold_corr = stats.pearsonr(y_test, predictions)
            if fold_corr[1] >= 0.05:
                print("insignificant correlation!")
                continue

            fold_rmse = np.sqrt(
                metrics.mean_squared_error(y_test, predictions))
            print(fold_corr)
            metric_corr.append(fold_corr[0])
            print(fold_rmse)
            print()
            metric_rmse.append(fold_rmse)

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0] - 1] -
                     pred_surge['date'][0]).days / 365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        # averages over the folds that passed the significance check
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc,
              'avg_corr = ', corr, ' - avg_rmse (m) = ', rmse, '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = summary_cols
        df = pd.concat([df, new_df], axis=0)

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)