def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
        if "depth" in parameters:
            model = RandomForestRegressor(max_depth=parameters["depth"], random_state=42, n_estimators=parameters["n_estimators"], n_jobs=-1)
        elif "leaf" in parameters:
            model = RandomForestRegressor(min_samples_leaf=parameters["leaf"], random_state=42, n_estimators=parameters["n_estimators"], n_jobs=-1)
        elif "max_leaf" in parameters:
            model = RandomForestRegressor(max_leaf_nodes=parameters["max_leaf"], random_state=42, n_estimators=parameters["n_estimators"], n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
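# Usage sketch for evalOne above (hedged: assumes the module-level `locations`, `data`, and
# `all_features` it reads are already populated; the dict keys mirror evalOne's branches):
depth_rmse = evalOne({"depth": 10, "n_estimators": 300})
leaf_rmse = evalOne({"leaf": 2, "n_estimators": 650})
max_leaf_rmse = evalOne({"max_leaf": 500, "n_estimators": 300})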
def test_boston_housing_load_save_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    num_models_settings = [1, 2]
    for num_models in num_models_settings:
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss, num_models=num_models)
        explainer.fit(x_train, y_train)
        median_1 = explainer.predict(x_test)
        tmp_dir_name = tempfile.mkdtemp()
        explainer.save(tmp_dir_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir_name, overwrite=False)
        explainer.save(tmp_dir_name, overwrite=True)
        explainer.load(tmp_dir_name)
        median_2 = explainer.predict(x_test)
        self.assertTrue(np.array_equal(median_1, median_2))
        shutil.rmtree(tmp_dir_name)  # Cleanup.
def test_boston_housing_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)
    explainer.fit(x_train, y_train)
    self.assertEqual(explainer.prediction_model.output_shape, (None, np.prod(x_test.shape[1:])))
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
def eval_one(step):
    if step in cached_results:
        return cached_results[step]
    eval_features = []
    for i in range(0, len(all_features)):
        if step[i]:
            eval_features.append(all_features[i])
    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, eval_features, "target")
        model = RandomForestRegressor(min_samples_leaf=2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    rmse = rmseEval(all_observations, all_predictions)[1]
    cached_results[step] = rmse
    # save down the cached result
    cache_output = open(CACHE_FILE, "a")
    step_list = [str(s) for s in step]
    step_str = ",".join(step_list)
    cache_output.write(str(rmse) + ";" + step_str + "\n")
    cache_output.close()
    return rmse
def post(self):
    # upload the audio file to the server
    voice = self.request.files["audio"][0]
    extn = os.path.splitext(voice['filename'])[1]
    fnm = os.path.splitext(voice['filename'])[0]
    cname = str(uuid.uuid4()) + extn
    fh = open(__UPLOADS__ + cname, 'wb')  # binary mode: the uploaded body is raw bytes
    fh.write(voice['body'])
    fh.close()
    # get features from the audio file
    attr = getAttributes(cname)
    fdf = mongoTolist(False)
    train = fdf[:, :-1]
    target = fdf[:, -1]
    # RandomForest regression
    rf = RandomForestRegressor(n_estimators=506, n_jobs=-1)
    rf.fit(train, target)
    updrs_val = rf.predict([attr])
    attr.append(updrs_val[0])
    # get the thetas from the database
    theta = list(db.theta.find({}))
    theta1 = theta[0]["theta1"]
    theta2 = theta[1]["theta2"]
    # check if the person has Parkinson's disease
    isParkinson = octave.classify(theta1, theta2, np.array(attr))
    self.render("output.html", ipk=isParkinson, updrs=updrs_val[0])
class RandomForestRegressorImpl():
    def __init__(self, n_estimators=10, criterion='mse', max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                 bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0,
                 warm_start=False):
        self._hyperparams = {
            'n_estimators': n_estimators,
            'criterion': criterion,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'min_weight_fraction_leaf': min_weight_fraction_leaf,
            'max_features': max_features,
            'max_leaf_nodes': max_leaf_nodes,
            'min_impurity_decrease': min_impurity_decrease,
            'min_impurity_split': min_impurity_split,
            'bootstrap': bootstrap,
            'oob_score': oob_score,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose,
            'warm_start': warm_start}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)
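# Minimal usage sketch for the wrapper class above (assumes SKLModel is bound to
# sklearn.ensemble.RandomForestRegressor and that X_train, y_train, X_test already exist):
impl = RandomForestRegressorImpl(n_estimators=100, max_depth=8, n_jobs=-1, random_state=0)
impl.fit(X_train, y_train)
y_pred = impl.predict(X_test)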
def eval_one(min_samples_leaf, n_estimators):
    log("min_samples_leaf: " + str(min_samples_leaf) + ", n_estimators: " + str(n_estimators))
    all_observations = []
    all_pred_ALL = []
    for group in range(0, len(groups)):
        trainStations = []
        for i in range(0, len(groups)):
            if i != group:
                trainStations.extend(groups[i])
        testStations = groups[group]
        train_station_set = set([float(s) for s in trainStations])
        test_station_set = set([float(s) for s in testStations])
        trainX, testX, trainY, testY = splitDataForXValidation(train_station_set, test_station_set, "location", data, all_features, "target")
        model = RandomForestRegressor(min_samples_leaf=min_samples_leaf, n_estimators=n_estimators, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction_ALL = model.predict(testX)
        rmse = rmseEval(testY, prediction_ALL)[1]
        log("\tALL rmse: " + str(rmse))
        all_observations.extend(testY)
        all_pred_ALL.extend(prediction_ALL)
    rmse = rmseEval(all_observations, all_pred_ALL)[1]
    log("\tALL rmse:" + str(rmse))
    return rmse
def test_boston_housing_no_fit_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)
    with self.assertRaises(AssertionError):
        explainer.predict(x_test, y_test)
    with self.assertRaises(AssertionError):
        explainer.score(x_test, y_test)
def trainRandomForest(data, columns, targetColumn, parameters):
    modelColumns = []
    for column in columns:
        if column != targetColumn:
            modelColumns.append(column)
    modelData = []
    for i in range(0, len(data[targetColumn])):
        record = []
        for column in modelColumns:
            record.append(data[column][i])
        modelData.append(record)
    if "depth" in parameters:
        model = RandomForestRegressor(max_depth=parameters["depth"], n_estimators=parameters["estimators"], n_jobs=-1, random_state=42)
    elif "leaf" in parameters:
        model = RandomForestRegressor(min_samples_leaf=parameters["leaf"], n_estimators=parameters["estimators"], n_jobs=-1, random_state=42)
    model.fit(modelData, data[targetColumn])
    return RandomForestModel(model, modelColumns)
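# Usage sketch for trainRandomForest above (hedged: `data` is assumed to be a dict of
# equal-length column lists, as the indexing in the function implies; the parameters dict
# needs "estimators" plus either "depth" or "leaf"):
depth_model = trainRandomForest(data, columns, "target", {"depth": 10, "estimators": 300})
leaf_model = trainRandomForest(data, columns, "target", {"leaf": 2, "estimators": 650})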
def RandomForest(x_train, y_train, x_test, degree):
    # Note: min_samples_split must be an int >= 2 (or a float in (0, 1]) in scikit-learn;
    # the original value of 1 is rejected by current versions.
    params = {'n_estimators': 1000, 'max_depth': degree, 'min_samples_split': 2, 'warm_start': True}
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    #plt.plot(x_test, y_predict, color='red')
    return y_predict
def randomforestregressor(self, testlen, ntrain, ntrees, nodes):
    hsmadata = self.hsmadata
    dates = pd.Series(hsmadata['date'].unique()).sort_values()
    dates.index = range(0, len(dates))
    ntest = len(dates) // testlen
    hsma = pd.DataFrame()
    for i in range(ntrain, ntest):
        traindata = hsmadata[(hsmadata['date'] >= dates[(i - ntrain) * testlen]) &
                             (hsmadata['date'] < dates[i * testlen - self.day])].copy()
        testdata = hsmadata[(hsmadata['date'] >= dates[i * testlen]) &
                            (hsmadata['date'] < dates[(i + 1) * testlen])].copy()
        traindata = traindata.iloc[:, 2:]
        traindatax = traindata.drop(['closeratio'], 1)
        traindatay = traindata['closeratio']
        testdatax = testdata[traindatax.columns]
        treemodel = RandomForestRegressor(n_estimators=ntrees, min_samples_split=nodes * 2, min_samples_leaf=nodes)
        treemodel.fit(traindatax, traindatay)
        testdata['predratio'] = treemodel.predict(testdatax)
        hsma = pd.concat([hsma, testdata], ignore_index=True)
    return hsma
def test_shap_summary(self):
    data = self.iris.copy()
    widget = self.widget
    rf = SKL_RF(n_estimators=10)
    model = RandomForestRegressor(rf)
    rf.fit(data.X, data.Y)
    #self.send_signals([(widget.Inputs.data, data), (widget.Inputs.model, model)])
def test_boston_housing_confidence_level_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=3, early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    num_models = 2
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss, num_models=num_models)
    explainer.fit(x_train, y_train)
    invalid_confidence_levels = [1.01, -0.5, -0.01]
    for confidence_level in invalid_confidence_levels:
        with self.assertRaises(ValueError):
            explainer.predict(x_test, confidence_level=confidence_level)
def randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures='log2', nTree=100):
    ## Settings of random forests regressor
    regModel = RandomForestRegressor(n_estimators=nTree, max_features=maxFeatures)
    ## Train the random forests regressor
    regModel.fit(trainFeatures, trainResponses)
    ## Prediction
    testResponsesPred = regModel.predict(testFeatures)
    return testResponsesPred
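# Usage sketch for the helper above (assumes trainFeatures/testFeatures are 2-D arrays and
# trainResponses is a 1-D array of targets):
testResponsesPred = randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures='sqrt', nTree=200)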
def evalTrainStationTestStation(trainStation, testStation, features):
    trainX, _, trainY, _ = splitDataForXValidation(set([trainStation]), set(), "location", dataByStation[trainStation], features, "target")
    _, testX2, _, testY2 = splitDataForXValidation(set(), set([testStation]), "location", dataByStation[testStation], features, "target")
    model = RandomForestRegressor(max_depth=10, n_estimators=60, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    prediction = model.predict(testX2)
    rmse = rmseEval(testY2, prediction)[1]
    print("Training on station " + str(trainStation) + ", applying on station " + str(testStation) + ": rmse: " + str(rmse))
    return rmse
def RF_Model(Scaled_Input_Data, Output_Data):
    T0 = time.time()
    n = len(Scaled_Input_Data)
    RFModel = RandomForestRegressor()
    RFModel.fit(Scaled_Input_Data, Output_Data)
    RF_Time = time.time() - T0
    print('The computational time of Random Forest Regression for ', n, ' examples is: ', RF_Time)
    # Leave-one-out cross-validation. The "mean_absolute_error" scorer returns negated values,
    # hence the sign flip below; note this is MAE, not MSE as the original labels suggested.
    MAEs_RF = cross_validation.cross_val_score(RFModel, Scaled_Input_Data, Output_Data,
                                               cv=cross_validation.LeaveOneOut(n),
                                               scoring="mean_absolute_error")
    MeanMAE_RF = np.mean(list(MAEs_RF))
    print('The average MAE of Random Forest Regression for ', n, ' examples is: ', (-1 * MeanMAE_RF))
    return (MeanMAE_RF, RFModel)
def run(self):
    print("Reading device separations...")
    indexes = np.load("indexesTrain.npy")
    self.train = self.train.values
    print("Getting attributes...")
    trainFeatures = [self.getMainFeatures(self.train, indexes, i) for i in range(len(indexes))]
    for i in range(len(indexes)):
        (trainVect, targetVect) = self.getAttributes(trainFeatures, indexes, i)
        classifier = RandomForestRegressor(n_estimators=500, verbose=2, n_jobs=4, random_state=1)
        classifier.fit(trainVect, targetVect)
        # pickle needs a binary-mode file handle
        pickle.dump(classifier, open("models/models" + str(i) + ".mod", "wb"))
def rf_lc(self, trainX, trainY):
    trainY_t = self.target_transform(trainY)
    clf = RandomForestRegressor(**self.rf_hyper)
    clf.fit(trainX, trainY_t)
    (mean_train, mean_test, max_train, max_test) = self.compute_error(clf, trainX, trainY)
    print("mean_train err, mean_test err, followed by max: ", (mean_train, mean_test, max_train, max_test))
    self.log((mean_train, mean_test, max_train, max_test))
    return (clf, mean_train, mean_test, max_train, max_test)
def train_model(X_train, y_train):
    model = RandomForestRegressor(n_estimators=50, criterion='mse', max_features='auto', max_depth=25,
                                  min_samples_split=1e-4, min_samples_leaf=1e-5, n_jobs=-1, verbose=10)
    model.fit(X_train, y_train)
    return model
def randomForestFeatures(df, X_train, y_train):
    """
    INPUT: A dataframe, X_train, y_train
    OUTPUT: A list of tuples ranking the features by importance in the fitted random forest
    PURPOSE: To identify, in a robust easy-to-use way, which features are most relevant
    """
    names = df.iloc[:, 1:-1].columns
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    tups = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names), reverse=True)
    return tups
def train_model(X_train, y_train):
    print("training the model ...")
    rf = RandomForestRegressor(n_estimators=500, max_depth=5, n_jobs=-1, verbose=2)
    rf.fit(X_train, y_train)
    y_pred_train = rf.predict(X_train)
    # mean_squared_error returns MSE (no square root), so label it accordingly
    print(".. training MSE : {:0.3f} %".format(mean_squared_error(y_train, y_pred_train) * 100))
    #print(".. training R2 : {:0.3f} %".format(r2_score(y_train, y_pred_train) * 100))
    print(".. training MAE : {:0.3f} %".format(mean_absolute_error(y_train, y_pred_train) * 100))
    return rf
def RandomForest(weiboid, x_train, y_train, x_test, y_test, d):
    params = {
        'n_estimators': 1000,
        'max_depth': d,
        'min_samples_split': 2,  # must be >= 2 in scikit-learn (the original value of 1 is rejected)
        'warm_start': True,
        'oob_score': True
    }
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    r = rmse(y_test, y_predict)
    #fig(weiboid, y_test, y_predict)
    return y_predict, r
def RandomForest(x_train, y_train, x_test, y_test):
    degree = [1, 2, 3, 4, 7]
    result = {}
    rmse_list = []
    for d in degree:
        # min_samples_split must be >= 2 in scikit-learn (the original value of 1 is rejected)
        params = {'n_estimators': 1000, 'max_depth': d, 'min_samples_split': 2, 'warm_start': True}
        clf = RandomForestRegressor(**params)
        clf.fit(x_train[:, np.newaxis], y_train)
        y_predict = clf.predict(x_test[:, np.newaxis])
        rmsevalue = rmse(y_test, y_predict)
        result[rmsevalue] = [y_predict, d]
        rmse_list.append(rmsevalue)
    rmseMin = min(rmse_list)
    # return the best RMSE together with its prediction/depth (the original returned the last rmsevalue)
    return rmseMin, result[rmseMin]
def test_overwrite_ensemble_model_invalid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    model_builder = MLPModelBuilder()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    masking_operation = ZeroMasking()
    loss = binary_crossentropy
    num_models = 5
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss, num_models=num_models)
    file_names = [
        CXPlain.get_config_file_name(),
        CXPlain.get_explained_model_file_name(".pkl"),
        CXPlain.get_loss_pkl_file_name(),
        CXPlain.get_model_builder_pkl_file_name(),
        CXPlain.get_masking_operation_pkl_file_name()
    ]

    # Test with untrained explanation model.
    for file_name in file_names:
        tmp_dir = TestExplanationModel.make_at_tmp(file_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir, overwrite=False)

    # Test with trained explanation model.
    explainer.fit(x_train, y_train)
    file_names = [
        CXPlain.get_config_file_name(),
        CXPlain.get_explained_model_file_name(".pkl"),
        CXPlain.get_loss_pkl_file_name(),
        CXPlain.get_model_builder_pkl_file_name(),
        CXPlain.get_masking_operation_pkl_file_name()
    ] + [
        CXPlain.get_prediction_model_h5_file_name(i) for i in range(num_models)
    ]
    for file_name in file_names:
        tmp_dir = TestExplanationModel.make_at_tmp(file_name)
        with self.assertRaises(ValueError):
            explainer.save(tmp_dir, overwrite=False)
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    predictionData = {}
    for c in columns2:
        predictionData[c] = []
    # modelling
    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=2, n_estimators=650, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))
        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])
    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))
    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))
    print("Writing out results...")
    output = open(outputFileName, 'w')
    output.write(','.join([str(x) for x in columns2]))
    output.write("\n")
    for i in range(0, len(predictionData['target'])):
        output.write(str(predictionData[columns2[0]][i]))
        for j in range(1, len(columns2)):
            output.write(",")
            output.write(str(predictionData[columns2[j]][i]))
        output.write("\n")
    output.close()
    print("Done...")
def test_boston_housing_valid(self):
    (x_train, y_train), (x_test, y_test) = TestUtil.get_boston_housing()
    explained_model = RandomForestRegressor(n_estimators=64, max_depth=5, random_state=1)
    explained_model.fit(x_train, y_train)
    model_builder = MLPModelBuilder(num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                    batch_size=32, learning_rate=0.001, num_epochs=3, early_stopping_patience=128)
    masking_operation = ZeroMasking()
    loss = mean_squared_error
    for num_models in [2, 5, 10]:
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss, num_models=num_models)
        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median, confidence = explainer.predict(x_test, confidence_level=0.95)
        self.assertTrue(median.shape == x_test.shape)
        self.assertTrue(confidence.shape == x_test.shape + (2,))

        # Flatten predictions for iteration below.
        median = median.reshape((len(x_test), -1))
        confidence = confidence.reshape((len(x_test), -1, 2))
        for sample_idx in range(len(x_test)):
            for feature_idx in range(len(x_test[sample_idx])):
                self.assertTrue(confidence[sample_idx][feature_idx][0] <=
                                median[sample_idx][feature_idx] <=
                                confidence[sample_idx][feature_idx][1])
                self.assertTrue(confidence[sample_idx][feature_idx][0] >= 0)
                self.assertTrue(confidence[sample_idx][feature_idx][1] >= 0)
def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    # Pass the tuning arguments through to the regressor (the original accepted them but ignored them).
    alg = RandomForestRegressor(n_estimators=estimators, max_depth=depth, min_samples_split=splits, random_state=1)
    alg.fit(train_set[predictors], train_target)
    #importances = alg.feature_importances_
    #print("Original ", numpy.argsort(importances))
    #indices = numpy.argsort(importances)[::-1]
    #print(" importances ", importances)
    #print(" indices ", indices)
    #for f in range(train_set.shape[1] - 2):
    #    print("%2d) %-*s %f" % (f + 1, 30, predictors[indices[f]], importances[indices[f]]))
    predictions = alg.predict(test_set[predictors])
    return predictions
class QuantileForestRegression(absmodel.Module):
    def __init__(self, n_estimator=500):
        super(QuantileForestRegression, self).__init__()
        self.model = RandomForestRegressor(n_estimators=n_estimator)
        self.fitted = False

    def _fit(self, x, y, verbose=False, load=False):
        return self.model.fit(x, y)

    def predict(self, x, y, label=None):
        d, up = self.pred_ints(model=self.model, x=x)
        return d, up

    def pred_ints(self, model, x, percentile=95):
        err_down = []
        err_up = []
        for i in range(len(x)):
            preds = []
            for pred in model.estimators_:
                # each tree expects a 2-D array of shape (1, n_features) for a single sample
                preds.append(pred.predict(np.asarray(x[i]).reshape(1, -1))[0])
            err_down.append(np.percentile(preds, (100 - percentile) / 2.))
            err_up.append(np.percentile(preds, 100 - (100 - percentile) / 2.))
        return err_down, err_up
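# Usage sketch for QuantileForestRegression above (hedged: absmodel.Module's public fit entry
# point is not shown in this snippet, so _fit is called directly; x_train, y_train, x_test are
# assumed to be numpy arrays):
qf = QuantileForestRegression(n_estimator=500)
qf._fit(x_train, y_train)
lower, upper = qf.predict(x_test, None)  # the y argument is unused by this implementation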
def calcRandomForest(channels_training, channels_testing, target_training, target_testing):
    clf = RandomForestRegressor(n_estimators=500, max_features=len(channels_training[0]))
    clf = clf.fit(channels_training, target_training)
    predictions = clf.predict(channels_testing)
    comp = [predictions, target_testing]
    return clf, comp
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    print("Loading data...")
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
    y = np.array(train_data[["ACTION"]]).ravel()
    #X = np.array(train_data.ix[:, 1:-1])  # Ignores ACTION, ROLE_CODE
    X = np.array(train_data[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME",
                             "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_DEPTNAME", "ROLE_CODE"]])
    X_test = np.array(test_data[["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2", "ROLE_DEPTNAME",
                                 "ROLE_FAMILY_DESC", "ROLE_FAMILY", "ROLE_DEPTNAME", "ROLE_CODE"]])  # Ignores ID, ROLE_CODE
    SEED = 4
    #clf = DecisionTreeClassifier(criterion="entropy").fit(X, y)
    # Note: min_density and compute_importances were removed from scikit-learn long ago;
    # feature_importances_ is available on any fitted forest.
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15).fit(X, y)
    print(clf.feature_importances_)  # Try feature selection

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.10, random_state=i * SEED)
        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        clf.fit(X_train, y_train)
        preds = clf.predict(X_cv)

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc
    print("Mean AUC: %f" % (mean_auc / n))

    predictions = clf.predict(X_test)  # fixed: was clf.predict_
    #print(predictions)
    #print('Writing predictions to %s...' % (output_file))
    create_test_submission(output_file, predictions)
    return 0
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014, data2013, data2014):
    columns = []
    for c in data2013:
        columns.append(c)
    columns.remove("location")
    columns.remove("timestamp")
    columns.remove("target")

    X = []
    y = []
    for i in range(0, len(data2013["target"])):
        timestamp = str(int(data2013["timestamp"][i]))
        weekC = timestampWeekCategory[timestamp]
        if int(weekC) >= week:
            y.append(data2013["target"][i])
            x = []
            for c in columns:
                x.append(data2013[c][i])
            X.append(x)
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(X, y)
    # print(str(len(X)))

    X = []
    y = []
    for i in range(0, len(data2014["target"])):
        y.append(data2014["target"][i])
        x = []
        for c in columns:
            x.append(data2014[c][i])
        X.append(x)
    prediction = model.predict(X)
    rmse = rmseEval(y, prediction)
    return rmse
def run(self):
    # extract data from the batch
    df_train = pd.read_csv(self.input().path, header=[0, 1])
    X, y = preprocess2(df_train, snr=10.)

    # train regressor
    reg = RandomForestRegressor(10, min_samples_leaf=10, max_depth=9, n_jobs=-1)
    # reg = KNeighborsRegressor(algorithm="auto")
    # reg = LinearRegression()
    # reg = sklearn.svm.SVR(kernel="rbf", degree=3, C=100., gamma=10.)
    # reg = LinearSaO2Unmixing()
    reg.fit(X, y.values)
    # reg = LinearSaO2Unmixing()

    # save regressor
    regressor_file = self.output().open('w')
    pickle.dump(reg, regressor_file)
    regressor_file.close()
def make_models(self, missing_columns):
    available_table = self.full_table.copy()
    # clear out the table
    for column in missing_columns:
        del available_table[column]
    available_features = available_table.as_matrix()
    clfs = {}
    # build a model for each missing column
    for column in missing_columns:
        labels = self.full_table.as_matrix(columns=[column])
        labels = np.reshape(labels, (len(labels)))  # unnest the arrays
        clf = RandomForestRegressor(n_estimators=100)
        clf.fit(available_features, labels, available_table['WGTP'])
        clfs[column] = clf
    return clfs
def eval_one(features):
    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, features, "target")
        model = RandomForestRegressor(min_samples_leaf=2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)
    rmse = rmseEval(all_observations, all_predictions)[1]
    log("\tRMSE: " + str(rmse))
def modeltrain(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestRegressor  # sklearn.ensemble.forest is a removed private module
    # Build the model
    RF_Model = RandomForestRegressor(n_estimators=100, max_features=1)
    # Fit the model on X_train and y_train
    rgr = RF_Model.fit(X_train, y_train)
    y_train_predict = (rgr.predict(X_train)).astype(int)
    y_test_predict = (rgr.predict(X_test)).astype(int)
    return y_train_predict, y_test_predict, rgr
def test_fmt_sklearn_preds_regression(self):
    """test fmt_sklearn_preds on regression case"""
    modelobj_regr = RandomForestRegressor()
    model_df = self.df.loc[:, self.df.columns != 'target']
    modelobj_regr.fit(model_df, self.df.loc[:, 'target'])
    fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_regr, 'predict'), modelobj_regr,
                                                       model_df, self.df, 'target', 'regression')
    self.assertIn('predictedYSmooth', fmtd_outputs.columns.values,
                  """fmt_sklearn_preds on regression case does not return predictions""")
def predict_per_cpu_full():
    data, target = load_data()
    data, target, labels = normalize_data(data, target)
    data = data[['C0', 'cpuFull']]
    data['target'] = target
    split_by_types = dict()
    cpu_groups = data.groupby('cpuFull')
    for name, group in cpu_groups:
        # reshape via .values: calling .reshape directly on a Series was removed from pandas
        X_train, X_test, y_train, y_test = train_test_split(group['C0'].values.reshape(-1, 1), group['target'])
        split_by_types[str(name)] = {
            'train': {'data': X_train, 'target': y_train},
            'test': {'data': X_test, 'target': y_test}
        }
    # print(split_by_types)
    summ = 0.0
    for cpu, data_set in split_by_types.items():
        plt.figure()
        # reg = SGDRegressor(loss='huber', n_iter=100, alpha=0.0)
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        print(mape(data_set['test']['target'], y_pred), cpu)
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g', label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
def train(data, val_ind, indices):
    max_numb = val_ind.shape[1]
    regs = []
    for i in range(max_numb):
        regs.append(0)
    for i in indices:
        # print i
        # reg = sklearn.linear_model.Lasso(max_iter=3000)
        reg = RandomForestRegressor()
        # reg = skl.tree.DecisionTreeRegressor()
        # reg = skl.linear_model.LinearRegression()
        # reg = AdaBoostRegressor()
        # print val_ind.shape
        # print val_ind[:,i]
        # print data.shape
        # print data[0]
        # print len(val_ind[:,i])
        reg.fit(data, val_ind[:, i])
        regs[i] = reg
    return regs
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # sklearn.ensemble.forest is a removed private module
import time

dset = pd.read_csv("./data/concrete_data.csv")
X = dset.iloc[:, 0:7]  # note: column 7 is not used as a feature here
y = dset.iloc[:, 8]

estimator = RandomForestRegressor(max_features=3, n_estimators=50, n_jobs=1, oob_score=True)
t0 = time.time()
estimator.fit(X, y)
print(time.time() - t0)
# fit a linear model with no bells and whistles
model = linear_model.LinearRegression()
model.fit(train_X, train_Y)

# look at the r squared on the training data and the test data
model.score(train_X, train_Y)
model.score(test_X, test_Y)

# See if I can get the r squared on the test data higher by using more complex models

# random forest
forest = RandomForestRegressor()

# fit the data without using cross val to select parameters
# note that train score is much higher than test score
forest.fit(train_X, train_Y)
forest.score(train_X, train_Y)
forest.score(test_X, test_Y)

# fit a random forest regressor using grid search to
# select the number of trees and max depth
new_forest = RandomForestRegressor()
params_grid = [{'max_depth': [3, 5, 10, None], 'n_estimators': [5, 10, 15, 20, 50, 80]}]
grid_search = GridSearchCV(new_forest, params_grid, cv=10)
grid_search.fit(train_X, train_Y)
grid_search.score(test_X, test_Y)
grid_search.best_estimator_

# fit a boosted regression
boost = GradientBoostingRegressor()
# Look at scatter plot of OTU abundance vs. age to visualize the correlation
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel('OTU #' + otu)
ax.set_ylabel('Age')
ax.text(0.01, 0.95, r'$\rho$ = {:.2f}'.format(r), transform=ax.transAxes)

#%% 3. Build a Random Forest Regressor

## 3.1 Build the regressor
rfreg = RandomForestRegressor(n_estimators=1000, oob_score=True)

# We aren't classifying samples here, so we can just use the whole OTU table to build our regression
X = data.abun_df.values
Y = [float(data.meta_df.loc[smpl, 'BMI']) for smpl in data.abun_df.index]
rfreg = rfreg.fit(X, Y)

## 3.1.1 Look at true vs. predicted values from out-of-bag estimations
fig, ax = plt.subplots()
ax.scatter(Y, rfreg.oob_prediction_)
ax.set_xlabel('True')
ax.set_ylabel('Predicted')
ax.set_title('RF regression on BMI')

## 3.2 Look at the important features in the regression by inspecting their importance weights
feats = pd.DataFrame(index=data.abun_df.columns, columns=['importance'], data=rfreg.feature_importances_)
feats = feats.sort_values(by='importance', ascending=False)  # DataFrame.sort(columns=...) was removed from pandas
feats['normalized_importance'] = feats['importance'] / max(feats['importance'])

# Look at the top 5 features and their importance. Each row name is the OTU ID
feats.head(5)
# Initiate the monthly trade object
monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013)
# Download data from Yahoo finance
monthData.monthlyDataDownload()
# Pre-processing of training and testing data
monthData.trainFeaturePre()
# Read pre-processed data from hard drive
# monthData.trainFeaturePreHd()
# Number of training months
trainSpan = len(monthData.xTrain[:, 0, 0]) - monthData.testSpan
# Initiate a random forest regressor
clf = RandomForestRegressor(n_estimators=10)

totalReturn = 1  # running compounded return (was commented out in the original, but it is used below)
predictedReturn = np.zeros(monthData.stockNum)
monthlyReturn = np.zeros(monthData.testSpan)
aggReturn = np.zeros(monthData.testSpan + 1)
aggReturn[0] = 1

# rolling training and testing
for j in range(0, monthData.testSpan):
    for i in range(0, monthData.stockNum):
        clf.fit(monthData.xTrain[j:trainSpan + j, :, i], monthData.yTrain[j:trainSpan + j, 0, i])
        predictedReturn[i] = clf.predict(monthData.xTest[j, :, i])
    monthlyReturn[j] = monthData.por10Returns(j, predictedReturn)
    yearReturn = totalReturn * (monthlyReturn[j] + 1)
    totalReturn = yearReturn  # carry the compounded return forward (assumed intent; yearReturn was otherwise unused)
    aggReturn[j + 1] = aggReturn[j] * (1 + monthlyReturn[j])

print(monthlyReturn)
print('overall:', totalReturn)
sp.portfolioVSspy(6, 2012, 6, 2013, aggReturn[1:])
nucleus = 'N'  # make command line option; supported nuclei are ['H','N','CA','HA','CB','C']

# Generate training and test set
X_train, y_train = OrganizeData(nucleus, 'train')
X_test, y_test = OrganizeData(nucleus, 'test')

# Feature scaling
X_train_scaled = preprocessing.scale(X_train)
X_test_scaled = preprocessing.scale(X_test)

# Set the parameters for the random forest estimator
estimator = RandomForestRegressor(n_estimators=50, max_features=16, max_depth=25,
                                  min_samples_split=5, min_samples_leaf=5, random_state=0)

# Build the random forest of regression trees from the training set
estimator = estimator.fit(X_train_scaled, y_train)
print(estimator.score(X_train_scaled, y_train))
print(estimator.score(X_test_scaled, y_test))

# Predict regression target for the training set
predicted = estimator.predict(X_train_scaled)
cc = np.corrcoef(y_train, predicted)
print(cc)
print(estimator)
#my_plotting.simple_plot_overlay(y_train, predicted)

# Predict regression target for the test set
predicted = estimator.predict(X_test_scaled)
cc = np.corrcoef(y_test, predicted)
print(cc)
print(estimator)
def fit(self, X, y, sample_weight=None):
    sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight,
                                     pow_sig=self.pow_sig, pow_bg=self.pow_bg)
    target = sample_weight + self.gap
    target[y == 0] *= -1
    # Regress on the signed-weight target built above (as originally written, `target` was computed
    # but unused and the forest was fit on the raw 0/1 labels, which appears unintended).
    RandomForestRegressor.fit(self, X, target, sample_weight=sample_weight)
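# Hypothetical context sketch: the fit override above reads like a method of a class that
# subclasses sklearn's RandomForestRegressor (it calls RandomForestRegressor.fit(self, ...))
# and stores sig_weight, pow_sig, pow_bg and gap on self. Under that assumption, usage would
# look roughly like:
#     model = SignedWeightForest(n_estimators=100)  # hypothetical subclass name
#     model.fit(X, y)                               # y is a 0/1 label array; weights are normalized internally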
newtrain = pd.DataFrame(newtrain, columns=cols)
newtest = pd.DataFrame(newtest, columns=cols)
#test = test.join(pd.DataFrame(test.Date.apply(splitTime).tolist(), columns=['year', 'mon', 'day']))
#newtest = test.drop('StateHoliday', 1).join(pd.get_dummies(test['StateHoliday']).rename(columns=lambda x: 'StateHoliday' + "_" + str(x)))
#newtest = pd.merge(newtest, store, on="Store")
#newtest.drop(['Date'], axis=1, inplace=True)
#assert(np.sum(newtrain.var()==0)==0)
#
#toDrop = list(set(newtrain.columns.values) - set(newtest.columns.values))
features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date', 'LogSale', 'datetimes']]
#
rf = RandomForestRegressor(n_estimators=100)
print('Starting training...')
rf.fit(newtrain[features].fillna(-1), newtrain.LogSale)
print('Predicting train values...')
newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1))
newtrain['mypred'] = np.exp(newtrain['mypred']) - 1
train_error = rmspe(newtrain[newtrain.Sales > 0].Sales, newtrain[newtrain.Sales > 0].mypred)
print('train set error', train_error)
newtest['mypred'] = rf.predict(newtest[features].fillna(-1))
newtest['mypred'] = np.exp(newtest['mypred']) - 1
test_error = rmspe(newtest[newtest.Sales > 0].Sales, newtest[newtest.Sales > 0].mypred)
print('test set error', test_error)
train_results.append(train_error)
test_results.append(test_error)
print('mean train error', np.mean(train_results))
print('mean test error', np.mean(test_results))
all_data_valid, all_targets_valid = generate_array(hdulist_valid, feature_index, target_index)

clf_adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=50, loss='linear', random_state=0)
clf_extra_trees = ExtraTreesRegressor(n_estimators=8, random_state=0, max_depth=30)
clf_random_forest = RandomForestRegressor(n_estimators=8, random_state=0, max_depth=30)

clf_adaboost.fit(all_data_test.T, all_targets_test[0])
predicted = clf_adaboost.predict(all_data_valid.T)
clf_extra_trees.fit(all_data_test.T, all_targets_test[0])
predicted_extra = clf_extra_trees.predict(all_data_valid.T)
clf_random_forest.fit(all_data_test.T, all_targets_test[0])
predicted_forest = clf_random_forest.predict(all_data_valid.T)

delta_ada = all_targets_valid[0] - predicted
delta_extra = all_targets_valid[0] - predicted_extra
delta_forest = all_targets_valid[0] - predicted_forest

std_ada = get_standart_deviation(delta_ada)
std_extra = get_standart_deviation(delta_extra)
std_forest = get_standart_deviation(delta_forest)

plt.hist(delta_ada, bins=150, color='g', label='Adaboost ' + str(np.round(std_ada, 4)))
plt.hist(delta_extra, bins=150, color='b', label='Extra_Trees ' + str(np.round(std_extra, 4)))
plt.hist(delta_forest, bins=150, color='r', label='Random_Forest ' + str(np.round(std_forest, 4)))
title = "Compare adaboost, extra_tree and Random_Forests"
plt.title(title)
predictions3 = []
predictions4 = []

offset = int(0.7 * len(X))
for i in range(10):
    X, y = shuffle(boston.data, boston.target)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140)
    regressor2 = DecisionTreeRegressor(max_depth=6)
    regressor3 = LinearRegression()
    regressor4 = RandomForestRegressor()

    regressor.fit(X_train, y_train)
    regressor2.fit(X_train, y_train)
    regressor3.fit(X_train, y_train)
    regressor4.fit(X_train, y_train)

    y_pred = regressor.predict(x)
    y_pred2 = regressor2.predict(x)
    y_pred3 = regressor3.predict(x)
    y_pred4 = regressor4.predict(x)

    predictions.append(y_pred)
    predictions2.append(y_pred2)
    predictions3.append(y_pred3)
    predictions4.append(y_pred4)

    print("\nPrediction = " + str(y_pred))
    print("Prediction = " + str(y_pred2))
    print("Prediction = " + str(y_pred3))
    print("Prediction = " + str(y_pred4))
    print('\n')

print('Boosting max', np.max(predictions), 'min', np.min(predictions), 'variance', np.max(predictions) - np.min(predictions))
class ItemSetModel(object):
    """docstring for ItemSetModel"""
    clf = None
    MODEL_PATH = os.path.join(settings.BASE_DIR, 'set_analyzer', 'analysis', 'models')
    CACHE_FILE = os.path.join(MODEL_PATH, 'model_cache.cache')

    def __init__(self):
        super(ItemSetModel, self).__init__()
        #self.clf = DecisionTreeRegressor()
        #self.clf = Lasso(0.1)
        #self.clf = SVR(kernel='rbf')
        #self.clf = ElasticNetCV()
        self.clf = RandomForestRegressor(max_depth=7, n_estimators=10)

    def get_data_sets(self, num_matches, cache=False, **kwargs):
        """
        Data Schema:

        Input:
            1   My champion ID
            6   My Champion's class info
            6   [Other team's cumulative class info]
            7   [7 Final Items]
            5   [first 5 items purchased]
            ________________________________________
            25  features

        Output:
            Score = A(Gold/time) + B(xp/time) + C(win)
            ________________________________________
            1   Output
        """
        # Presize data
        features = 25
        num_participants = num_matches * 10
        input_data = np.zeros((num_participants, features))
        output_data = np.zeros(num_participants)

        row_num = 0
        get_champ_id = lambda x: x.champion.champion_id
        diff_team = lambda x, y: x.team_id != y.team_id
        item_purchased = lambda x: x.event_type == "ITEM_PURCHASED"

        # Iterate over every match in the database
        for match in Match.objects(**kwargs)[:num_matches]:
            # Prepare users and teams
            team_map = {}
            team_data = np.zeros((2, 6))  # Store the sum of each team's tags
            count = 0
            for tag in match.teams:
                team_map[int(tag)] = count
                count += 1

            # Prepare champion class data
            for p in match.participants.values():
                for tag in p.champion.tags:
                    team_data[team_map[p.team_id], :] += np.array(p.champion.class_data)

            # Iterate over every user in the match
            for pid, participant in match.participants.items():
                col_num = 0

                # My Champion's info
                input_data[row_num][col_num] = get_champ_id(participant)
                col_num += 1
                input_data[row_num][col_num:col_num + 6] = np.array(participant.champion.class_data)
                col_num += 6

                # Other Team's champion attributes
                if(team_map[participant.team_id] == 0):
                    input_data[row_num][col_num:col_num + 6] = team_data[1, :]
                else:
                    input_data[row_num][col_num:col_num + 6] = team_data[0, :]
                col_num += 6

                # My items
                for item_id in participant.final_build:
                    input_data[row_num][col_num] = item_id
                    col_num += 1

                # My Item purchases
                count = 0
                for item_purchase in (x for x in participant.item_events if item_purchased(x)):
                    if(count == 5):
                        break
                    input_data[row_num][col_num] = item_purchase.payload['itemId']
                    col_num += 1
                    count += 1

                # Score
                # Assume that average gold/sec is ~8
                # Assume that average kda is ~2.6
                # Have a game win worth some bonus
                score = participant.kda() * 3 + participant.gold_earned / match.duration + (4 if match.teams[str(participant.team_id)].won else 0)
                output_data[row_num] = score
                row_num += 1

        if(cache):
            print('Caching data...')
            self.cache_data((input_data, output_data))
        return (input_data, output_data)

    def cache_data(self, data):
        with open(self.CACHE_FILE, 'wb') as f:
            pickle.dump(data, f)

    def get_cached_data(self, num_rows):
        with open(self.CACHE_FILE, 'rb') as f:
            return pickle.load(f)[:num_rows]

    def train(self, X, Y, train_ratio=1, **kwargs):
        print("Training model...")
        if(train_ratio == 1):
            print("Using {} rows".format(len(X)))
            self.clf.fit(X, Y)
        else:
            n = len(X)
            tn = int(n * train_ratio)
            print("Using {} rows".format(tn))
            self.clf.fit(X[:tn, :], Y[:tn])
            print("Evaluating model...")
            evaluate_fit(self.clf, X[tn:, :], Y[tn:])

    def predict(self, X):
        return self.clf.predict(X)

    # MODEL EVALUATION
    def k_fold(self, folds, **kwargs):
        X, Y = self.get_data_sets(**kwargs)
        k_fold_evaluate(self.clf, X, Y, folds)  # fixed: the original passed an undefined lowercase `y`

    # LOAD AND SAVE
    def save(self, filename):
        dirname = os.path.join(self.MODEL_PATH, filename)
        if(not os.path.exists(dirname)):
            os.makedirs(dirname)
        else:
            # Empty folder
            for file in os.listdir(dirname):
                file_path = os.path.join(dirname, file)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
        path = os.path.join(dirname, "{}.pkl".format(filename))
        joblib.dump(self.clf, path)

    def load(self, filename):
        path = os.path.join(self.MODEL_PATH, filename, "{}.pkl".format(filename))
        self.clf = joblib.load(path)
store = store.drop("Assortment", 1).join( pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x)) ) train["StateHoliday"] = [mychange(x) for x in train.StateHoliday] test["StateHoliday"] = [mychange(x) for x in test.StateHoliday] train = train.drop("StateHoliday", 1).join( pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x)) ) test = test.drop("StateHoliday", 1).join( pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x)) ) train = pd.merge(train, store, on="Store") test = pd.merge(test, store, on="Store") repeat = 1 print("Splitting data...") for i in range(repeat): features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]] rf = RandomForestRegressor(n_estimators=100) print("Starting training...") rf.fit(train[features].fillna(-1), train.LogSale) test["mypred"] = rf.predict(test[features].fillna(-1)) test["mypred"] = np.exp(test["mypred"]) - 1 test["Sales"] = test.mypred test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
def ts_rf(n, fea, step, ntrees, njobs):
    # Random Forest Model for time series prediction
    #from sklearn import svm
    import math
    from sklearn import metrics
    import matplotlib.pyplot as plt
    from scipy.linalg import hankel
    import numpy as np
    from sklearn.ensemble import RandomForestRegressor  # sklearn.ensemble.forest is a removed private module

    # input data from csv file; use n datapoints
    #n=1100
    # # of features of training set
    ## fre=50
    # how many steps to predict
    #step=29
    #fea=50
    path = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls.txt'
    path1 = '/Users/royyang/Desktop/time_series_forecasting/csv_files/coffee_ls_nor.txt'
    result_tem = []
    date = []
    with open(path) as f:
        next(f)
        for line in f:
            item = line.replace('\n', '').split(' ')
            result_tem.append(float(item[1]))
            date.append(item[2])
    mean = np.mean(result_tem)
    sd = np.std(result_tem)
    result = (result_tem - mean) / sd

    # form hankel matrix
    X = hankel(result[0:-fea - step + 1], result[-1 - fea:-1])
    y = result[fea + step - 1:]

    # split data into training and testing
    Xtrain = X[:n]
    ytrain = y[:n]
    Xtest = X[n:]
    ytest = y[n:]

    # random forest
    rf = RandomForestRegressor(n_estimators=ntrees, n_jobs=njobs)
    rf_pred = rf.fit(Xtrain, ytrain).predict(Xtest)
    #a = rf.transform(Xtrain,'median')

    # plot results
    LABELS = [x[-6:] for x in date[n + fea + step - 1:n + fea + step - 1 + len(ytest)]]
    t = range(n, n + len(ytest))
    # plt.show()
    # plt.plot(t,y_lin1,'r--',t,ytest,'b^-')
    # plt.plot(t,y_lin2,'g--',t,ytest,'b^-')
    ypred = rf_pred * sd + mean
    ytest = ytest * sd + mean
    line1, = plt.plot(t, ypred, 'r*-')
    plt.xticks(t, LABELS)
    line2, = plt.plot(t, ytest, 'b*-')
    # plt.xlim([500,510])
    plt.legend([line1, line2], ["Predicted", "Actual"], loc=2)
    #plt.show()
    #plt.plot(xrange(n),result[0:n],'r--',t,y_lin3,'b--',t,ytest,'r--')

    y_true = ytest
    y_pred = ypred
    metrics_result = {'rf_MAE': metrics.mean_absolute_error(y_true, y_pred),
                      'rf_MSE': metrics.mean_squared_error(y_true, y_pred),
                      'rf_MAPE': np.mean(np.abs((y_true - y_pred) / y_true)) * 100}
    print(metrics_result)
def train(training, k):
    model = RandomForestRegressor(n_estimators=k, n_jobs=-1)
    model.fit(training[:, :-1], training[:, -1])
    return model
import json

household = pd.read_csv("../household_complete_one_hot.csv")
if 'KWH' in household.columns:
    del household['KWH']
X_columns = [column for column in household.columns if column != "ELEP"]
X = household.as_matrix(columns=X_columns)
y = [label[0] for label in household.as_matrix(columns=["ELEP"])]
#print(y)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)
clf = RandomForestRegressor(n_estimators=100, n_jobs=8)
clf.fit(X_train, y_train)
print(y_test[:100])
print(metrics.mean_squared_error(clf.predict(X_test), y_test))
print(metrics.r2_score(y_test, clf.predict(X_test)))
features = sorted(zip(X_columns, clf.feature_importances_), key=lambda x: x[1], reverse=True)
print("Features", features)

# fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter=',')
print('pums shape', normalized_pums.shape)
with open("../vectorized_puma_regions/puma_list.json") as f:
    puma_mapping = json.load(f)
auto = auto_mapper.fit_transform(auto_df)
store_pkl(auto_mapper, "Auto.pkl")

auto_X = auto[:, 0:7]
auto_y = auto[:, 7]
print(auto_X.dtype, auto_y.dtype)

def predict_auto(regressor):
    mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"])
    return mpg

auto_tree = DecisionTreeRegressor(random_state=13, min_samples_leaf=5)
auto_tree.fit(auto_X, auto_y)
store_pkl(auto_tree, "DecisionTreeAuto.pkl")
store_csv(predict_auto(auto_tree), "DecisionTreeAuto.csv")

auto_forest = RandomForestRegressor(random_state=13, min_samples_leaf=5)
auto_forest.fit(auto_X, auto_y)
store_pkl(auto_forest, "RandomForestAuto.pkl")
store_csv(predict_auto(auto_forest), "RandomForestAuto.csv")

auto_regression = LinearRegression()
auto_regression.fit(auto_X, auto_y)
store_pkl(auto_regression, "RegressionAuto.pkl")
store_csv(predict_auto(auto_regression), "RegressionAuto.csv")
train['LogSale'] = np.log(train.Sales + 1)
train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")
processdata(train)
processdata(test)

repeat = 1
#print('Splitting data...')
for i in range(repeat):
    features = [col for col in test.columns if col not in ['Customers', 'Sales', 'Date', 'LogSale', 'datetimes', 'Id']]
    ##!!!for submission should be test.columns!!!
    # features = ['Store', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo', 'Promo2',
    #             'Promo2SinceWeek', 'Promo2SinceYear', 'SchoolHoliday', 'DayOfWeek', 'mon', 'day', 'year', 'StoreType', 'Assortment']
    # ^^ features taken from xgb model on Kaggle
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(train[features], train.LogSale)
    # train['mypred'] = rf.predict(train[features])
    # train['mypred'] = np.expm1(train.mypred)
    # train_error = rmspe(train[train.Sales>0].Sales, train[train.Sales>0].mypred)
    # print(train_error)
    test['mypred'] = rf.predict(test[features])
    test['mypred'] = np.exp(test['mypred']) - 1
    test['Sales'] = test.mypred
    test[['Id', 'Sales']].to_csv('rand_for_kag_v4-8.csv', index=False)
def train_random_forest(X, Y):
    rf = RandomForestRegressor(n_estimators=20)
    rf.fit(X, Y)
    return rf
# In[11]:

print('Train Random Forests!')
from sklearn.ensemble import RandomForestRegressor  # sklearn.ensemble.forest is a removed private module
RF = RandomForestRegressor(n_estimators=500, random_state=0)

# In[12]:

Rows = np.random.choice(Train.index.values, 400000)
Sampled_Train = Train.ix[Rows]
Sample_Train_Target = Train_Target.ix[Rows]
# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)

# In[ ]:

print('Predict!')
Test_Predict = RF.predict(Test.as_matrix())

# In[ ]:

print(Test_Predict.shape)

# In[ ]:
clf.fit(X, y)
clf.predict(z)

#########################

from sklearn.ensemble import RandomForestRegressor  # sklearn.ensemble.forest is a removed private module

regressor = RandomForestRegressor()
parameters = [{"n_estimators": [250, 500, 1000, 2000]}]
# Returns the best configuration for a model using cross-validation
# and grid search
import time

# min_samples_split must be >= 2 in scikit-learn (the original value of 1 is rejected)
regressor = RandomForestRegressor(n_estimators=300, min_samples_split=2, max_features=67)
regressor.fit(train_np, energy)
pred = regressor.predict(test_np)
print(explained_variance_score(energy_test, pred))
print(mean_squared_error(energy_test, pred))
r2_score(energy_test, pred)

## prediction comparison
comp = pd.read_csv("H:/bee-efficiency/cisco presentation/pred.csv")
def _2011x2011_(data_path):
    ##### LOADING #####
    sys.stdout.write("Loading data... ")
    # Load data from .csv file
    with open(data_path + '_X.csv') as data_file:
        reader = csv.reader(data_file)
        # Initialize lists for data and class labels
        data = []
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            data.append([float(x) for x in row])
    with open(data_path + '_y.csv') as labels_file:
        reader = csv.reader(labels_file)
        # Initialize lists for data and class labels
        val_ind = []
        # skip header
        next(reader, None)
        # For each row of the csv file
        for row in reader:
            val_ind.append(row)
    sys.stdout.write("done\n")

    ##### TRAINING #####
    # splitting
    data_train, data_test, val_ind_train, val_ind_test \
        = skl.cross_validation.train_test_split(data, val_ind, test_size=0.4, random_state=42)
    # Cutting date / ASS / number value from labels
    date_train = [x[0] for x in val_ind_train]
    # ASS_train = [x[1] for x in val_ind_train]
    val_train = [float(x[1]) for x in val_ind_train]
    date_test = [x[0] for x in val_ind_test]
    # ASS_test = [x[1] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
    # reg = skl.tree.DecisionTreeRegressor()
    # reg = skl.linear_model.LinearRegression()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")

    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
    # df['ASS'] = ASS_test
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')
    # df = df.loc[df['ASS'] == 'CAT']  # one example
    df.info()
    df.plot()
    plt.show()
    print("MSE : " + str(mean_squared_error(val_test, val_predicted.tolist())))