def RandomForest(x_train, y_train, x_test, degree):
    """Fit a 1000-tree random forest regressor and predict ``x_test``.

    Args:
        x_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        x_test: Features to predict.
        degree: Used as ``max_depth`` for every tree.

    Returns:
        The value of ``clf.predict(x_test)``.
    """
    params = {
        'n_estimators': 1000,
        'max_depth': degree,
        # scikit-learn requires an integer min_samples_split >= 2 (or a
        # fraction in (0, 1]); the former value of 1 raises an error on
        # current releases. 2 is the smallest valid integer.
        'min_samples_split': 2,
    }
    # warm_start is not set: the estimator is created and fitted exactly
    # once here, so the flag had no effect.
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    return clf.predict(x_test)
def evalOne(parameters):
    """Leave-one-location-out evaluation of a random forest configuration.

    Args:
        parameters: Mapping that contains ``"n_estimators"`` and one of
            ``"depth"``, ``"leaf"`` or ``"max_leaf"`` (checked in that
            order), mapped to ``max_depth``, ``min_samples_leaf`` and
            ``max_leaf_nodes`` respectively.

    Returns:
        Element ``[1]`` of ``rmseEval`` over the pooled observations and
        predictions of all locations.

    Raises:
        ValueError: If none of the three supported keys is present.
    """
    # Resolve the tuning option once instead of on every iteration. The
    # tuple order keeps the priority of the original if/elif chain.
    option_to_kwarg = (
        ("depth", "max_depth"),
        ("leaf", "min_samples_leaf"),
        ("max_leaf", "max_leaf_nodes"),
    )
    for option, kwarg in option_to_kwarg:
        if option in parameters:
            model_kwargs = {kwarg: parameters[option]}
            break
    else:
        # Previously this case left `model` unbound and failed with an
        # opaque NameError at `model.fit`.
        raise ValueError(
            "parameters must contain one of 'depth', 'leaf' or 'max_leaf'")

    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        model = RandomForestRegressor(
            random_state=42,
            n_estimators=parameters["n_estimators"],
            n_jobs=-1,
            **model_kwargs)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        all_obs.extend(testY)
        all_pred.extend(prediction)
    return rmseEval(all_obs, all_pred)[1]
def eval_one(min_samples_leaf, n_estimators):
    """Cross-validate one (min_samples_leaf, n_estimators) pair over ``groups``.

    Each entry of the module-level ``groups`` is held out in turn; the
    stations of all other entries form the training set. The per-fold and
    pooled values of ``rmseEval(...)[1]`` are logged.

    Returns:
        ``rmseEval(...)[1]`` computed over the pooled held-out data.
    """
    log("min_samples_leaf: " + str(min_samples_leaf)
        + ", n_estimators: " + str(n_estimators))

    pooled_obs = []
    pooled_pred = []
    fold_count = len(groups)
    for held_out in range(fold_count):
        # Every station that is not in the held-out group is used for training.
        train_ids = {
            float(station)
            for idx in range(fold_count)
            if idx != held_out
            for station in groups[idx]
        }
        test_ids = {float(station) for station in groups[held_out]}

        trainX, testX, trainY, testY = splitDataForXValidation(
            train_ids, test_ids, "location", data, all_features, "target")

        forest = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
            n_jobs=-1)
        forest.fit(trainX, trainY)
        fold_pred = forest.predict(testX)

        fold_rmse = rmseEval(testY, fold_pred)[1]
        log("\tALL rmse: " + str(fold_rmse))

        pooled_obs.extend(testY)
        pooled_pred.extend(fold_pred)

    overall = rmseEval(pooled_obs, pooled_pred)[1]
    log("\tALL rmse:" + str(overall))
    return overall
def eval_one(step):
    """Evaluate one feature-selection mask, memoising the result.

    Args:
        step: Indexable mask aligned with the module-level ``all_features``;
            a truthy ``step[i]`` selects ``all_features[i]``. It is used as a
            key of ``cached_results``, so it must be hashable.

    Returns:
        ``rmseEval(...)[1]`` over the pooled leave-one-location-out
        predictions, or the cached value when ``step`` was seen before.
    """
    if step in cached_results:
        return cached_results[step]

    eval_features = [
        feature for i, feature in enumerate(all_features) if step[i]
    ]

    all_predictions = []
    all_observations = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, eval_features, "target")
        model = RandomForestRegressor(
            min_samples_leaf=2, random_state=42, n_estimators=650, n_jobs=-1)
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        all_observations.extend(testY)
        all_predictions.extend(predictions)

    rmse = rmseEval(all_observations, all_predictions)[1]
    cached_results[step] = rmse

    # Append the result to the on-disk cache. The context manager closes the
    # handle even if the write fails.
    step_str = ",".join(str(s) for s in step)
    with open(CACHE_FILE, "a") as cache_output:
        cache_output.write(str(rmse) + ";" + step_str + "\n")

    return rmse
def RF_ST(trainFileName, testFilename):
    """Train one random forest per store and predict that store's test rows.

    For each store key ``'1'``..``'5'`` the rows returned by
    ``ld.LoadData_DATA_ST`` are processed as follows: the first two fields
    are dropped, the rest converted to ``float``, and of those values the
    first two are skipped. On training rows the last value is the target.

    Args:
        trainFileName: Path passed to ``ld.LoadData_DATA_ST`` for training.
        testFilename: Path passed to ``ld.LoadData_DATA_ST`` for testing.

    Returns:
        A list of ``[row[0], row[1], prediction]`` entries, where the
        prediction is clipped at 0 and formatted with ``'%.4f'``.
    """
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)
    store = ['1', '2', '3', '4', '5']
    res = []
    for store_id in store:
        train_X = []
        train_y = []
        for row in trainData[store_id]:
            values = [float(x) for x in row[2:]]
            train_X.append(values[2:-1])
            train_y.append(values[-1])

        test_X = []
        items = []
        for row in testData[store_id]:
            items.append((row[0], row[1]))
            values = [float(x) for x in row[2:]]
            test_X.append(values[2:])

        # criterion='mse', max_depth=None and max_features='auto' were the
        # estimator defaults. The two string values are rejected by current
        # scikit-learn releases, so the defaults are relied on instead.
        clf = RandomForestRegressor(n_estimators=100).fit(train_X, train_y)
        pred_y = clf.predict(test_X)

        # A separate loop variable avoids shadowing the store key.
        for (first, second), pred in zip(items, pred_y):
            res.append([first, second, '%.4f' % max(pred, 0)])
    return res
def RF_ST(trainFileName, testFilename):
    """Train one random forest per store and predict that store's test rows.

    For each store key ``'1'``..``'5'`` the rows returned by
    ``ld.LoadData_DATA_ST`` are processed as follows: the first two fields
    are dropped, the rest converted to ``float``, and of those values the
    first two are skipped. On training rows the last value is the target.

    Args:
        trainFileName: Path passed to ``ld.LoadData_DATA_ST`` for training.
        testFilename: Path passed to ``ld.LoadData_DATA_ST`` for testing.

    Returns:
        A list of ``[row[0], row[1], prediction]`` entries, where the
        prediction is clipped at 0 and formatted with ``'%.4f'``.
    """
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)
    store = ['1', '2', '3', '4', '5']
    res = []
    for store_id in store:
        train_X = []
        train_y = []
        for row in trainData[store_id]:
            values = [float(x) for x in row[2:]]
            train_X.append(values[2:-1])
            train_y.append(values[-1])

        test_X = []
        items = []
        for row in testData[store_id]:
            items.append((row[0], row[1]))
            values = [float(x) for x in row[2:]]
            test_X.append(values[2:])

        # criterion='mse', max_depth=None and max_features='auto' were the
        # estimator defaults. The two string values are rejected by current
        # scikit-learn releases, so the defaults are relied on instead.
        clf = RandomForestRegressor(n_estimators=100).fit(train_X, train_y)
        pred_y = clf.predict(test_X)

        # A separate loop variable avoids shadowing the store key.
        for (first, second), pred in zip(items, pred_y):
            res.append([first, second, '%.4f' % max(pred, 0)])
    return res
def randomforestregressor(self, testlen, ntrain, ntrees, nodes):
    """Walk-forward random forest prediction of ``closeratio``.

    The distinct values of ``self.hsmadata['date']`` are sorted and cut into
    windows of ``testlen`` dates. For each test window ``i`` the model is
    trained on the preceding ``ntrain`` windows, stopping ``self.day`` dates
    before the test window starts, and then predicts the test window.

    Args:
        testlen: Number of distinct dates per window.
        ntrain: Number of windows used for training.
        ntrees: ``n_estimators`` of the forest.
        nodes: ``min_samples_leaf``; ``min_samples_split`` is ``nodes * 2``.

    Returns:
        A DataFrame concatenating every test window with an added
        ``predratio`` column. It is empty when no window can be evaluated.
    """
    hsmadata = self.hsmadata
    dates = pd.Series(hsmadata['date'].unique()).sort_values()
    dates.index = range(0, len(dates))
    ntest = len(dates) // testlen

    folds = []
    for i in range(ntrain, ntest):
        traindata = hsmadata[
            (hsmadata['date'] >= dates[(i - ntrain) * testlen])
            & (hsmadata['date'] < dates[i * testlen - self.day])].copy()
        testdata = hsmadata[
            (hsmadata['date'] >= dates[i * testlen])
            & (hsmadata['date'] < dates[(i + 1) * testlen])].copy()

        # The first two columns are not used as features.
        traindata = traindata.iloc[:, 2:]
        # axis is passed by keyword: the positional form was removed in
        # pandas 2.0.
        traindatax = traindata.drop(['closeratio'], axis=1)
        traindatay = traindata['closeratio']
        testdatax = testdata[traindatax.columns]

        treemodel = RandomForestRegressor(
            n_estimators=ntrees,
            min_samples_split=nodes * 2,
            min_samples_leaf=nodes)
        treemodel.fit(traindatax, traindatay)
        testdata['predratio'] = treemodel.predict(testdatax)
        folds.append(testdata)

    # Concatenate once at the end rather than growing a frame per iteration.
    if not folds:
        return pd.DataFrame()
    return pd.concat(folds, ignore_index=True)
def post(self):
    """Handle an uploaded audio file and render the prediction page.

    The upload is stored under ``__UPLOADS__`` with a random UUID name, its
    attributes are extracted with ``getAttributes``, a random forest trained
    on ``mongoTolist(False)`` predicts a value for them, and
    ``octave.classify`` is called with the attributes plus that value.
    """
    # Store the uploaded audio file on the server.
    voice = self.request.files["audio"][0]
    extn = os.path.splitext(voice['filename'])[1]
    cname = str(uuid.uuid4()) + extn
    # Binary mode writes the upload unmodified; text mode fails on Python 3
    # when the body is bytes. The context manager closes the handle even if
    # the write raises.
    with open(__UPLOADS__ + cname, 'wb') as fh:
        fh.write(voice['body'])

    # Extract the features from the audio file.
    attr = getAttributes(cname)
    fdf = mongoTolist(False)
    train = fdf[:, :-1]  # every column except the last
    target = fdf[:, -1]  # the last column is the regression target

    # NOTE(review): the forest is retrained on every request; consider
    # caching the fitted model if this handler is called often.
    rf = RandomForestRegressor(n_estimators=506, n_jobs=-1)
    rf.fit(train, target)
    updrs_val = rf.predict([attr])
    attr.append(updrs_val[0])

    # Read the theta parameters from the database.
    theta = list(db.theta.find({}))
    theta1 = theta[0]["theta1"]
    theta2 = theta[1]["theta2"]

    # Classify the augmented attribute vector.
    isParkinson = octave.classify(theta1, theta2, np.array(attr))
    self.render("output.html", ipk=isParkinson, updrs=updrs_val[0])
class RandomForestRegressorImpl():
    """Wrapper that records hyperparameters and builds ``SKLModel`` on ``fit``."""

    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False):
        """Store every constructor argument under its own name."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
        )

    def fit(self, X, y=None):
        """Create a fresh ``SKLModel`` from the stored settings and fit it.

        ``y`` is forwarded only when it is not ``None``. Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the predictions of the model built by the last ``fit``."""
        model = self._sklearn_model
        return model.predict(X)
def calcRandomForest(channels_training, channels_testing, target_training, target_testing):
    """Fit a 500-tree forest that considers every channel at each split.

    Returns:
        A tuple ``(fitted_model, [predictions, target_testing])``.
    """
    # max_features is set to the length of the first training row.
    channel_count = len(channels_training[0])
    forest = RandomForestRegressor(n_estimators=500, max_features=channel_count)
    forest = forest.fit(channels_training, target_training)
    comparison = [forest.predict(channels_testing), target_testing]
    return forest, comparison
def randomForest(trainFeatures, trainResponses, testFeatures, maxFeatures = 'log2', nTree=100):
    """Train a random forest regressor and return its test predictions.

    Args:
        trainFeatures: Training features passed to ``fit``.
        trainResponses: Training responses passed to ``fit``.
        testFeatures: Features to predict.
        maxFeatures: Forwarded as ``max_features``.
        nTree: Forwarded as ``n_estimators``.
    """
    forest = RandomForestRegressor(max_features=maxFeatures, n_estimators=nTree)
    forest.fit(trainFeatures, trainResponses)
    return forest.predict(testFeatures)
def evalTrainStationTestStation(trainStation, testStation, features):
    """Train on one station's data, evaluate on another's and print the RMSE.

    Returns:
        ``rmseEval(...)[1]`` for the test station's targets and predictions.
    """
    # Training split: only the training station is placed in the train set.
    fit_X, _, fit_Y, _ = splitDataForXValidation(
        {trainStation}, set(), "location",
        dataByStation[trainStation], features, "target")
    # Evaluation split: only the test station is placed in the test set.
    _, eval_X, _, eval_Y = splitDataForXValidation(
        set(), {testStation}, "location",
        dataByStation[testStation], features, "target")

    forest = RandomForestRegressor(
        n_estimators=60, max_depth=10, random_state=42, n_jobs=-1)
    forest.fit(fit_X, fit_Y)
    score = rmseEval(eval_Y, forest.predict(eval_X))[1]

    print("Training on station {}, applying on station {}: rmse: {}".format(
        str(trainStation), str(testStation), str(score)))
    return score
def RF_ALL(trainFileName, testFileName):
    """Train one random forest on all training data and predict every item.

    Args:
        trainFileName: Path passed to ``ld.LoadData_DATA_LABEL_ITEM``.
        testFileName: Path passed to ``ld.LoadData_DATA_ITEM``.

    Returns:
        A list of ``[item, 'all', prediction]`` entries, where the prediction
        is clipped at 0 and formatted with ``'%.4f'``.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    # criterion='mse', max_depth=None and max_features='auto' were the
    # estimator defaults. The two string values are rejected by current
    # scikit-learn releases, so the defaults are relied on instead.
    clf = RandomForestRegressor(n_estimators=100, bootstrap=True).fit(
        train_X, train_y)
    pred_y = clf.predict(Eval_X)
    return [
        [item, 'all', '%.4f' % max(pred, 0)]
        for item, pred in zip(items, pred_y)
    ]
def RF_ALL(trainFileName, testFileName):
    """Train one random forest on all training data and predict every item.

    Args:
        trainFileName: Path passed to ``ld.LoadData_DATA_LABEL_ITEM``.
        testFileName: Path passed to ``ld.LoadData_DATA_ITEM``.

    Returns:
        A list of ``[item, 'all', prediction]`` entries, where the prediction
        is clipped at 0 and formatted with ``'%.4f'``.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)
    # criterion='mse', max_depth=None and max_features='auto' were the
    # estimator defaults. The two string values are rejected by current
    # scikit-learn releases, so the defaults are relied on instead.
    clf = RandomForestRegressor(n_estimators=100, bootstrap=True).fit(
        train_X, train_y)
    pred_y = clf.predict(Eval_X)
    return [
        [item, 'all', '%.4f' % max(pred, 0)]
        for item, pred in zip(items, pred_y)
    ]
def ML(features, targets, fig_num):
    """Compare a random forest on raw and on standardised features.

    Splits the data 80/20, fits the same estimator first on the raw and then
    on the standardised training features, prints R-squared, Spearman and
    Pearson scores for both variants, and scatter-plots predicted against
    actual values on figure ``fig_num``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, train_size=0.8, random_state=42)

    # The scaler is fitted on the training split only.
    scaler = StandardScaler().fit(X_train)

    def _as_scaled_frame(frame):
        # Re-wrap the array returned by the scaler with the original labels.
        return pd.DataFrame(scaler.transform(frame),
                            index=frame.index.values,
                            columns=frame.columns.values)

    X_train_scaled = _as_scaled_frame(X_train)
    X_test_scaled = _as_scaled_frame(X_test)

    forest = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)

    forest.fit(X_train, y_train)
    raw_pred = forest.predict(X_test)

    forest.fit(X_train_scaled, y_train)
    scaled_pred = forest.predict(X_test_scaled)

    r2_raw = r2_score(y_test, raw_pred)
    r2_scaled = r2_score(y_test, scaled_pred)
    rho_raw = spearmanr(y_test, raw_pred)
    rho_scaled = spearmanr(y_test, scaled_pred)
    r_raw = pearsonr(y_test, raw_pred)
    r_scaled = pearsonr(y_test, scaled_pred)

    report = (r2_raw, r2_scaled,
              rho_raw[0], rho_scaled[0],
              r_raw[0], r_scaled[0])
    print(
        "R-squared: %1.4f, Scaled R-squared: %1.4f, \n Spearman: %1.4f, Scaled Spearman: %1.4f \n Pearson: %1.4f, Scaled Pearson: %1.4f"
        % report)

    plt.figure(fig_num)
    plt.scatter(y_test, raw_pred, label='unscaled')
    plt.scatter(y_test, scaled_pred, label='scaled')
    plt.legend()
def RandomForest(x_train, y_train, x_test, y_test):
    """Try several tree depths and return the best-scoring prediction.

    A 1000-tree forest is fitted for each depth in ``[1, 2, 3, 4, 7]`` on
    ``x_train[:, np.newaxis]`` and scored with ``rmse`` on the test data.

    Returns:
        A tuple ``(best_rmse, [y_predict, depth])`` for the depth with the
        lowest RMSE. On ties the later depth wins.
    """
    degree = [1, 2, 3, 4, 7]
    best_rmse = None
    best_result = None
    for d in degree:
        params = {
            'n_estimators': 1000,
            'max_depth': d,
            # scikit-learn requires an integer min_samples_split >= 2; the
            # former value of 1 raises an error on current releases.
            'min_samples_split': 2,
        }
        # warm_start is not set: each estimator is fitted exactly once.
        clf = RandomForestRegressor(**params)
        clf.fit(x_train[:, np.newaxis], y_train)
        y_predict = clf.predict(x_test[:, np.newaxis])
        rmsevalue = rmse(y_test, y_predict)
        # `<=` preserves the tie-breaking of the former rmse-keyed dict,
        # where a later depth overwrote an earlier one with the same score.
        if best_rmse is None or rmsevalue <= best_rmse:
            best_rmse = rmsevalue
            best_result = [y_predict, d]
    # The first element used to be the RMSE of the last depth tried, which
    # did not match the returned prediction.
    return best_rmse, best_result
def RandomForest(weiboid, x_train, y_train, x_test, y_test, d):
    """Fit a 1000-tree forest of depth ``d`` and score it on the test data.

    Args:
        weiboid: Not used by this function.
        x_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        x_test: Features to predict.
        y_test: Targets that the prediction is scored against.
        d: Used as ``max_depth`` for every tree.

    Returns:
        A tuple ``(y_predict, rmse(y_test, y_predict))``.
    """
    params = {
        'n_estimators': 1000,
        'max_depth': d,
        # scikit-learn requires an integer min_samples_split >= 2; the
        # former value of 1 raises an error on current releases.
        'min_samples_split': 2,
        'oob_score': True,
    }
    # warm_start is not set: the estimator is fitted exactly once.
    clf = RandomForestRegressor(**params)
    clf.fit(x_train, y_train)
    y_predict = clf.predict(x_test)
    r = rmse(y_test, y_predict)
    return y_predict, r
def train_model(X_train, y_train):
    """Fit a 500-tree, depth-5 random forest and print its training errors.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    print("training the model ...")
    rf = RandomForestRegressor(n_estimators=500, max_depth=5, n_jobs=-1, verbose=2)
    rf.fit(X_train, y_train)
    y_pred_train = rf.predict(X_train)
    # The line is labelled RMSE, so take the square root of the mean squared
    # error; the bare MSE was printed before.
    train_rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
    print(".. training RMSE : {:0.3f} %".format(train_rmse * 100))
    print(".. training MAE : {:0.3f} %".format(
        mean_absolute_error(y_train, y_pred_train) * 100))
    return rf
def doPrediction(locations, data, columns, features, columns2, outputFileName):
    """Run leave-one-location-out prediction and write the results as CSV.

    Args:
        locations: Locations to hold out, one at a time.
        data: Dataset passed to ``splitDataForXValidation``.
        columns: Passed to ``splitDataForXValidation``.
        features: Feature names passed to ``splitDataForXValidation``.
        columns2: Output column names. ``'prediction'`` is filled with the
            model output; every other name is read from the fifth value
            returned by ``splitDataForXValidation``. The overall RMSE needs
            both ``'target'`` and ``'prediction'`` to be present.
        outputFileName: Path of the CSV file to write.
    """
    predictionData = {c: [] for c in columns2}

    for location in locations:
        trainX, testX, trainY, testY, dataY = splitDataForXValidation(
            location, "location", data, features, columns, "target")
        print("\tT+W #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
        model = RandomForestRegressor(
            min_samples_leaf=2, n_estimators=650, n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))

        for c in columns2:
            if c == 'prediction':
                predictionData[c].extend(prediction)
            else:
                predictionData[c].extend(dataY[c])

    for c in predictionData:
        print("\t" + c + " -> #" + str(len(predictionData[c])))

    rmse = rmseEval(predictionData['target'], predictionData['prediction'])[1]
    print("overall RMSE: " + str(rmse))

    print("Writing out results...")
    # The context manager closes the file even if a write fails.
    with open(outputFileName, 'w') as output:
        output.write(','.join([str(x) for x in columns2]))
        output.write("\n")
        for i in range(len(predictionData['target'])):
            output.write(','.join(str(predictionData[c][i]) for c in columns2))
            output.write("\n")
    print("Done...")
def perform_random_forest_regressor(train_set, train_target, test_set, predictors, estimators=10, depth=None, splits=2):
    """Fit a random forest on the predictor columns and predict the test set.

    Args:
        train_set: Training data, indexable by ``predictors``.
        train_target: Training targets.
        test_set: Test data, indexable by ``predictors``.
        predictors: Column selection applied to both data sets.
        estimators: Forwarded as ``n_estimators``.
        depth: Forwarded as ``max_depth``.
        splits: Forwarded as ``min_samples_split``.

    Returns:
        The predictions for ``test_set[predictors]``.
    """
    # The three tuning arguments were accepted but never used before; they
    # are now forwarded to the estimator.
    alg = RandomForestRegressor(
        n_estimators=estimators,
        max_depth=depth,
        min_samples_split=splits,
        random_state=1)
    alg.fit(train_set[predictors], train_target)
    return alg.predict(test_set[predictors])
def main(train_file='train.csv', test_file='test.csv', output_file='predict.csv'):
    """Train a random forest on the access data and write test predictions.

    Prints the feature importances and the AUC of ten random 90/10 splits,
    then passes the test-set predictions to ``create_test_submission``.

    Args:
        train_file: CSV file with the ``ACTION`` target and feature columns.
        test_file: CSV file with the same feature columns.
        output_file: Path passed to ``create_test_submission``.

    Returns:
        ``0``.
    """
    print("Loading data...")
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    # NOTE(review): "ROLE_DEPTNAME" is listed twice, as in the original
    # selection; it is kept so the model input is unchanged.
    feature_columns = ["RESOURCE", "MGR_ID", "ROLE_ROLLUP_1", "ROLE_ROLLUP_2",
                       "ROLE_DEPTNAME", "ROLE_FAMILY_DESC", "ROLE_FAMILY",
                       "ROLE_DEPTNAME", "ROLE_CODE"]
    # A 1-D target avoids the column-vector warning from scikit-learn.
    y = np.array(train_data["ACTION"])
    X = np.array(train_data[feature_columns])
    X_test = np.array(test_data[feature_columns])

    SEED = 4

    # min_density and compute_importances are no longer accepted by
    # RandomForestRegressor; importances are always computed.
    clf = RandomForestRegressor(n_estimators=300, min_samples_split=15).fit(X, y)
    print(clf.feature_importances_)

    mean_auc = 0.0
    n = 10
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.10, random_state=i * SEED)

        # Train on this split and predict the held-out part.
        clf.fit(X_train, y_train)
        preds = clf.predict(X_cv)

        # AUC for this split.
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc / n))

    # NOTE(review): at this point clf is the model fitted on the last 90%
    # split, not on the full training set — confirm this is intended.
    # `predict_` is not an estimator method; `predict` is.
    predictions = clf.predict(X_test)
    create_test_submission(output_file, predictions)
    return 0
def rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014, data2013, data2014):
    """Train on the later weeks of 2013 and evaluate on all of 2014.

    Only 2013 rows whose week category, looked up by timestamp in
    ``timestampWeekCategory``, is at least ``week`` are used for training.
    ``stationNames``, ``ospmData2013`` and ``ospmData2014`` are not used.

    Returns:
        The full result of ``rmseEval`` for the 2014 targets and predictions.
    """
    # Every key of data2013 except the three bookkeeping ones is a feature.
    columns = list(data2013)
    for reserved in ("location", "timestamp", "target"):
        columns.remove(reserved)

    train_rows = []
    train_targets = []
    for idx in range(len(data2013["target"])):
        stamp = str(int(data2013["timestamp"][idx]))
        if int(timestampWeekCategory[stamp]) >= week:
            train_targets.append(data2013["target"][idx])
            train_rows.append([data2013[name][idx] for name in columns])

    forest = RandomForestRegressor(
        n_estimators=59, min_samples_leaf=9, random_state=42, n_jobs=-1)
    forest.fit(train_rows, train_targets)

    eval_rows = []
    eval_targets = []
    for idx in range(len(data2014["target"])):
        eval_targets.append(data2014["target"][idx])
        eval_rows.append([data2014[name][idx] for name in columns])

    return rmseEval(eval_targets, forest.predict(eval_rows))
def eval_one(features):
    """Log the pooled leave-one-location-out RMSE for a feature list.

    A forest is trained once per entry of the module-level ``locations``;
    ``rmseEval(...)[1]`` of the pooled test data is logged. Nothing is
    returned.
    """
    pooled_pred = []
    pooled_obs = []
    for held_out in locations:
        split = splitDataForXValidation(
            held_out, "location", data, features, "target")
        trainX, testX, trainY, testY = split

        forest = RandomForestRegressor(
            n_estimators=650, min_samples_leaf=2, n_jobs=-1, random_state=42)
        forest.fit(trainX, trainY)
        fold_pred = forest.predict(testX)

        pooled_obs.extend(testY)
        pooled_pred.extend(fold_pred)

    score = rmseEval(pooled_obs, pooled_pred)[1]
    log("\tRMSE: " + str(score))
def predict_per_cpu_full():
    """Fit and plot one small random forest per ``cpuFull`` category.

    The data from ``load_data`` / ``normalize_data`` is grouped by
    ``cpuFull``. Each group is split into train and test parts, a 5-tree
    forest is fitted on column ``C0``, the ``mape`` score is printed, and a
    scatter plot of actual against predicted values is saved to
    ``imgs/<category>.png``.
    """
    data, target = load_data()
    data, target, labels = normalize_data(data, target)
    # Copy so that adding the target column does not write into a slice of
    # the original frame.
    data = data[['C0', 'cpuFull']].copy()
    data['target'] = target

    split_by_types = dict()
    for name, group in data.groupby('cpuFull'):
        # Series.reshape no longer exists in pandas; reshape the underlying
        # array into a single-feature column instead.
        X_train, X_test, y_train, y_test = train_test_split(
            group['C0'].values.reshape(-1, 1), group['target'])
        split_by_types[str(name)] = {
            'train': {'data': X_train, 'target': y_train},
            'test': {'data': X_test, 'target': y_test},
        }

    # .items() works on both Python 2 and Python 3.
    for cpu, data_set in split_by_types.items():
        plt.figure()
        reg = RandomForestRegressor(n_estimators=5)
        reg.fit(data_set['train']['data'], data_set['train']['target'])
        test_data = data_set['test']['data']
        y_pred = reg.predict(test_data)
        # Same "value category" output as the former print statement.
        print("{} {}".format(mape(data_set['test']['target'], y_pred), cpu))
        plt.scatter(test_data, data_set['test']['target'], s=3, color='g',
                    label='actual')
        plt.scatter(test_data, y_pred, s=3, color='r', label='predicted')
        plt.legend(loc='upper left')
        plt.ylabel('mul time')
        plt.title('Category: {}'.format(cpu))
        plt.savefig('imgs/{}.png'.format(cpu))
        # Release the figure once saved so that figures do not accumulate
        # across categories.
        plt.close()
def evaluateFeatures(vector, features, data):
    """Return the mean per-location RMSE for a 0/1 feature-selection vector.

    Args:
        vector: Sequence aligned with ``features``; entries equal to ``1``
            select the corresponding feature.
        features: Candidate feature names.
        data: Dataset passed to ``splitDataForXValidation2``.

    Returns:
        The arithmetic mean of ``rmseEval(...)[1]`` over the module-level
        ``locationValues``.
    """
    selected = [features[pos] for pos in range(len(vector)) if vector[pos] == 1]

    fold_scores = []
    for held_out in locationValues:
        trainX, testX, trainY, testY = splitDataForXValidation2(
            held_out, "location", data, selected, "target")
        forest = RandomForestRegressor(
            n_estimators=59, min_samples_leaf=9, random_state=42, n_jobs=-1)
        forest.fit(trainX, trainY)
        fold_scores.append(rmseEval(testY, forest.predict(testX))[1])

    # Plain left-to-right accumulation, then divide by the fold count.
    total = 0.0
    for score in fold_scores:
        total = total + score
    return total / len(fold_scores)
def learn_rfr(str_json):
    """Train a random forest on the building data as described by a JSON spec.

    The first 70% of the CSV rows are used for training and the remainder
    for testing. Predicted and actual test values are written to two CSV
    files next to the input.

    Args:
        str_json: JSON object with the keys ``n_estimators``, ``criterion``,
            ``random_state``, ``predict_col`` (positional index of the target
            column) and ``features`` (feature column labels).
    """
    param = json.loads(str_json)
    m_n_estimators = param["n_estimators"]
    m_criterion = param["criterion"]
    m_random_state = param["random_state"]
    predict_col = param["predict_col"]
    features = param["features"]

    print(m_n_estimators)
    print(m_criterion)
    print(m_random_state)
    for feature in features:
        print(feature)

    df = pd.read_csv('/home/kaka/Data/building/building.csv', header=0)
    df_ratio = int(len(df) * 0.7)
    df_train = df.iloc[:df_ratio]
    df_test = df.iloc[df_ratio:]

    actual_data = np.array(df_test.iloc[:, predict_col]).astype(float)
    # Targets and features are cast to float rather than str: the regressor
    # needs numeric input, and string arrays are rejected by current
    # scikit-learn releases.
    sample_data = np.array(df_train.iloc[:, predict_col]).astype(float)
    set_train = df_train.loc[:, list(features)].astype(float)
    set_test = df_test.loc[:, list(features)].astype(float)

    # Use the hyperparameters from the request. They were parsed and printed
    # before, but fixed values (3, 'mse', 0) were passed to the estimator.
    model_rf = RandomForestRegressor(n_estimators=m_n_estimators,
                                     criterion=m_criterion,
                                     random_state=m_random_state)
    model_rf = model_rf.fit(set_train, sample_data)
    predicted_data = model_rf.predict(set_test).astype(float)

    np.savetxt("/home/kaka/Data/building/building_predicted.csv",
               predicted_data, delimiter=",")
    np.savetxt("/home/kaka/Data/building/building_actual.csv",
               actual_data, delimiter=",")
X_test, y_test = OrganizeData(nucleus, 'test')

# Feature scaling. The scaler is fitted on the training data only and then
# applied to both sets, so the test features are expressed in the units the
# model was trained on. Scaling each set with its own mean and standard
# deviation, as before, makes the two sets inconsistent.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Set the parameters for the random forest estimator.
estimator = RandomForestRegressor(n_estimators=50, max_features=16,
                                  max_depth=25, min_samples_split=5,
                                  min_samples_leaf=5, random_state=0)

# Build the random forest of regression trees from the training set.
estimator = estimator.fit(X_train_scaled, y_train)
print(estimator.score(X_train_scaled, y_train))
print(estimator.score(X_test_scaled, y_test))

# Correlation between targets and predictions on the training set.
predicted = estimator.predict(X_train_scaled)
cc = np.corrcoef(y_train, predicted)
print(cc)
print(estimator)
#my_plotting.simple_plot_overlay(y_train,predicted)

# Correlation between targets and predictions on the test set.
predicted = estimator.predict(X_test_scaled)
cc = np.corrcoef(y_test, predicted)
print(cc)
print(estimator)
#my_plotting.simple_plot_overlay(y_test,predicted)
# score = cross_val_score(estimator, X_train, y_train)
# print score
# The model is trained on log(Sales + 1); predictions are mapped back with
# exp(x) - 1 before they are written out.
train['LogSale'] = np.log(train['Sales'] + 1)
train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")
processdata(train)
processdata(test)

repeat = 1
for i in range(repeat):
    # Features are taken from the test columns, minus targets and
    # identifier / date columns.
    features = [
        col for col in test.columns
        if col not in ('Customers', 'Sales', 'Date', 'LogSale', 'datetimes', 'Id')
    ]
    rf = RandomForestRegressor(n_estimators=100)
    print('Starting training...')
    rf.fit(train[features], train['LogSale'])

    test['mypred'] = rf.predict(test[features])
    test['mypred'] = np.exp(test['mypred']) - 1
    test['Sales'] = test['mypred']
    test[['Id', 'Sales']].to_csv('rand_for_kag_v4-8.csv', index=False)
def compute_metrics_with_RandomForest(latents, factors, err_fn=nrmse,
                                      params={
                                          "n_estimators": 10,
                                          "max_depth": 8
                                      },
                                      cont_mask=None):
    """
    Compute disentanglement, completeness and informativeness scores from
    the feature importances of one random forest regressor per factor.

    :param latents: (N, z_dim). They use E_q(z|x)[z]
    :param factors: (N, K)
    :param err_fn: Error function, called as err_fn(prediction, target)
    :param params: Keyword arguments for RandomForestRegressor (read only,
        never mutated here)
    :param cont_mask: Optional sequence of K flags; only factors whose flag
        is truthy are processed. None or empty means all factors.
    :return: dict with 'importance_matrix', 'disentanglement_scores',
        'disentanglement', 'completeness_scores', 'completeness',
        'train_errors' and 'train_avg_error'
    :raises ValueError: if cont_mask selects no factor
    """
    assert len(latents.shape) == len(factors.shape) == 2, \
        "'latents' and 'factors' must be 2D arrays!"
    assert len(latents) == len(
        factors), "'latents' and 'factors' must have the same length!"

    num_factors = factors.shape[1]
    R = []
    train_errors = []

    # An explicit None / length test also works for numpy arrays, whose
    # truth value is ambiguous.
    if cont_mask is None or len(cont_mask) == 0:
        cont_mask = [True] * num_factors
    else:
        assert len(cont_mask) == num_factors, "len(cont_mask)={}".format(
            len(cont_mask))

    print(
        "Training Random Forest regressor for {} factors!".format(num_factors))

    for k in tqdm(range(num_factors)):
        # Test the flag of this factor. Testing the whole mask, as before,
        # was always true and so processed every factor.
        if cont_mask[k]:
            print("Factor {} is continuous. Process it!".format(k))
            # (N, )
            factors_k = factors[:, k]

            model = RandomForestRegressor(**params)
            model.fit(latents, factors_k)

            # (N, )
            factors_k_pred = model.predict(latents)

            # Scalar
            train_errors.append(err_fn(factors_k_pred, factors_k))

            # Feature importances of the forest as a (num_latents, 1) column
            R.append(np.abs(model.feature_importances_[:, None]))
        else:
            print("Factor {} is not continuous. Do not process it!".format(k))

    if not R:
        raise ValueError(
            "cont_mask selects no factor; there is nothing to score")

    # (num_latents, number of processed factors)
    R = np.concatenate(R, axis=1)
    # np.cast is not callable with (values, dtype=...); count the truthy
    # flags directly.
    num_cont = sum(1 for flag in cont_mask if flag)
    assert R.shape[1] == num_cont, \
        "R.shape={} while #cont={}".format(R.shape[1], num_cont)

    # Disentanglement: (num_latents,)
    disentanglement_scores = entropic_scores(R.T)
    # Relative importance of each code variable
    c_rel_importance = np.sum(R, axis=1) / np.sum(R)
    assert 1 - 1e-4 < np.sum(c_rel_importance) < 1 + 1e-4, \
        "c_rel_importance: {}".format(c_rel_importance)
    disentanglement = np.sum(disentanglement_scores * c_rel_importance)

    # Completeness
    completeness_scores = entropic_scores(R)
    completeness = np.mean(completeness_scores)

    # Informativeness
    train_avg_error = np.mean(train_errors)

    results = {
        'importance_matrix': R,
        'disentanglement_scores': disentanglement_scores,
        'disentanglement': disentanglement,
        'completeness_scores': completeness_scores,
        'completeness': completeness,
        'train_errors': train_errors,
        'train_avg_error': train_avg_error,
    }
    return results
# Synthetic smoke test for the previously defined `clf`.
n_samples, n_features = 100, 5
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)
z = np.random.randn(20, 5)
z1 = np.random.randn(20)
clf.fit(X, y)
clf.predict(z)

#########################
# Import from the public package: the private module path
# sklearn.ensemble.forest does not exist in current scikit-learn.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
parameters = [{"n_estimators": [250, 500, 1000, 2000]}]

# Returns the best configuration for a model using cross-validation
# and grid search
import time

# min_samples_split must be an integer >= 2; the former value of 1 raises
# an error on current scikit-learn.
regressor = RandomForestRegressor(n_estimators=300, min_samples_split=2,
                                  max_features=67)
regressor.fit(train_np, energy)
pred = regressor.predict(test_np)
print(explained_variance_score(energy_test, pred))
print(mean_squared_error(energy_test, pred))
# The R^2 score used to be computed and discarded; print it with the other
# two metrics.
print(r2_score(energy_test, pred))

##prediction comparison
comp = pd.read_csv("H:/bee-efficiency/cisco presentation/pred.csv")
clf.predict(z)

#########################
# Import from the public package: the private module path
# sklearn.ensemble.forest does not exist in current scikit-learn.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
parameters = [{"n_estimators": [250, 500, 1000, 2000]}]

# Returns the best configuration for a model using cross-validation
# and grid search
import time

# min_samples_split must be an integer >= 2; the former value of 1 raises
# an error on current scikit-learn.
regressor = RandomForestRegressor(n_estimators=300, min_samples_split=2,
                                  max_features=67)
regressor.fit(train_np, energy)
pred = regressor.predict(test_np)
print(explained_variance_score(energy_test, pred))
print(mean_squared_error(energy_test, pred))
# The R^2 score used to be computed and discarded; print it with the other
# two metrics.
print(r2_score(energy_test, pred))

##prediction comparison
comp = pd.read_csv("H:/bee-efficiency/cisco presentation/pred.csv")
#del household['ST']
#del household['DIVISION']
#del household['ELEP']
#if 'CDD' in household.columns:
#    del household['CDD']
#    del household['HDD']

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array.
X = household.values
X = np.nan_to_num(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)
clf = RandomForestRegressor(n_estimators = 10, n_jobs = 8)
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics and the preview slice.
test_predictions = clf.predict(X_test)
print(metrics.mean_squared_error(y_test, test_predictions))
print(metrics.r2_score(y_test, test_predictions))
predictions = test_predictions[:50]

'''
features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)
'''

pums = pd.read_csv("../joined_weather.csv")
pums = pums.sample(1000)
# A column-list selection keeps the result two-dimensional, as
# as_matrix(columns=['PUMA']) did.
pums_puma_vector = pums[['PUMA']].values
left_matrix = pums[['PUMA', 'WGTP', 'SERIALNO']]
del pums['PUMA']
del pums['WGTP']
del pums['SERIALNO']
def _2011x2011_ (data_path):
    """Train a random forest on ``<data_path>_X.csv`` / ``<data_path>_y.csv``.

    Rows are split 60/40 into train and test parts. The regressor predicts
    the second column of the label file, and the first column is parsed as
    the date index of the comparison plot. Prints the test MSE.

    Args:
        data_path: Common path prefix of the two CSV files.
    """
    ##### LOADING #####
    sys.stdout.write("Loading data... ")

    # Feature rows: skip the header, convert every field to float.
    with open(data_path + '_X.csv') as data_file:
        reader = csv.reader(data_file)
        next(reader, None)
        data = [[float(x) for x in row] for row in reader]

    # Label rows: skip the header, keep the raw fields.
    with open(data_path + '_y.csv') as labels_file:
        reader = csv.reader(labels_file)
        next(reader, None)
        val_ind = [row for row in reader]

    sys.stdout.write("done\n")

    ##### TRAINING #####
    data_train, data_test, val_ind_train, val_ind_test \
        = skl.cross_validation.train_test_split(data, val_ind, test_size=0.4, random_state=42)

    # Split the label rows into the date (field 0) and the value (field 1).
    date_train = [x[0] for x in val_ind_train]
    val_train = [float(x[1]) for x in val_ind_train]

    date_test = [x[0] for x in val_ind_test]
    val_test = [float(x[1]) for x in val_ind_test]

    sys.stdout.write("Training regressor... ")
    reg = RandomForestRegressor()
    reg.fit(data_train, val_train)
    sys.stdout.write("done\n")

    ##### PREDICTION #####
    sys.stdout.write("Predicting... ")
    val_predicted = reg.predict(data_test)
    sys.stdout.write("done\n")

    ##### ERROR #####
    df = pd.DataFrame()
    df['date'] = pd.to_datetime(date_test)
    df['original'] = val_test
    df['predicted'] = val_predicted.tolist()
    df = df.set_index('date')

    df.info()
    df.plot()
    plt.show()

    # Function-call form: same output, and valid on Python 2 and Python 3.
    print("MSE : " + str(mean_squared_error(val_test, val_predicted.tolist())))
def predict(self, X):
    """Return the base-class prediction with a trailing axis appended."""
    base_prediction = RandomForestRegressor.predict(self, X)
    return base_prediction[:, numpy.newaxis]
# In[12]:

Rows = np.random.choice(Train.index.values, 400000)
# Rows holds index labels, so label-based .loc is the replacement for .ix,
# which was removed in pandas 1.0.
Sampled_Train = Train.loc[Rows]
Sample_Train_Target = Train_Target.loc[Rows]
# RF.fit(Sampled_Train, Sample_Train_Target)
RF.fit(Train, Train_Target)


# In[ ]:

print('Predict!')
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array.
Test_Predict = RF.predict(Test.values)


# In[ ]:

print(Test_Predict.shape)


# In[ ]:

from collections import OrderedDict
Submission = pd.DataFrame(data = OrderedDict([('Id', Test_ID), ('Sales', Test_Predict)]))
Submission.to_csv('Submission_RF.csv', index = False)
# One-hot encode the categorical columns. `axis` is passed by keyword in
# every drop() call because the positional form was removed in pandas 2.0.
store = store.drop("Assortment", axis=1).join(
    pd.get_dummies(store["Assortment"]).rename(columns=lambda x: "Assortment" + "_" + str(x))
)

train["StateHoliday"] = [mychange(x) for x in train.StateHoliday]
test["StateHoliday"] = [mychange(x) for x in test.StateHoliday]

train = train.drop("StateHoliday", axis=1).join(
    pd.get_dummies(train["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)
test = test.drop("StateHoliday", axis=1).join(
    pd.get_dummies(test["StateHoliday"]).rename(columns=lambda x: "StateHoliday" + "_" + str(x))
)

train = pd.merge(train, store, on="Store")
test = pd.merge(test, store, on="Store")

repeat = 1
print("Splitting data...")
for i in range(repeat):
    # Features are taken from the test columns, minus targets and
    # identifier / date columns.
    features = [col for col in test.columns if col not in ["Customers", "Sales", "Date", "LogSale", "datetimes", "Id"]]
    rf = RandomForestRegressor(n_estimators=100)
    print("Starting training...")
    # Missing values are replaced by -1 before fitting and predicting.
    rf.fit(train[features].fillna(-1), train.LogSale)
    test["mypred"] = rf.predict(test[features].fillna(-1))
    # Undo the log transform of the training target.
    test["mypred"] = np.exp(test["mypred"]) - 1
    test["Sales"] = test.mypred
    test[["Id", "Sales"]].to_csv("rand_for_kag_v4-9.csv", index=False)
# NOTE(review): this fragment begins inside a list literal whose opening
# bracket and assigned name are not visible here.
'hour', 'day_of_week', 'month', 'bank_holiday', 'race_day', 'winddirection', 'windspeed', 'temperature', 'rain', 'pressure'
]

# For every location, evaluate the same forest configuration twice: once on
# the `columns` feature list and once on `columnsTW`.
for location in locations:
    print("location: " + str(location))
    # save down trainX, trainY, testX, testY
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columns, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    # Element [1] of rmseEval's result is reported as the RMSE.
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+All rmse: " + str(testRmse))

    # Same evaluation restricted to the columnsTW feature list.
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, columnsTW, "target")
    print("\t#train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
    model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
    model.fit(trainX, trainY)
    testPrediction = model.predict(testX)
    testRmse = str(rmseEval(testY, testPrediction)[1])
    print("\tRFR+TW rmse: " + str(testRmse))
# NOTE(review): the first two statements are the tail of a function whose
# header is not visible here (presumably select_column, which is called
# below); their indentation is a best guess.
        return False
    return True
    #return column in ['BDSP', 'RMSP', 'HFL', 'BLD', 'AGEP', 'NP', 'YBL', 'HINCP', 'HDD', 'CDD']

# Keep only the columns accepted by select_column.
household = household[[column for column in household.columns if select_column(column)]]
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0;
# `household.values` is the replacement.
X = household.as_matrix()
print(household.columns)
#X = household.as_matrix()

# Persist the feature names next to the model.
with open("kwh_model_features.json", "w") as f:
    json.dump(list(household.columns), f, indent = True)
print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)
clf = RandomForestRegressor(n_estimators = 50, n_jobs = 8)
clf.fit(X_train, y_train)
print(y_test[:100])
# Root of the mean squared error on the held-out 25%, then R^2.
print(np.sqrt(metrics.mean_squared_error(y_test, clf.predict(X_test))))
print(metrics.r2_score(y_test, clf.predict(X_test)))

# Feature names paired with their importances, most important first.
features = sorted(zip(household.columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

# Persist the fitted model.
with open("kwh_model.pkl", 'wb') as f:
    pickle.dump(clf, f)
# NOTE(review): this fragment looks like the interior of a loop over
# `group`; the enclosing loop and function are not visible here, so it is
# shown without its original indentation.
testStations = groups[group]
log("\ttrainStations: " + str(trainStations))
log("\ttestStations: " + str(testStations))

# Station identifiers are converted to float before being passed to the
# splitter.
train_station_set = set([float(s) for s in trainStations])
test_station_set = set([float(s) for s in testStations])

# Model on the tw_features list.
trainX, testX, trainY, testY = splitDataForXValidation(
    train_station_set, test_station_set, "location", data, tw_features, "target")
model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
model.fit(trainX, trainY)
prediction_TW = model.predict(testX)
rmse = rmseEval(testY, prediction_TW)[1]
log("\tTW rmse: " + str(rmse))
# Pool this fold's observations and TW predictions.
all_observations.extend(testY)
all_pred_TW.extend(prediction_TW)

# Model on the twa_features list, with the same stations and settings.
trainX, testX, trainY, testY = splitDataForXValidation(
    train_station_set, test_station_set, "location", data, twa_features, "target")
model = RandomForestRegressor(min_samples_leaf=29, n_estimators=64, n_jobs=-1, random_state=42)
model.fit(trainX, trainY)
prediction_TWA = model.predict(testX)
rmse = rmseEval(testY, prediction_TWA)[1]
del household['KWH']

# ELEP is the regression target; every other column is a feature.
X_columns = [column for column in household.columns if column != "ELEP"]
# DataFrame.as_matrix() was removed in pandas 1.0; column selection plus
# .values returns the same arrays.
X = household[X_columns].values
y = list(household["ELEP"].values)
#print(y)

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.25)
clf = RandomForestRegressor(n_estimators = 100, n_jobs = 8)
clf.fit(X_train, y_train)
print(y_test[:100])

# Predict once and reuse the result for both metrics.
test_predictions = clf.predict(X_test)
print(metrics.mean_squared_error(test_predictions, y_test))
print(metrics.r2_score(y_test, test_predictions))

# Feature names paired with their importances, most important first.
features = sorted(zip(X_columns, clf.feature_importances_), key = lambda x : x[1], reverse = True)
print("Features", features)

#fill spaces in ELEP
normalized_pums = pd.read_csv("../joined_weather.csv", delimiter = ',')
print('pums shape', normalized_pums.shape)

with open("../vectorized_puma_regions/puma_list.json") as f:
    puma_mapping = json.load(f)

# Invert the mapping, converting both sides to int.
reverse_puma_map = {}
for key, value in puma_mapping.items():
    reverse_puma_map[int(value)] = int(key)
### RigeLinearCV = linear_model.RidgeCV(cv=10) rcv = RigeLinearCV.fit(x_train, y_train) y_pre_rcv = rcv.predict(x_val) ### params_rf = { 'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 2, 'n_jobs': 4 } rf = RandomForestRegressor(**params_rf) rf.fit(x_train, y_train) y_pre_rf = rf.predict(x_val) ### y_pre_diff = mean_normal_weekend_diff(Y[-14:-7], xday[-28:-14], xweekend[-28:-14], -7, 0) ### #Y_test.append(y_test) #y_pre_diff = mean_normal_weekend_diff(Y,xday,xweekend,-21,-14) ### loss_rcv = Evaluation([y_pre_rcv], [y_val]) loss_rf = Evaluation([y_pre_rf], [y_val]) loss_diffmean = Evaluation([y_pre_diff], [y_val]) union = {loss_rcv: 1, loss_rf: 2, loss_diffmean: 3} minloss = min(union.keys())
def evalColumns(columns):
    """Score a per-row model selector that only sees the flagged columns.

    For each held-out location:

    1. one regressor per data group in ``topDatagroups`` predicts the test
       rows;
    2. for every other location, a regressor per data group is fitted on the
       first training part and predicts the second training part;
    3. each second-part row is labelled with the index (into ``topTags``) of
       the group whose prediction had the smallest absolute error;
    4. a ``RandomForestClassifier`` is trained on the second-part rows
       (``all_features`` values followed by the per-tag predictions), keeping
       only positions whose flag in ``columns`` is truthy;
    5. the classifier picks, per test row, which regressor's prediction is
       used as the final prediction.

    :param columns: flags indexed like ``all_columns``; a truthy flag keeps
        that position of every row given to the classifier.
    :return: element ``[1]`` of ``rmseEval`` over the predictions of all
        locations.
    """

    def newRegressor():
        # Same hyper-parameters for every regressor in this evaluation.
        return RandomForestRegressor(min_samples_leaf = 9, n_estimators = 59, n_jobs = -1, random_state=42)

    def keepFlagged(rows):
        # Project every row onto the positions flagged in `columns`.
        return [[row[k] for k in range(0, len(all_columns)) if columns[k]]
                for row in rows]

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [other for other in locations if other != location]
        print("Location: " + str(location) + ", location2: " + str(location2s))

        # generating testPreds: one prediction vector per data group tag
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, features, "target")
            model = newRegressor()
            model.fit(trainX, trainY)
            testPreds[tag] = model.predict(testX)

        # per-tag predictions on the second training part of every other location
        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
                model = newRegressor()
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                train1Rmse = str(rmseEval(trainY1, train1Prediction)[1])
                train2Rmse = str(rmseEval(trainY2, train2Prediction)[1])
                testRmse = str(rmseEval(testY, testPrediction)[1])
                print("\t\ttrain1 rmse: " + train1Rmse)
                print("\t\ttrain2 rmse: " + train2Rmse)
                print("\t\ttest rmse: " + testRmse)
                trainPreds[tag].extend(train2Prediction)

        # get combined train2y (testY keeps the value of the last split here)
        combinedTrain2Y = []
        for location2 in location2s:
            _, _, _, trainY2, _, testY = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            combinedTrain2Y = combinedTrain2Y + trainY2

        # calculate labels: index of the tag with the smallest absolute error
        labelTrain2Y = []
        for i, observed in enumerate(combinedTrain2Y):
            bestModel = 0
            bestAbs = abs(observed - trainPreds[topTags[0]][i])
            for j, tag in enumerate(topTags):
                modelAbs = abs(observed - trainPreds[tag][i])
                if modelAbs < bestAbs:
                    bestAbs = modelAbs
                    bestModel = j
            labelTrain2Y.append(bestModel)

        # generating testX
        _, testX, _, _ = splitDataForXValidation(location, "location", data, all_features, "target")

        # trainX2: second-part rows of every other location, in order
        tX2 = []
        for location2 in location2s:
            _, trainX2, _, _, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            tX2.extend(trainX2)

        # append each tag's prediction to the matching row (mutates the rows)
        for tag in topTags:
            for i, value in enumerate(trainPreds[tag]):
                tX2[i].append(value)

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(keepFlagged(tX2), labelTrain2Y)

        for tag in topTags:
            for i, value in enumerate(testPreds[tag]):
                testX[i].append(value)

        pred = model.predict(keepFlagged(testX))

        # per test row, take the prediction of the regressor the classifier chose
        finalPrediction = [testPreds[topTags[pred[i]]][i]
                           for i in range(0, len(testY))]

        rmse = str(rmseEval(testY, finalPrediction)[1])
        print("\tRMSE: " + str(rmse))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    return rmseEval(overallPred, overallY)[1]
def train_many_test_many(train_data_paths, test_data_paths, test_data_spec): test_run = 0 if test_data_spec == TEST_ON_FAIL_ONLY: test_start_cycle = 0 test_end_cycle = 0 test_sample_freq = 0 # don't use any "good" examples elif test_data_spec == TEST_ON_FAIL_PLUS_CYCLE_ZERO: test_start_cycle = 0 test_end_cycle = 1 test_sample_freq = 1000 # use only "good" example from first cycle elif test_data_spec == TEST_ON_FAIL_PLUS_CYCLE_MAX: test_start_cycle = 9999999 test_end_cycle = 9999999 test_sample_freq = 1000 # use only "good" example from "max" cycle elif test_data_spec == TEST_ON_TRAIN_SPEC: test_start_cycle = start_cycle test_end_cycle = end_cycle test_sample_freq = sample_freq else: sys.err.write( "Invalid test_data_spec '%d'. Must be one of: TEST_ON_FAIL_ONLY, TEST_ON_FAIL_PLUS_CYCLE_ZERO, TEST_ON_FAIL_PLUS_CYCLE_MAX\n" % test_data_spec) # print output headers print "PERFORMANCE\test_data_spec\ttest_path\ttest_run\tpiston_param\tdensity_param\trmse\tfp\tfn\tnum_instances\truntime_secs" # load pickled model if enable_load_pickled_model: print 'Using pre-trained model from: randomforest.pkl. This may take a couple of minutes...' 
with open('randomforest.pkl', 'rb') as f: rand_forest = cPickle.load(f) else: train = None start = time.time() train_list = [] for train_data_path in train_data_paths: print train_data_path train_next = get_learning_data(train_data_path, start_cycle, end_cycle, sample_freq, decay_window) train_list.append(train_next) train = np.concatenate(train_list, axis=0) print "training data: ", train.shape end = time.time() print "TIME load training data: ", end - start # Train the random forest train_X = train[:, 0:-1] train_Y = np.ravel(train[:, [-1]]) start = time.time() rand_forest = RandomForestRegressor(n_estimators=NumTrees, n_jobs=parallelism, random_state=rand_seed) rand_forest.fit(train_X, train_Y) end = time.time() print "TIME train: ", end - start # output feature importance if enable_feature_importance: output_feature_importance(rand_forest, train_data_paths[0]) # pickle model for future use if enable_save_pickled_model: print "Writing random forest model to file: randomforest.pkl. This may take a couple of minutes..." 
with open('randomforest.pkl', 'wb') as f: cPickle.dump(rand_forest, f) print "Wrote random forest model to file: randomforest.pkl" for test_path in test_data_paths: start = time.time() piston_param = 0 density_param = 0 try: (index, test) = get_learning_data_for_run(test_path, test_start_cycle, test_end_cycle, test_sample_freq, decay_window, test_run) print "test data: ", test.shape # Check results on cv set test_X = test[:, 0:-1] test_Y = np.ravel(test[:, [-1]]) cv_predict = rand_forest.predict(test_X) #decision_boundary = min(cv_predict) decision_boundary = 4e-6 RMSE = np.sqrt(sum(pow(test_Y - cv_predict, 2)) / test_Y.size) #err = sum(cv_predict - test_Y) / test_Y.size #pos_indices = [i for i, x in enumerate(test_Y) if x > 0] #neg_indices = [i for i, x in enumerate(test_Y) if x == 0] #err_on_pos = sum(np.array([cv_predict[i] for i in pos_indices]) - np.array([test_Y[i] for i in pos_indices])) / len(pos_indices) #err_on_neg = sum(np.array([cv_predict[i] for i in neg_indices]) - np.array([test_Y[i] for i in neg_indices])) / len(neg_indices) # calculate false positives and false negatives fp = fn = 0 for i in range(len(test_Y)): if test_Y[i] == 0 and cv_predict[i] > decision_boundary: fp += 1 elif test_Y[i] > 0 and cv_predict[i] <= decision_boundary: fn += 1 end = time.time() if enable_print_predictions: for i in range(len(test_Y)): print test_Y[i], cv_predict[i] if "piston" in test_path: piston_offset = test_path.find("piston") + len("piston") piston_param = int(test_path[piston_offset:piston_offset + 3]) density_offset = test_path.find("density") + len("density") density_param = float(test_path[density_offset:density_offset + 4]) print "PERFORMANCE\t%d\t%s\t%d\t%d\t%.2f\t%.15f\t%d\t%d\t%d\t%d" % ( test_data_spec, test_path, test_run, piston_param, density_param, RMSE, fp, fn, len(test_Y), round(end - start)) sys.stdout.flush() except: end = time.time() print "PERFORMANCE\t%d\t%s\t%d\t%d\t%.2f\t%.15f\t%d\t%d\t%d\t%d" % ( test_data_spec, test_path, test_run, 
piston_param, density_param, 0, 0, 0, 0, round(end - start))
class MLCms:
    """Random-forest workflow: load a table, tune/train a model, forecast.

    Whether the model is a classifier or a regressor is decided by
    ``constants.DO_CLASSIFICATION``. Several attributes used by the plotting
    code (``crop``, ``country``, ``crop_lname``, ``season``, ``debug``,
    ``df_global``, ``forecast_yr``, ``time_peak_ndvi``, ``dict_cc``) are not
    set in ``__init__`` and must be assigned before those methods are called.
    """

    def __init__(self, config_file=''):
        """Read ``config_file`` and set up the model and input/output paths.

        :param config_file: path handed to ``SafeConfigParser.read``.
        """
        # Parse config file
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        # machine learning specific variables
        self.classify = constants.DO_CLASSIFICATION  # Regress or classify?
        self.vars_features = constants.fixed_vars
        self.vars_target = constants.ML_TARGETS
        self.var_target = constants.ML_TARGETS

        if self.classify:
            self.task = 'classification'
            self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
        else:
            self.task = 'regression'
            self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)  # SVR()

        # Get path to input
        self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl

        # Output directory is <dir>_<classification>_<2014>
        self.path_out_dir = constants.out_dir
        utils.make_dir_if_missing(self.path_out_dir)

        # Model pickle
        self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
        self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'

    def output_model_importance(self, gs, name_gs, num_cols):
        """Write the feature-importance ranking to CSV and plot the top features.

        :param gs: fitted grid search; the importances are read from step
            ``name_gs`` of ``gs.best_estimator_``.
        :param name_gs: name of the pipeline step holding the fitted model.
        :param num_cols: number of ranked features written to the CSV.
        :return: None
        """
        rows_list = []
        name_vars = []

        feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
        # Scale so that the most important feature is 100.
        importances = 100.0 * (feature_importance / feature_importance.max())

        # Spread of the importances across the individual trees of self.model.
        std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        # Store feature ranking in a dataframe
        for f in range(num_cols):
            name = self.vars_features[indices[f]]
            rows_list.append({'Variable': name, 'Importance': importances[indices[f]]})
            name_vars.append(name)

        df_results = pd.DataFrame(rows_list)

        # Plot upto a maximum of 10 features
        num_cols = min(10, len(indices))
        plot.plot_model_importance(num_bars=num_cols,
                                   xvals=importances[indices][:num_cols],
                                   std=std[indices][:num_cols],
                                   fname=self.task + '_importance_' + self.crop,
                                   title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
                                   xlabel=name_vars[:num_cols],
                                   out_path=self.path_out_dir)

        df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')

    def get_data(self):
        """Load the input CSV and split it into training and testing parts.

        :return: ``(cols, splits)`` where ``cols`` is the list of columns kept
            from the CSV and ``splits`` is the result of ``train_test_split``
            (train data, test data, train target, test target) with 20% of
            the rows held out.
        """
        df = pd.read_csv(self.path_inp)
        cols = [col for col in df.columns if col not in self.vars_features]
        # TODO: the per-point soil PI/DI lookup that used to extend the
        # columns here is disabled.

        df = df[cols]

        # The first kept column is left out of the feature matrix.
        data = df.as_matrix(columns=cols[1:])
        target = df.as_matrix(columns=[self.var_target]).ravel()

        # Get training and testing splits
        splits = train_test_split(data, target, test_size=0.2)

        return cols, splits

    def train_ml_model(self):
        """Tune and train the model, or load it from the pickle if one exists.

        When no model pickle exists, a pipeline of feature selection followed
        by ``self.model`` is grid-searched, the best model is evaluated on
        the held-out split, importances are written, and the model and
        feature list are pickled. Otherwise both are read back from disk.

        :return: the model now stored in ``self.model``.
        """
        logger.info('#########################################################################')
        logger.info('train_ml_model')
        logger.info('#########################################################################')

        ######################################################
        # Load dataset
        ######################################################
        cols, splits = self.get_data()
        data_train, data_test, target_train, target_test = splits

        if not os.path.isfile(self.path_pickle_model):
            # For details in scikit workflow: See http://stackoverflow.com/questions/
            # 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
            # TODO Separate out a dataset so that even the grid search cv can be tested
            ############################
            # Select features from model
            ############################
            logger.info('Selecting important features from model')
            # The same selector is used for classification and regression.
            rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)

            feat_selection = SelectFromModel(rf_feature_imp)

            pipeline = Pipeline([
                ('fs', feat_selection),
                ('clf', self.model),
            ])

            #################################
            # Grid search for best parameters
            #################################
            logger.info('Tuning hyperparameters')
            param_grid = {
                'fs__threshold': ['mean', 'median'],
                'fs__estimator__max_features': ['auto', 'log2'],
                'clf__max_features': ['auto', 'log2'],
                'clf__n_estimators': [1000, 2000]
            }

            gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
            # Fit the data before getting the best parameter combination. Different data sets will have
            # different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
            gs.fit(data_train, target_train)
            logger.info(gs.best_params_)

            data_test = pd.DataFrame(data_test, columns=cols[1:])

            # Update features that should be used in model: push the column
            # names through the fitted selector to learn which ones survive.
            selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
            cols = selected_features[0]
            data_test = data_test[cols]

            # Update model with the best parameters learnt in the previous step
            self.model = gs.best_estimator_.named_steps['clf']

            predict_val = self.model.predict(data_test)
            results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
            print(results.rsquared)
            print(cols)
            plt.scatter(target_test, predict_val)
            plt.show()

            ###################################################################
            # Output and plot importance of model features, and learning curves
            ###################################################################
            # NOTE(review): `cols` now holds only the selected features, so
            # cols[1:] leaves one of them out of the ranking -- confirm intent.
            self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))

            if constants.plot_model_importance:
                # NOTE(review): `data`, `target` and `k_fold` are not defined
                # in this method; they must exist at module level -- confirm.
                train_sizes, train_scores, test_scores = learning_curve(self.model, data, target, cv=k_fold,
                                                                        n_jobs=constants.ncpu)

                plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
                                         ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)

            # Save the model to disk
            logger.info('Saving model and features as pickle on disk')
            with open(self.path_pickle_model, 'wb') as f:
                cPickle.dump(self.model, f)
            with open(self.path_pickle_features, 'wb') as f:
                cPickle.dump(self.vars_features, f)
        else:
            # Read model from pickle on disk
            # NOTE: unpickling runs code embedded in the file; only load
            # pickles written by this program.
            with open(self.path_pickle_model, 'rb') as f:
                logger.info('Reading model from pickle on disk')
                self.model = cPickle.load(f)

            logger.info('Reading features from pickle on disk')
            self.vars_features = pd.read_pickle(self.path_pickle_features)

        return self.model

    def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
        """
        1. Does classification/regression based on already built model.
        2. Plots confusion matrix for classification tasks, scatter plot for regression
        3. Plots accuracy statistics for classification/regression
        :param df_forecast: table holding the feature columns (and the target column if available)
        :param mon_names: label used in plot titles and output file names
        :param available_target: Is target array available?
        :param name_target: Name of target array (defaults to yield)
        :return: ``(predicted, stats)``; ``stats`` holds RMSE/MAE/r2/Bias for
            regression, Accuracy/Precision for classification, and NaN
            placeholders when no target is available.
        """
        data = df_forecast.as_matrix(columns=self.vars_features)  # convert dataframe column to matrix
        predicted = self.model.predict(data)

        if not available_target:
            # Nothing to compare against: return the predictions with empty stats.
            return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
                               'Nash-Sutcliff': np.nan}

        expected = df_forecast.as_matrix(columns=[name_target]).ravel()

        if not self.classify:  # REGRESSION
            # Compute stats
            results = compute_stats.ols(predicted.tolist(), expected.tolist())
            bias = compute_stats.bias(predicted, expected)
            rmse = compute_stats.rmse(predicted, expected)
            mae = compute_stats.mae(predicted, expected)

            # Plot!
            plot.plot_regression_scatter(expected, np.asarray(predicted),
                                         annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
                                                  'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
                                         xlabel='Expected yield',
                                         ylabel='Predicted yield',
                                         title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
                                         fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
                                         out_path=self.path_out_dir)

            # global expected vs predicted
            if self.debug:
                # any non-existing index will add row
                self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
                                                           self.forecast_yr]

            return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}

        # CLASSIFICATION
        # Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
        expected, predicted = compute_stats.remove_nans(expected, predicted)
        cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T

        # Compute and plot class probabilities
        proba_cc = self.model.predict_proba(data)
        df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
        plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
                                      out_path=self.path_out_dir)

        # Plot confusion matrix
        plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                   xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
                                   out_path=self.path_out_dir)

        # Normalize and plot confusion matrix
        cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
        plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
                                   xlabel='True class', ylabel='Predicted class', normalized=True,
                                   ticks=self.dict_cc.values(), out_path=self.path_out_dir)

        score_accuracy = accuracy_score(expected, predicted) * 100.0
        score_precision = precision_score(expected, predicted, average='weighted') * 100.0

        return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
y_pre_adb = adb.predict(x_test) #print(Evaluation([y_pre],[y_test])) RigeLinearCV = linear_model.RidgeCV(cv=3) clf1 = RigeLinearCV.fit(x_train, y_train) y_pre_cv = clf1.predict(x_test) #uni_pret.append(0.8*np.array(y_pre_adb)+0.2*np.array(y_pre_cv)) #uni_pre.append(y_pre_adb) ### params_rf = { 'n_estimators': 800, 'min_samples_split': 1, 'warm_start': True, 'n_jobs': 2 } rf = RandomForestRegressor(**params_rf) rf.fit(x_train, y_train) y_pre_rf = rf.predict(x_test) ### uni_pre = 0.4 * np.array(y_pre_adb) + 0.1 * np.array( y_pre_cv) + 0.5 * np.array(y_pre_rf) output(fw, i + 1, uni_pre) print(i) i += 1 fr1.close() fr2.close() fw.close()
def predict_proba(self, X):
    """Turn the regressor's raw predictions into two-column probabilities.

    The prediction of ``RandomForestRegressor.predict`` is divided by 1000
    and passed through the logistic sigmoid to give column 1; column 0 is
    its complement, so every row sums to 1.

    :param X: samples, passed unchanged to ``RandomForestRegressor.predict``.
    :return: array with ``len(X)`` rows and 2 columns.
    """
    raw = RandomForestRegressor.predict(self, X)
    proba = numpy.zeros([len(X), 2])
    positive = special.expit(raw / 1000.)
    proba[:, 1] = positive
    proba[:, 0] = 1. - proba[:, 1]
    return proba
class ItemSetModel(object):
    """Random-forest regressor that scores a participant's item build.

    Training rows are built from stored matches (see ``get_data_sets``); the
    fitted estimator is kept in ``self.clf`` and can be saved to / loaded
    from ``MODEL_PATH`` with joblib.
    """
    clf = None
    MODEL_PATH = os.path.join(settings.BASE_DIR, 'set_analyzer', 'analysis', 'models')
    CACHE_FILE = os.path.join(MODEL_PATH, 'model_cache.cache')

    def __init__(self):
        super(ItemSetModel, self).__init__()
        # Alternatives tried earlier: DecisionTreeRegressor, Lasso(0.1),
        # SVR(kernel='rbf'), ElasticNetCV.
        self.clf = RandomForestRegressor(max_depth=7, n_estimators=10)

    def get_data_sets(self, num_matches, cache=False, **kwargs):
        """
        Data Schema:
        Input:
            1    My champion ID
            6    My Champion's class info
            6    [Other team's cumulative class info]
            7    [7 Final Items]
            5    [first 5 items purchased]
            ________________________________________
            25 features

        Output:
            Score = A(Gold/time) + B(xp/time) + C(win)
            ________________________________________
            1 Output

        :param num_matches: maximum number of matches to read.
        :param cache: when true, the result is also written to CACHE_FILE.
        :param kwargs: filters forwarded to ``Match.objects``.
        :return: ``(input_data, output_data)`` with one row per participant
            actually read (space is reserved for 10 per match; unused rows
            are trimmed off).
        """
        # Presize data
        features = 25
        num_participants = num_matches * 10
        input_data = np.zeros((num_participants, features))
        output_data = np.zeros(num_participants)
        row_num = 0

        # Iterate over every match in the database
        for match in Match.objects(**kwargs)[:num_matches]:
            # Prepare users and teams: map each team id to row 0 or 1
            team_map = {int(tag): idx for idx, tag in enumerate(match.teams)}
            team_data = np.zeros((2, 6))  # Store the sum of each team's tags

            # Prepare champion class data
            # NOTE(review): class_data is added once per champion tag, so a
            # champion with two tags counts twice -- confirm this is intended.
            for p in match.participants.values():
                for tag in p.champion.tags:
                    team_data[team_map[p.team_id], :] += np.array(p.champion.class_data)

            # Iterate over every user in the match
            for pid, participant in match.participants.items():
                col_num = 0

                # My Champion's info
                input_data[row_num][col_num] = participant.champion.champion_id
                col_num += 1
                input_data[row_num][col_num:col_num + 6] = np.array(participant.champion.class_data)
                col_num += 6

                # Other Team's champion attributes
                other_team = 1 if team_map[participant.team_id] == 0 else 0
                input_data[row_num][col_num:col_num + 6] = team_data[other_team, :]
                col_num += 6

                # My items
                for item_id in participant.final_build:
                    input_data[row_num][col_num] = item_id
                    col_num += 1

                # My Item purchases (first 5 only)
                count = 0
                for event in participant.item_events:
                    if event.event_type != "ITEM_PURCHASED":
                        continue
                    if count == 5:
                        break
                    input_data[row_num][col_num] = event.payload['itemId']
                    col_num += 1
                    count += 1

                # Score
                # Assume that average gold/sec is ~8
                # Assume that average kda is ~2.6
                # Have a game win worth some bonus
                score = participant.kda()*3 + participant.gold_earned/match.duration + (4 if match.teams[str(participant.team_id)].won else 0)
                output_data[row_num] = score
                row_num += 1

        # Drop the preallocated rows that were never filled, so all-zero
        # phantom samples do not end up in the training data.
        input_data = input_data[:row_num]
        output_data = output_data[:row_num]

        if cache:
            print('Caching data...')
            self.cache_data((input_data, output_data))

        return (input_data, output_data)

    def cache_data(self, data):
        """Pickle ``data`` (the ``(input, output)`` pair) to CACHE_FILE."""
        with open(self.CACHE_FILE, 'wb') as f:
            pickle.dump(data, f)

    def get_cached_data(self, num_rows):
        """Return the first ``num_rows`` rows of the cached ``(input, output)`` pair.

        NOTE: unpickling runs code embedded in the file; only load a cache
        written by ``cache_data``.
        """
        with open(self.CACHE_FILE, 'rb') as f:
            input_data, output_data = pickle.load(f)
        # Slice the rows of each array, not the two-element tuple itself.
        return (input_data[:num_rows], output_data[:num_rows])

    def train(self, X, Y, train_ratio=1, **kwargs):
        """Fit the estimator; with ``train_ratio != 1`` evaluate on the remainder.

        :param X: feature rows (2-D array).
        :param Y: target values, one per row of ``X``.
        :param train_ratio: fraction of the rows (taken from the start) used
            for fitting; the rest is passed to ``evaluate_fit``.
        """
        print("Training model...")
        if train_ratio == 1:
            print("Using {} rows".format(len(X)))
            self.clf.fit(X, Y)
        else:
            n = len(X)
            tn = int(n * train_ratio)
            print("Using {} rows".format(tn))
            self.clf.fit(X[:tn, :], Y[:tn])
            print("Evaluating model...")
            evaluate_fit(self.clf, X[tn:, :], Y[tn:])

    def predict(self, X):
        """Return the estimator's predictions for ``X``."""
        return self.clf.predict(X)

    # MODEL EVALUATION
    def k_fold(self, folds, **kwargs):
        """Build the data set from ``kwargs`` and run ``k_fold_evaluate`` on it."""
        X, Y = self.get_data_sets(**kwargs)
        k_fold_evaluate(self.clf, X, Y, folds)

    # LOAD AND SAVE
    def save(self, filename):
        """Dump the estimator to ``MODEL_PATH/<filename>/<filename>.pkl``.

        An existing target directory is emptied of its files first.
        """
        dirname = os.path.join(self.MODEL_PATH, filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        else:
            # Empty folder
            for entry in os.listdir(dirname):
                file_path = os.path.join(dirname, entry)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
        path = os.path.join(dirname, "{}.pkl".format(filename))
        joblib.dump(self.clf, path)

    def load(self, filename):
        """Load the estimator from ``MODEL_PATH/<filename>/<filename>.pkl``."""
        path = os.path.join(self.MODEL_PATH, filename, "{}.pkl".format(filename))
        self.clf = joblib.load(path)
# Initiate the monthly trade object monthData = trade_model.monthlyModel(1, 2009, 6, 2013, 6, 2012, 6, 2013) # Download data from Yahoo finance monthData.monthlyDataDownload() # Pre-processing of training an testing data monthData.trainFeaturePre() # Read pre-processed data from hard drive # monthData.trainFeaturePreHd() # Number of training months trainSpan = len(monthData.xTrain[:,0,0]) - monthData.testSpan # Initiate a random forest regressor clf = RandomForestRegressor(n_estimators=10) # totalReturn = 1 predictedReturn = np.zeros(monthData.stockNum) monthlyReturn = np.zeros(monthData.testSpan) aggReturn = np.zeros(monthData.testSpan+1) aggReturn[0] = 1 # rolling training and testing for j in range(0, monthData.testSpan): for i in range(0, monthData.stockNum): clf.fit(monthData.xTrain[j:trainSpan+j, :, i], monthData.yTrain[j:trainSpan+j, 0, i]) predictedReturn[i] = clf.predict(monthData.xTest[j, :, i]) monthlyReturn[j] = monthData.por10Returns(j, predictedReturn) yearReturn = totalReturn * (monthlyReturn[j]+1) aggReturn[j+1] = aggReturn[j]*(1+monthlyReturn[j]) print monthlyReturn print 'overall:', totalReturn sp.portfolioVSspy(6, 2012, 6, 2013, aggReturn[1:])
for j in range(0, 15): s = s + '' + str(valoresVol[i + j]) + ',' d.append(valoresVol[i + j]) #maxValores = max(valoresCopiar[longValores:i+j]) # Esta cambia porque no debemos tener los valores #maxValores = max(valoresCopiar[i+j:i+j+5]) maxValores = max(valoresCopiar[i + 14 + longValoresTest:i + longValoresTest + 14 + 5]) #s = s + str(150) #d.append(150) X_test.append(d) y_test.append(maxValores) # Entrenamos regr = RandomForestRegressor() regr.fit(X_train, y_train) y_pred = regr.predict(X_test) regr2 = SVR(kernel='rbf', C=1e3, gamma=0.1) regr2.fit(X_train, y_train) y_pred2 = regr2.predict(X_test) regr3 = linear_model.LinearRegression() regr3.fit(X_train, y_train) y_pred3 = regr3.predict(X_test) # Votacion # Si todos OK => Se invierte votacion = 0 max_pred_array = [max(y_pred), max(y_pred2), max(y_pred3)] for i_voto in max_pred_array: ganancia_pred = (i_voto - cierreEvaluar) / cierreEvaluar
for i in range(10): X, y = shuffle(boston.data, boston.target) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140) regressor2 = DecisionTreeRegressor(max_depth=6) regressor3 = LinearRegression() regressor4 = RandomForestRegressor() regressor.fit(X_train, y_train) regressor2.fit(X_train, y_train) regressor3.fit(X_train, y_train) regressor4.fit(X_train, y_train) y_pred = regressor.predict(x) y_pred2 = regressor2.predict(x) y_pred3 = regressor3.predict(x) y_pred4 = regressor4.predict(x) predictions.append(y_pred) predictions2.append(y_pred2) predictions3.append(y_pred3) predictions4.append(y_pred4) print "\nPrediction = " + str(y_pred) print "Prediction = " + str(y_pred2) print "Prediction = " + str(y_pred3) print "Prediction = " + str(y_pred4) print '\n' print 'Boosting max', np.max(predictions), 'min', np.min(predictions), 'variance', np.max(predictions) - np.min(predictions) print 'Decision tree max', np.max(predictions2), 'min', np.min(predictions2), 'variance', np.max(predictions2) - np.min(predictions2) print 'Random forest max', np.max(predictions4), 'min', np.min(predictions4), 'variance', np.max(predictions4) - np.min(predictions4) print 'Linear regression max', np.max(predictions3), 'min', np.min(predictions3), 'variance', np.max(predictions3) - np.min(predictions3)
for v in timestampDoubleData: timestampData2.append(str(int(v))) # modelling for location in locations: trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation( location, "location", data, featureTW, "target", timestampData) print("\tT+W (on data without ATC) #train: " + str(len(trainY)) + ", #test:" + str(len(testY))) model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42) model.fit(trainX, trainY) prediction = model.predict(testX) rmse = rmseEval(testY, prediction)[1] print("\trmse: " + str(rmse)) for i in range(0, len(testY)): timestamp = testTimestamp[i] value = prediction[i] TWpredictionData[str(location)][timestamp] = value trainX, testX, trainY, testY, trainTimestamp, testTimestamp = splitDataForXValidation( location, "location", data2, featureTWAtc, "target", timestampData2) print("\tT+W+Atc #train: " + str(len(trainY)) + ", #test:" + str(len(testY))) model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)
for i in range(10): X, y = shuffle(boston.data, boston.target) X_train, y_train = X[:offset], y[:offset] X_test, y_test = X[offset:], y[offset:] regressor = GradientBoostingRegressor(max_depth=20, n_estimators=140) regressor2 = DecisionTreeRegressor(max_depth=6) regressor3 = LinearRegression() regressor4 = RandomForestRegressor() regressor.fit(X_train, y_train) regressor2.fit(X_train, y_train) regressor3.fit(X_train, y_train) regressor4.fit(X_train, y_train) y_pred = regressor.predict(x) y_pred2 = regressor2.predict(x) y_pred3 = regressor3.predict(x) y_pred4 = regressor4.predict(x) predictions.append(y_pred) predictions2.append(y_pred2) predictions3.append(y_pred3) predictions4.append(y_pred4) print "\nPrediction = " + str(y_pred) print "Prediction = " + str(y_pred2) print "Prediction = " + str(y_pred3) print "Prediction = " + str(y_pred4) print '\n' print 'Boosting max', np.max(predictions), 'min', np.min( predictions), 'variance', np.max(predictions) - np.min(predictions) print 'Decision tree max', np.max(predictions2), 'min', np.min( predictions2), 'variance', np.max(predictions2) - np.min(predictions2) print 'Random forest max', np.max(predictions4), 'min', np.min(
feature_index, target_index) clf_adaboost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=50, loss='linear', random_state=0) clf_extra_trees = ExtraTreesRegressor(n_estimators=8, random_state=0, max_depth=30) clf_random_forest = RandomForestRegressor(n_estimators=8, random_state=0, max_depth=30) clf_adaboost.fit(all_data_test.T, all_targets_test[0]) predicted = clf_adaboost.predict(all_data_valid.T) clf_extra_trees.fit(all_data_test.T, all_targets_test[0]) predicted_extra = clf_extra_trees.predict(all_data_valid.T) clf_random_forest.fit(all_data_test.T, all_targets_test[0]) predicted_forest = clf_random_forest.predict(all_data_valid.T) delta_ada = all_targets_valid[0] - predicted delta_extra = all_targets_valid[0] - predicted_extra delta_forest = all_targets_valid[0] - predicted_forest std_ada = get_standart_deviation(delta_ada) std_extra = get_standart_deviation(delta_extra) std_forest = get_standart_deviation(delta_forest) plt.hist(delta_ada, bins=150, color='g', label='Adaboost '+str(np.round(std_ada,4))) plt.hist(delta_extra, bins=150, color='b', label='Extra_Trees '+str(np.round(std_extra,4))) plt.hist(delta_forest, bins=150, color='r', label='Random_Forest '+str(np.round(std_forest,4))) title = "Compare adaboost, extra_tree and Random_Forests" plt.title(title) plt.legend(loc='upper left')
#test = test.join(pd.DataFrame(test.Date.apply(splitTime).tolist(), columns = ['year','mon','day'])) #newtest = test.drop('StateHoliday',1).join(pd.get_dummies(test['StateHoliday']).rename(columns=lambda x: 'StateHoliday' +"_"+str(x))) #newtest = pd.merge(newtest,store, on="Store") #newtest.drop(['Date'],axis = 1,inplace=True) #assert(np.sum(newtrain.var()==0)==0) # #toDrop = list(set(newtrain.columns.values)-set(newtest.columns.values) ) features = [col for col in newtrain.columns if col not in ['Customers', 'Sales', 'Date','LogSale','datetimes']] # rf = RandomForestRegressor(n_estimators=100) print('Starting training...') rf.fit(newtrain[features].fillna(-1),newtrain.LogSale) print('Predicting train values...') newtrain['mypred'] = rf.predict(newtrain[features].fillna(-1)) newtrain['mypred'] = np.exp(newtrain['mypred'])-1 train_error = rmspe(newtrain[newtrain.Sales>0].Sales,newtrain[newtrain.Sales>0].mypred) print('train set error',train_error) newtest['mypred'] = rf.predict(newtest[features].fillna(-1)) newtest['mypred'] = np.exp(newtest['mypred'])-1 test_error = rmspe(newtest[newtest.Sales>0].Sales,newtest[newtest.Sales>0].mypred) print('test set error',test_error) train_results.append(train_error) test_results.append(test_error) print('mean train error', np.mean(train_results)) print('mean test error',np.mean(test_results))