def train_model(train_x, train_y, test_x, test_y):
    """Train a RandomForestClassifier incrementally on mini-batches and evaluate it.

    Uses warm_start to grow the forest by 4 trees per batch of 2 samples,
    then reports a mean-IOU score over the test set and saves the model to
    'model.pkl'.

    Parameters: train_x/train_y are the training samples/labels; test_x/test_y
    the evaluation samples/labels (indexable sequences).
    """
    print("start train model")
    # warm_start=True lets successive fit() calls add trees instead of retraining.
    trained_model = RandomForestClassifier(verbose=1, n_estimators=4, warm_start=True, n_jobs=-1)
    batch_size = 2
    # Ceil division so a trailing partial batch is still trained on.
    # (The original used float division, which printed a fractional batch count.)
    n_batches = -(-len(train_x) // batch_size)
    print("Split data to :" + str(n_batches))
    for step in range(n_batches):
        print("Step number: " + str(step))
        batch_x = train_x[step * batch_size:(step + 1) * batch_size]
        batch_y = train_y[step * batch_size:(step + 1) * batch_size]
        print("length of batch: " + str(len(batch_x)))
        trained_model.fit(batch_x, batch_y)
        # Reserve 4 more trees for the next batch's fit.
        trained_model.n_estimators += 4
    # The final increment was never fitted; roll it back.
    trained_model.n_estimators -= 4
    print("Trained model :: " + str(trained_model))
    # NOTE(review): predicts only on test_x[0] yet zips against all of test_y
    # below — presumably each test_x[i] is itself a batch/image; confirm shapes.
    predictions = trained_model.predict(test_x[0])
    pred = [np.array(p, dtype=int) for p in predictions]
    print(predictions)
    print(np.mean(predictions[0]))
    print(np.mean(test_y[0]))
    # Mean IOU over (prediction, ground truth) pairs via the project's cal_miou.
    miou = 0
    for p, y in zip(pred, test_y):
        miou = miou + cal_miou(p, y)
    miou = miou / len(pred)
    print("mIOU :: " + str(miou))
    joblib.dump(trained_model, 'model.pkl', compress=9)
    print("model saved")
def rfcScores(self,Xn,y,cv=5,param_name='max_depth',estimatorsRange=(10,11,1),paramRange=(1,10,1),trainW=1,testW=2,title='Randorm Forest classifier',clfArg={},plot=False):
    """Score a Random Forest classifier over a hyper-parameter grid.

    Runs sklearn's validation_curve for `param_name` over `paramRange`
    (start, stop, step) for every n_estimators value in `estimatorsRange`
    and both criterions ('gini' and 'entropy'), with `cv`-fold cross
    validation. For each combination the best param value is picked via
    self.plotTrainTest, and all results are combined by
    self.scoreModelListDf into a weighted score:
    (test_score*testW + train_score*trainW) / (testW + trainW).
    Extra RFC constructor arguments go in `clfArg`; set plot=True to see
    how each best score is selected.
    """
    estimator = RFC(**clfArg)
    param_values = np.arange(paramRange[0], paramRange[1], paramRange[2])
    estimator_counts = np.arange(estimatorsRange[0], estimatorsRange[1], estimatorsRange[2])
    collected = []
    for criterion in ('gini', 'entropy'):
        estimator.criterion = criterion
        for n_trees in estimator_counts:
            estimator.n_estimators = n_trees
            run_title = title + ". Criterion: " + criterion + ". Estimators: " + str(n_trees)
            train_sc, test_sc = validation_curve(estimator, Xn, y, param_name=param_name, param_range=param_values, cv=cv)
            # Best-param pick (and optional plot) delegated to the helper.
            entry = {'model': run_title, 'param_name': param_name}
            entry.update(self.plotTrainTest(train_sc, test_sc, param_values, t=run_title, xlabel=param_name, plot=plot))
            collected.append(entry)
    return self.scoreModelListDf(collected, trainW=trainW, testW=testW)
def find_best_model(df, contaminant, verbose=False):
    """Pick the better of kNN vs Random Forest for one contaminant.

    Splits the rows for `contaminant` into train/test, then sweeps the
    model-size parameter p in [2, 100): n_neighbors for kNN and
    n_estimators for the forest, scoring each on the held-out set.

    Returns (contaminant, model_name, (best_param, best_score)).
    Ties go to the random forest (strict > comparison).
    """
    train_data, test_data = splitData(df[df.contaminant == contaminant])
    if verbose:
        print('Contaminant ', contaminant)
        print('Status Levels: ', df.status.unique())
        print('Status Codes: ', df.status_numeric.unique())
        # NOTE(review): .size is rows*columns, not a row count — use len() if
        # the intent is sample counts.
        print('train data sample size', train_data.size)
        print('test data sample size', test_data.size)
    features = ['lat', 'lng', 'time_delta']
    RF = RandomForestClassifier()
    kNN = KNeighborsClassifier()
    kNN_scores = []
    RF_scores = []
    for p in range(2, 100):
        kNN.n_neighbors = p
        RF.n_estimators = p
        kNN.fit(X=train_data[features], y=train_data.status_numeric)
        kNN_scores.append((p, kNN.score(X=test_data[features], y=test_data.status_numeric)))
        RF.fit(X=train_data[features], y=train_data.status_numeric)
        RF_scores.append((p, RF.score(X=test_data[features], y=test_data.status_numeric)))
    # Compute each winner once instead of re-running max() three times.
    best_knn = max(kNN_scores, key=lambda s: s[1])
    best_rf = max(RF_scores, key=lambda s: s[1])
    if best_knn[1] > best_rf[1]:
        return contaminant, "kNN", best_knn
    return contaminant, "RF", best_rf
# Sweep the number of trees in a random forest and report mean 10-fold CV
# accuracy on the wine dataset for each ensemble size.
from sklearn.model_selection import cross_val_score

# NOTE(review): hard-coded absolute path — works only on the author's machine.
df = pd.read_csv('/Users/sherry/Downloads/wine.csv')
#df.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium','Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
#print('Class labels', np.unique(df['Class label']))
df.head()

# NOTE(review): X takes columns 1: while y is column 13 — if the frame has 14
# columns the target column is also inside X (target leakage); confirm the
# CSV's column layout before trusting these scores.
X, y = df.iloc[:, 1:].values, df.iloc[:, 13].values
# 90/10 stratified split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=500)
# Re-use one estimator object, varying only n_estimators per iteration;
# cross_val_score clones it internally, so no state leaks between runs.
for i in [1, 2, 5, 10, 25, 50, 100, 500]:
    forest.n_estimators = i
    # y_pred=forest.predict(X_test)
    score = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=10, scoring='accuracy')
    # score=metrics.accuracy_score(y_test, y_pred)
    print(np.mean(score))  # mean CV accuracy for this ensemble size

# Final fit at the last-assigned ensemble size (500).
forest.fit(X_train, y_train)
feat_labels = df.columns[:-1]

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [1, 2, 5, 10, 25, 50, 100, 500]}
# Fragment of an incremental train/evaluate loop: refit `clf` on the rows
# gathered so far and check its prediction for the current `row`.
# Relies on `train`, `row`, `clf`, `i` and `useful_match2` defined upstream.
train_data2 = train[[
    "num_commits_open", "lines_modified_open", "files_modified_open",
    "commits_on_files_touched", "branch_hotness"
]]
train_label2 = train["useful"]
predict_data2 = row[[
    "num_commits_open", "lines_modified_open", "files_modified_open",
    "commits_on_files_touched", "branch_hotness"
]]
# Wrap the single row as a one-row DataFrame so predict() accepts it.
predict_data2 = pd.DataFrame([predict_data2])
predict_label2 = row[["useful"]]
if train_data2.shape[0] != 0:
    # Grow the forest by 100 trees for this refit — presumably clf was built
    # with warm_start=True; TODO confirm at the construction site.
    clf.n_estimators = clf.n_estimators + 100
    clf.fit(train_data2, train_label2)
    result_predict = clf.predict(predict_data2)
    # predict_proba returns [[P(useful), P(not useful)], ...];
    # only the "useful" probability is needed.
    result_predict_proba = clf.predict_proba(predict_data2)
    # Extract just the "useful" column.
    print(result_predict_proba[:, 0])
    print("predict=", result_predict)
    # NOTE(review): comparison yields a 1-element boolean array; truthiness of
    # a multi-element array would raise — safe only for single-row predictions.
    result = (result_predict == predict_label2.values)
    if result:
        useful_match2[i] = useful_match2[i] + 1
plt.show()

# ## Comparison
# ### Accuracy vs. number of trees

# One forest object is re-fitted with a growing tree count; train and test
# accuracy are recorded at every size so the two curves can be compared.
rf = RandomForestClassifier(
    n_estimators=200,                   # Number of Trees grown
    max_features=min(10, n_features),   # Number of randomly picked features for split
    max_depth=5,                        # Max Number of nested splits
    random_state=42,
)
records = []
for n_trees in range(1, 150):
    rf.n_estimators = n_trees
    rf.fit(X_train, y_train)
    records.append({
        'n_estimators': n_trees,
        'train': rf.score(X_train, y_train),
        'test': rf.score(X_test, y_test),
    })
res = pandas.DataFrame(records)
res.plot('n_estimators')
plt.ylabel('Accuracy')
plt.xlabel('Number of trees')
plt.legend(loc='center left',
           bbox_to_anchor=(1, 0.5),
           title='Dataset',
           fancybox=False)
plt.savefig('RF_accuracy_number_of_trees.png', dpi=300, transparent=True)
plt.tight_layout()
# Clip the current plot to the precomputed axis bounds (from upstream code).
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

from sklearn.datasets import make_blobs

# Two synthetic 2-D clusters for visualising the decision boundary.
X, y = make_blobs(n_samples=20, n_features=2, centers=2, cluster_std=2, random_state=3)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolors='k')

from sklearn.ensemble import RandomForestClassifier

# Depth-1 stumps; start with a single tree.
clf = RandomForestClassifier(random_state=8)
clf.max_depth = 1
clf.n_estimators = 1
clf.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
# plotBoundary is a project helper that draws clf's decision regions over X.
plotBoundary(X, clf)
# Re-train from scratch with 3..9 trees and plot each boundary.
# (warm_start is not set, so each fit() rebuilds the whole forest.)
for i in range(3, 10):
    clf.n_estimators = i
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plotBoundary(X, clf)
    plt.title("{0} estimators".format(i))
    plt.show()
import pandas as pd  #1
from sklearn.model_selection import train_test_split  #2
from sklearn.ensemble import RandomForestClassifier  #3
from sklearn.metrics import confusion_matrix  #4
# import libs

# Load the dataset.
datas = pd.read_csv("datas.csv")  # read datas #1

# Positional feature/target selection: drop the first 3 and last 3 columns
# for X; the second-to-last column is the label.
x = datas.iloc[:, 3:-3].values
y = datas.iloc[:, -2].values  # split values

# 90% for train, 10% for test. #2
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0)

# Configure the forest up front instead of mutating attributes afterwards. #3
rfc = RandomForestClassifier(
    n_estimators=1,        # a single tree
    criterion="entropy",   # select criterion, other criterion is 'gini'
    max_depth=100,
)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

# Summarise performance as a confusion matrix. #4
cm = confusion_matrix(y_test, y_pred)
print("RFC")
print(cm)
model = RandomForestClassifier(n_jobs=6) if args.CV: parameters = {'n_estimators': [150, 175, 200], 'oob_score': [True, False]} from sklearn import grid_search clf = grid_search.GridSearchCV(model, parameters, cv=4, verbose=10, n_jobs=1) print 'Grid Search for the model' clf.fit(X_trn, y_trn) print clf.best_params_ model.n_estimators = clf.best_params_['n_estimators'] model.oob_score = clf.best_estimator_['oob_score'] else: model.n_estimators = 600 model.oob_score = False model.max_depth = 20 model.n_jobs = 20 from sklearn import cross_validation as cv if args.SGD: from SGDRank import SGDClassifier model = SGDClassifier()
# %% from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=50, max_depth=50, random_state=10, bootstrap=False, warm_start=True, criterion='entropy', class_weight={ 0: 0.01, 1: 1 }) clf.n_estimators = 50 clf.fit(x_train, y_train) predicted_labels = clf.predict(x_test) hits = np.sum((predicted_labels == 0) * (y_test == 0)) / np.sum(y_test == 0) fp = np.sum((predicted_labels == 0) * (y_test > 0)) / np.sum(y_test > 0) confusion_matrix = np.zeros((5, 5)) for i in range(5): for j in range(5): confusion_matrix[i, j] = np.sum( (predicted_labels == i) * (y_test == j)) / np.sum(y_test == j) overall = np.sum(((predicted_labels == 0) * (y_test == 0)) +
def app_flow(self):
    """Drive the federated random-forest workflow as a state machine.

    The same loop serves both roles: the master gathers pickled payloads
    from all clients in self.data_incoming and broadcasts aggregates via
    self.data_outgoing, while slaves send local results and wait for the
    global answer. One iteration per second until finished.
    """
    # === States ===
    state_initializing = 1
    state_read_input = 2
    state_share_samples = 3  # NOTE(review): declared but never entered in this view
    state_gather_1 = 4
    state_wait_1 = 5
    state_train_local = 6
    state_gather_2 = 7
    state_wait_2 = 8
    state_global_ready = 9
    state_finishing = 10

    # Initial state
    state = state_initializing
    self.progress = 'initializing...'

    while True:
        if state == state_initializing:
            if self.id is not None:  # Test if setup has happened already
                state = state_read_input

        # COMMON PART (runs on master and slaves alike)
        if state == state_read_input:
            print('Reading input...')
            base_dir = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))

            # Read one training CSV; optionally carve off a local test split.
            def read_input_train(ins, path):
                d = pd.read_csv(path, sep=self.sep)
                data_X = d.drop(self.label, axis=1)
                data_y = d[self.label]
                if ins.split_test is not None:
                    # NOTE(review): re-reads ins.input_train into ins.data here,
                    # which looks redundant with `d` above — confirm intent.
                    ins.data = pd.read_csv(os.path.join(base_dir, ins.input_train), sep=ins.sep)
                    data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_X, data_y, test_size=ins.split_test)
                    ins.data_X_train.append(data_X_train)
                    ins.data_y_train.append(data_y_train)
                    ins.data_X_test.append(data_X_test)
                    ins.data_y_test.append(data_y_test)
                else:
                    ins.data_X_train.append(data_X)
                    ins.data_y_train.append(data_y)

            # Read one dedicated test CSV into the test lists.
            def read_input_test(ins, path):
                d = pd.read_csv(path, sep=ins.sep)
                data_X = d.drop(ins.label, axis=1)
                data_y = d[ins.label]
                ins.data_X_test.append(data_X)
                ins.data_y_test.append(data_y)

            if self.split_mode == 'directory':
                # One sub-directory per data split.
                for split_name in os.listdir(base_dir):
                    read_input_train(self, os.path.join(base_dir, split_name, self.input_train))
                    if self.input_test is not None:
                        read_input_test(self, os.path.join(base_dir, split_name, self.input_test))
            elif self.split_mode == 'file':
                read_input_train(self, os.path.join(base_dir, self.input_train))
                if self.input_test is not None:
                    read_input_test(self, os.path.join(base_dir, self.input_test))

            split_samples = [i.shape[0] for i in self.data_y_train]
            # Integer mean of per-split sample counts.
            self.my_samples = sum(split_samples) // len(split_samples)
            print(f'Read input. Have {split_samples} samples.')

            if self.master:
                # Master queues its own count alongside the clients' ones.
                self.data_incoming.append(pickle.dumps({
                    'samples': self.my_samples
                }))
                state = state_gather_1
            else:
                self.data_outgoing = pickle.dumps({
                    'samples': self.my_samples
                })
                self.status_available = True
                state = state_wait_1

        if state == state_train_local:
            print('Calculate local values...')
            rfs = []
            for i in range(len(self.data_X_train)):
                global_rf = None
                # Local tree budget proportional to this client's share of
                # the total sample count.
                trees = int(self.estimators_total * self.my_samples / self.total_samples)
                if self.mode == 'classification':
                    global_rf = RandomForestClassifier(n_estimators=trees, random_state=self.random_state)
                elif self.mode == 'regression':
                    global_rf = RandomForestRegressor(n_estimators=trees, random_state=self.random_state)
                global_rf.fit(self.data_X_train[i], self.data_y_train[i])
                rfs.append({
                    'rf': global_rf,
                })
            print(f'Trained random forests')
            if self.master:
                self.data_incoming.append(pickle.dumps(rfs))
                state = state_gather_2
            else:
                self.data_outgoing = pickle.dumps(rfs)
                self.status_available = True
                state = state_wait_2

        if state == state_global_ready:
            print(f'Forest done')
            # Evaluate the merged global forests on the local test data.
            results_pred = []
            results_proba = []
            results_test = []
            for i in range(len(self.data_X_train)):
                results_pred.append(self.rfs[i].predict(self.data_X_test[i]))
                if self.mode == 'classification':
                    results_proba.append(self.rfs[i].predict_proba(self.data_X_test[i]))
                results_test.append(self.data_y_test[i])

            # Write one CSV per result dict, using the configured separator.
            def write_output(path, data):
                df = pd.DataFrame(data=data)
                df.to_csv(path, index=False, sep=self.sep)

            print(f'Writing output')
            base_dir_in = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))
            base_dir_out = os.path.normpath(os.path.join(f'/mnt/output/', self.split_dir))
            if self.split_mode == 'directory':
                # NOTE(review): assumes os.listdir order matches the read
                # phase so index i lines up — confirm.
                for i, split_name in enumerate(os.listdir(base_dir_in)):
                    write_output(os.path.join(base_dir_out, split_name, self.output_pred), {'pred': results_pred[i][:]})
                    if self.mode == 'classification':
                        # predict_proba columns: class 0 then class 1 — binary
                        # classification assumed here.
                        write_output(os.path.join(base_dir_out, split_name, self.output_proba),
                                     {'prob_0': results_proba[i][:, 0],
                                      'prob_1': results_proba[i][:, 1]})
                    write_output(os.path.join(base_dir_out, split_name, self.output_test), {'y_true': results_test[i]})
            elif self.split_mode == 'file':
                write_output(os.path.join(base_dir_out, self.output_pred), {'pred': results_pred[0][:]})
                if self.mode == 'classification':
                    write_output(os.path.join(base_dir_out, self.output_proba),
                                 {'prob_0': results_proba[0][:, 0],
                                  'prob_1': results_proba[0][:, 1]})
                write_output(os.path.join(base_dir_out, self.output_test), {'y_true': results_test[0]})
            if self.master:
                self.data_incoming.append('DONE')
                state = state_finishing
            else:
                # Slave is done once its DONE marker is queued for pickup.
                self.data_outgoing = 'DONE'
                self.status_available = True
                break

        # GLOBAL PART (master only)
        if state == state_gather_1:
            if len(self.data_incoming) == len(self.clients):
                client_data = []
                for local_rfs in self.data_incoming:
                    client_data.append(pickle.loads(local_rfs))
                self.data_incoming = []
                # Sum all clients' sample counts and broadcast the total.
                total_samples = sum([cd['samples'] for cd in client_data])
                self.total_samples = total_samples
                self.data_outgoing = pickle.dumps(total_samples)
                self.status_available = True
                state = state_train_local
            else:
                print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

        if state == state_gather_2:
            if len(self.data_incoming) == len(self.clients):
                client_data = []
                for local_rfs in self.data_incoming:
                    client_data.append(pickle.loads(local_rfs))
                self.data_incoming = []
                data_outgoing = []
                for i in range(len(self.data_X_train)):
                    global_rf = None
                    # total_samples = 0
                    # for d in client_data:
                    #     total_samples += d[i]['samples']
                    # Merge: concatenate every client's fitted trees into one
                    # forest (the first client's forest is the carrier object).
                    for d in client_data:
                        drf = d[i]['rf']
                        # perc = d[i]['samples'] / total_samples
                        # trees = int(perc * self.estimators_total)
                        if global_rf is None:
                            global_rf = drf
                            global_rf.estimators_ = drf.estimators_
                            # global_rf.estimators_ = random.sample(drf.estimators_, trees)
                            global_rf.n_estimators = drf.n_estimators
                        else:
                            global_rf.estimators_ += drf.estimators_
                            # global_rf.estimators_ += random.sample(drf.estimators_, trees)
                            global_rf.n_estimators += drf.n_estimators
                    data_outgoing.append(global_rf)
                self.rfs = data_outgoing
                # Broadcast the merged global forests to all clients.
                self.data_outgoing = pickle.dumps(data_outgoing)
                self.status_available = True
                state = state_global_ready
            else:
                print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

        if state == state_finishing:
            # Wait until every client has acknowledged with DONE.
            if len(self.data_incoming) == len(self.clients):
                self.status_finished = True
                break

        # LOCAL PART (slaves only)
        if state == state_wait_1:
            if len(self.data_incoming) > 0:
                self.total_samples = pickle.loads(self.data_incoming[0])
                self.data_incoming = []
                state = state_train_local

        if state == state_wait_2:
            if len(self.data_incoming) > 0:
                self.rfs = pickle.loads(self.data_incoming[0])
                self.data_incoming = []
                state = state_global_ready

        time.sleep(1)
######################################################################
# Sweep the random-forest ensemble size and collect 10-fold CV accuracy
# (mean and standard deviation per size).
#
# The original assigned n_estimators_space four times in a row; only the
# final logspace grid ever took effect, so the dead assignments are removed.
n_estimators_space = np.logspace(0, 3, 10, dtype=int)

rf_scores = []       # mean CV accuracy per ensemble size
rf_scores_std = []   # std-dev of CV accuracy per ensemble size

rfclass = RandomForestClassifier(n_estimators=500)

# Compute scores over the range of ensemble sizes.
for n_estimators in n_estimators_space:
    # Set the forest size for this run (cross_val_score clones the estimator).
    rfclass.n_estimators = n_estimators

    # Perform 10-fold CV.
    rf_cv_scores = cross_val_score(rfclass, X_train, y_train, cv=10, scoring='accuracy')

    # Record mean and spread of the fold scores.
    rf_scores.append(np.mean(rf_cv_scores))
    rf_scores_std.append(np.std(rf_cv_scores))

# Display the plot