Example #1
0
def train_model(train_x, train_y, test_x, test_y):
	"""Incrementally train a warm-start RandomForestClassifier on mini-batches,
	report mean IOU on the test data, and persist the model to 'model.pkl'.

	train_x/train_y: training samples and labels (batch-sliceable).
	test_x/test_y:   test samples and labels.  Only test_x[0] is predicted,
	                 yet pred is zipped against all of test_y below --
	                 NOTE(review): confirm these shapes line up as intended.
	"""
	print("start train model")

	# warm_start=True makes each fit() call grow the existing forest instead of
	# retraining; n_estimators is raised by 4 after every batch so the first
	# fit builds 4 trees, the second 8, and so on.
	trained_model = RandomForestClassifier(verbose=1, n_estimators=4, warm_start=True, n_jobs=-1)

	batch_size = 2
	# Number of batches, rounded up so a trailing partial batch still trains.
	# (The original `len(train_x) / batch_size + 1` produced a float under
	# Python 3; ceil-division keeps the same iteration count with an int.)
	split = (len(train_x) + batch_size - 1) // batch_size + 1
	print("Split data to :" + str(split - 1))
	step = 1

	while (step < split):
		print("Step number: " + str(step - 1))
		print("length of batch: " + str(len(train_x[batch_size*(step-1):step*batch_size])))
		trained_model.fit(train_x[batch_size*(step-1):step*batch_size], train_y[batch_size*(step-1):step*batch_size])
		trained_model.n_estimators = trained_model.n_estimators + 4
		step += 1

	# Undo the extra increment applied after the final fit.
	trained_model.n_estimators = trained_model.n_estimators - 4
	print("Trained model :: " + str(trained_model))

	# Predict the first test batch and cast each prediction to an int array.
	predictions = trained_model.predict(test_x[0])
	pred = [np.array(p, dtype=int) for p in predictions]

	print(predictions)
	print(np.mean(predictions[0]))
	print(np.mean(test_y[0]))

	# Mean IOU over predicted samples (cal_miou is defined elsewhere in the file).
	miou = 0
	for p, y in zip(pred, test_y):
		miou = miou + cal_miou(p, y)

	miou = miou / len(pred)
	print("mIOU :: " + str(miou))

	s = joblib.dump(trained_model, 'model.pkl', compress=9)
	print("model saved")
	def rfcScores(self,Xn,y,cv=5,param_name='max_depth',estimatorsRange=(10,11,1),paramRange=(1,10,1),trainW=1,testW=2,title='Random Forest classifier',clfArg=None,plot=False):
		"""
		Run validation_curve with a Random Forest classifier (RFC) over
		param_range for every combination of criterion ('gini'/'entropy')
		and n_estimators in estimatorsRange, then return a weighted-score
		summary via self.scoreModelListDf.

		cv               -- cross-validation k-fold.
		param_name       -- RFC parameter to optimize (default 'max_depth').
		paramRange       -- (start, stop, step) range evaluated for param_name.
		estimatorsRange  -- (start, stop, step) range for n_estimators.
		trainW/testW     -- weights used to compute
		                    weighted_score=(test_score*testW+train_score*trainW)/(testW+trainW).
		clfArg           -- dict of extra RFC constructor arguments; defaults to
		                    None (treated as {}) to avoid the mutable-default
		                    argument pitfall of the original `clfArg={}`.
		plot             -- set True to see how the best score is collected.
		"""
		# None sentinel -> fresh empty dict per call.
		clf=RFC(**(clfArg if clfArg is not None else {}))
		model_scores=list()
		param_range=np.arange(paramRange[0],paramRange[1],paramRange[2])
		e_range=np.arange(estimatorsRange[0],estimatorsRange[1],estimatorsRange[2])
		criterions=['gini','entropy']
		for criterion in criterions:
			clf.criterion=criterion
			for e in e_range:
				clf.n_estimators=e
				# Descriptive title for this (criterion, n_estimators) combination.
				dtitle=title+". Criterion: "+criterion+". Estimators: "+str(e)
				train_sc, test_sc = validation_curve(clf,Xn,y,param_name=param_name,param_range=param_range,cv=cv)
				# plotTrainTest picks the best param value from the curves.
				param_score=self.plotTrainTest(train_sc,test_sc,param_range,t=dtitle,xlabel=param_name,plot=plot)
				scoreDic={'model':dtitle,'param_name':param_name}
				scoreDic.update(param_score)
				model_scores.append(scoreDic.copy())
		return self.scoreModelListDf(model_scores,trainW=trainW,testW=testW)
Example #3
0
def find_best_model(df, contaminant, verbose=False):
    """For one contaminant, sweep kNN (n_neighbors) and Random Forest
    (n_estimators) over p in [2, 100) and return the winner as
    (contaminant, model_name, (best_p, best_score)).

    df          -- DataFrame with lat, lng, time_delta, status,
                   status_numeric and contaminant columns.
    contaminant -- value used to filter df.contaminant.
    verbose     -- print split/label diagnostics when True.
    """
    train_data, test_data = splitData(df[df.contaminant == contaminant])

    ### make sure the values make sense:
    if verbose:
        print('Contaminant ', contaminant)
        print('Status Levels: ', df.status.unique())
        print('Status Codes: ', df.status_numeric.unique())
        print('train data sample size', train_data.size)
        print('test data sample size', test_data.size)

    # Feature columns used by both models (named once instead of four times).
    features = ['lat', 'lng', 'time_delta']

    # create model templates; the swept hyper-parameter is reassigned per round
    RF = RandomForestClassifier()
    kNN = KNeighborsClassifier()

    kNN_scores = []
    RF_scores = []
    for p in range(2, 100):
        kNN.n_neighbors = p
        RF.n_estimators = p

        kNN.fit(X=train_data[features], y=train_data.status_numeric)
        kNN_scores.append((p,
                           kNN.score(X=test_data[features],
                                     y=test_data.status_numeric)))

        RF.fit(X=train_data[features], y=train_data.status_numeric)
        RF_scores.append((p,
                          RF.score(X=test_data[features],
                                   y=test_data.status_numeric)))

    # find the most accurate model and parameter -- each max() computed once
    # instead of up to three times in the original.
    best_kNN = max(kNN_scores, key=lambda x: x[1])
    best_RF = max(RF_scores, key=lambda x: x[1])
    if best_kNN[1] > best_RF[1]:
        return contaminant, "kNN", best_kNN
    else:
        return contaminant, "RF", best_RF
from sklearn.model_selection import cross_val_score

# Load the wine dataset: first column is the class label, the rest are features.
df = pd.read_csv('/Users/sherry/Downloads/wine.csv')
df.head()
X, y = df.iloc[:, 1:].values, df.iloc[:, 13].values

# Stratified 90/10 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    stratify=y)

from sklearn.ensemble import RandomForestClassifier

# Report mean 10-fold CV accuracy for an increasing number of trees.
forest = RandomForestClassifier(n_estimators=500)
for n_trees in [1, 2, 5, 10, 25, 50, 100, 500]:
    forest.n_estimators = n_trees
    cv_scores = cross_val_score(estimator=forest,
                                X=X_train,
                                y=y_train,
                                cv=10,
                                scoring='accuracy')
    print(np.mean(cv_scores))

# Final fit on the full training split.
forest.fit(X_train, y_train)
feat_labels = df.columns[:-1]

from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [1, 2, 5, 10, 25, 50, 100, 500]}
Example #5
0
        # Grow `clf` on the training rows accumulated so far, then score one
        # held-out row.  NOTE(review): `train`, `row`, `clf`, `i` and
        # `useful_match2` come from the enclosing scope, not visible here.
        train_data2 = train[[
            "num_commits_open", "lines_modified_open", "files_modified_open",
            "commits_on_files_touched", "branch_hotness"
        ]]
        train_label2 = train["useful"]

        # Single row wrapped into a one-row DataFrame so predict() accepts it.
        predict_data2 = row[[
            "num_commits_open", "lines_modified_open", "files_modified_open",
            "commits_on_files_touched", "branch_hotness"
        ]]
        predict_data2 = pd.DataFrame([predict_data2])
        predict_label2 = row[["useful"]]

        if train_data2.shape[0] != 0:
            # Add 100 trees before refitting -- presumably clf was built with
            # warm_start=True; confirm in the constructor outside this fragment.
            clf.n_estimators = clf.n_estimators + 100
            clf.fit(train_data2, train_label2)

        result_predict = clf.predict(predict_data2)
        # predict_proba gives the per-class probabilities, so use it here:
        # [[useful prob, not-useful prob], ...] -- only the useful one is needed.
        result_predict_proba = clf.predict_proba(predict_data2)

        # Extract only the probability of the 'useful' class.
        print(result_predict_proba[:, 0])
        print("predict=", result_predict)

        # Count a hit when the prediction matches the held-out label.
        result = (result_predict == predict_label2.values)
        if result:
            useful_match2[i] = useful_match2[i] + 1
    plt.show()

# ## Comparison

# ### Accuracy vs. number of trees

# Baseline forest.  n_estimators is overwritten inside the loop below, so the
# constructor's 200 never drives an actual fit.
rf = RandomForestClassifier(
    n_estimators=200,  # Number of Trees grown (overridden per iteration)
    max_features=min(
        10, n_features),  # Number of randomly picked features for split 
    max_depth=5,  # Max Number of nested splits
    random_state=42,
)
# Record train/test accuracy for 1..149 trees.
res = []
for i in range(1, 150, 1):
    rf.n_estimators = i
    rf.fit(X_train, y_train)
    # Single dict literal replaces the original dict() + two update() calls.
    res.append({
        'n_estimators': i,
        'train': rf.score(X_train, y_train),
        'test': rf.score(X_test, y_test),
    })
res = pandas.DataFrame(res)
res.plot('n_estimators')
plt.ylabel('Accuracy')
plt.xlabel('Number of trees')
plt.legend(loc='center left',
           bbox_to_anchor=(1, 0.5),
           title='Dataset',
           fancybox=False)
# Apply the layout fix *before* saving so it affects the written PNG; the
# original called savefig first, making tight_layout a no-op for the file.
plt.tight_layout()
plt.savefig('RF_accuracy_number_of_trees.png', dpi=300, transparent=True)
Example #7
0
    # Clamp the axes to precomputed bounds (x_min/x_max/y_min/y_max are
    # defined earlier in the enclosing function, outside this fragment).
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)


from sklearn.datasets import make_blobs

# Two spread-out 2-D blobs, 20 points in total.
X, y = make_blobs(n_samples=20,
                  n_features=2,
                  centers=2,
                  cluster_std=2,
                  random_state=3)

plt.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolors='k')

from sklearn.ensemble import RandomForestClassifier

# Start with a single depth-1 tree (a decision stump) and plot its boundary.
clf = RandomForestClassifier(random_state=8)
clf.max_depth = 1
clf.n_estimators = 1
clf.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plotBoundary(X, clf)

# Refit with progressively more stumps and show each resulting boundary.
for n_trees in range(3, 10):
    clf.n_estimators = n_trees
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plotBoundary(X, clf)
    plt.title("{0} estimators".format(n_trees))
    plt.show()
# Libraries: pandas for I/O (#1), sklearn for split (#2), model (#3), metrics (#4).
import pandas as pd  #1
from sklearn.model_selection import train_test_split  #2
from sklearn.ensemble import RandomForestClassifier  #3
from sklearn.metrics import confusion_matrix  #4

datas = pd.read_csv("datas.csv")  # load the dataset
#1

# Select feature columns and the target column.
x = datas.iloc[:, 3:-3].values
y = datas.iloc[:, -2].values

# Deterministic split: 90% train, 10% test.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0)
#2

# Same hyper-parameters as attribute-by-attribute assignment, expressed as
# constructor arguments: depth-limited single entropy tree.
rfc = RandomForestClassifier(max_depth=100,
                             criterion="entropy",  # alternative: 'gini'
                             n_estimators=1)
#3
rfc.fit(x_train, y_train)

# Evaluate with a confusion matrix.
y_pred = rfc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
#4
print("RFC")
print(cm)
Example #9
0
    model = RandomForestClassifier(n_jobs=6)

    if args.CV:
        parameters = {'n_estimators': [150, 175, 200],
                      'oob_score': [True, False]}

        from sklearn import grid_search
        clf = grid_search.GridSearchCV(model, parameters,
                                       cv=4, verbose=10,
                                       n_jobs=1)
        print 'Grid Search for the model'
        clf.fit(X_trn, y_trn)
        print clf.best_params_

        model.n_estimators = clf.best_params_['n_estimators']
        model.oob_score = clf.best_estimator_['oob_score']

    else:
        model.n_estimators = 600
        model.oob_score = False
        model.max_depth = 20
        model.n_jobs = 20
 

    from sklearn import cross_validation as cv
  
    if args.SGD: 
        from SGDRank import SGDClassifier
        model = SGDClassifier()
    
# %%

from sklearn.ensemble import RandomForestClassifier

# Heavily class-weighted forest: class 0 is down-weighted 100x relative to class 1.
clf = RandomForestClassifier(n_estimators=50,
                             max_depth=50,
                             random_state=10,
                             bootstrap=False,
                             warm_start=True,
                             criterion='entropy',
                             class_weight={
                                 0: 0.01,
                                 1: 1
                             })

clf.n_estimators = 50
clf.fit(x_train, y_train)

predicted_labels = clf.predict(x_test)

# Hit rate: share of true class-0 samples predicted as 0.
# fp: share of non-zero samples (mis)predicted as 0.
is_pred_zero = predicted_labels == 0
hits = np.sum(is_pred_zero * (y_test == 0)) / np.sum(y_test == 0)
fp = np.sum(is_pred_zero * (y_test > 0)) / np.sum(y_test > 0)

# Column-normalised 5x5 confusion matrix: entry (i, j) is the fraction of
# true-class-j samples that were predicted as class i.
confusion_matrix = np.zeros((5, 5))
for pred_cls in range(5):
    for true_cls in range(5):
        confusion_matrix[pred_cls, true_cls] = np.sum(
            (predicted_labels == pred_cls) *
            (y_test == true_cls)) / np.sum(y_test == true_cls)

overall = np.sum(((predicted_labels == 0) * (y_test == 0)) +
Example #11
0
    def app_flow(self):
        """Run the master/slave coordination loop as a state machine.

        Flow: read local input splits -> exchange sample counts -> train a
        local random forest sized by this client's share of all samples ->
        (master) merge every client's forests into global forests -> predict
        the local test splits and write pred/proba/ground-truth CSVs under
        /mnt/output -> finish.

        NOTE(review): relies on many attributes initialised elsewhere
        (self.id, self.master, self.clients, self.sep, self.label,
        self.split_dir, self.split_mode, self.mode, self.estimators_total,
        self.random_state, the self.data_* buffers, ...) -- confirm against
        the class setup code outside this view.
        """
        # This method contains a state machine for the slave and master instance

        # === States ===
        state_initializing = 1
        state_read_input = 2
        state_share_samples = 3
        state_gather_1 = 4
        state_wait_1 = 5
        state_train_local = 6
        state_gather_2 = 7
        state_wait_2 = 8
        state_global_ready = 9
        state_finishing = 10

        # Initial state
        state = state_initializing
        self.progress = 'initializing...'

        while True:

            if state == state_initializing:
                if self.id is not None:  # Test if setup has happened already
                    state = state_read_input

            # COMMON PART

            if state == state_read_input:
                print('Reading input...')
                base_dir = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))

                # NOTE(review): reads self.sep/self.label rather than ins.sep/
                # ins.label -- equivalent only because callers pass ins=self.
                def read_input_train(ins, path):
                    d = pd.read_csv(path, sep=self.sep)
                    data_X = d.drop(self.label, axis=1)
                    data_y = d[self.label]

                    if ins.split_test is not None:
                        # NOTE(review): ins.data re-reads ins.input_train here,
                        # but the split below uses data_X/data_y -- confirm intent.
                        ins.data = pd.read_csv(os.path.join(base_dir, ins.input_train), sep=ins.sep)
                        data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_X, data_y, test_size=ins.split_test)
                        ins.data_X_train.append(data_X_train)
                        ins.data_y_train.append(data_y_train)
                        ins.data_X_test.append(data_X_test)
                        ins.data_y_test.append(data_y_test)
                    else:
                        ins.data_X_train.append(data_X)
                        ins.data_y_train.append(data_y)

                # Test files contribute only to the X/y test buffers.
                def read_input_test(ins, path):
                    d = pd.read_csv(path, sep=ins.sep)
                    data_X = d.drop(ins.label, axis=1)
                    data_y = d[ins.label]
                    ins.data_X_test.append(data_X)
                    ins.data_y_test.append(data_y)

                # 'directory' mode: one sub-directory per split;
                # 'file' mode: single train (and optional test) file.
                if self.split_mode == 'directory':
                    for split_name in os.listdir(base_dir):
                        read_input_train(self, os.path.join(base_dir, split_name, self.input_train))
                        if self.input_test is not None:
                            read_input_test(self, os.path.join(base_dir, split_name, self.input_test))
                elif self.split_mode == 'file':
                    read_input_train(self, os.path.join(base_dir, self.input_train))
                    if self.input_test is not None:
                        read_input_test(self, os.path.join(base_dir, self.input_test))

                # NOTE(review): this is the *mean* samples-per-split (floor
                # division), not the total -- confirm that is what
                # 'my_samples' is meant to represent.
                split_samples = [i.shape[0] for i in self.data_y_train]
                self.my_samples = sum(split_samples) // len(split_samples)

                print(f'Read input. Have {split_samples} samples.')

                # Master queues its own count locally; slaves send theirs out.
                if self.master:
                    self.data_incoming.append(pickle.dumps({
                        'samples': self.my_samples
                    }))
                    state = state_gather_1
                else:
                    self.data_outgoing = pickle.dumps({
                        'samples': self.my_samples
                    })
                    self.status_available = True
                    state = state_wait_1

            if state == state_train_local:
                print('Calculate local values...')

                rfs = []
                for i in range(len(self.data_X_train)):
                    global_rf = None
                    # Tree budget proportional to this client's share of all samples.
                    trees = int(self.estimators_total * self.my_samples / self.total_samples)
                    if self.mode == 'classification':
                        global_rf = RandomForestClassifier(n_estimators=trees, random_state=self.random_state)
                    elif self.mode == 'regression':
                        global_rf = RandomForestRegressor(n_estimators=trees, random_state=self.random_state)
                    global_rf.fit(self.data_X_train[i], self.data_y_train[i])
                    rfs.append({
                        'rf': global_rf,
                    })

                print(f'Trained random forests')

                if self.master:
                    self.data_incoming.append(pickle.dumps(rfs))
                    state = state_gather_2
                else:
                    self.data_outgoing = pickle.dumps(rfs)
                    self.status_available = True
                    state = state_wait_2

            if state == state_global_ready:
                print(f'Forest done')

                # Predict each local test split with the merged global forest.
                results_pred = []
                results_proba = []
                results_test = []
                for i in range(len(self.data_X_train)):
                    results_pred.append(self.rfs[i].predict(self.data_X_test[i]))
                    if self.mode == 'classification':
                        results_proba.append(self.rfs[i].predict_proba(self.data_X_test[i]))
                    results_test.append(self.data_y_test[i])

                # Write one CSV per results dict, using the configured separator.
                def write_output(path, data):
                    df = pd.DataFrame(data=data)
                    df.to_csv(path, index=False, sep=self.sep)

                print(f'Writing output')
                base_dir_in = os.path.normpath(os.path.join(f'/mnt/input/', self.split_dir))
                base_dir_out = os.path.normpath(os.path.join(f'/mnt/output/', self.split_dir))
                if self.split_mode == 'directory':
                    for i, split_name in enumerate(os.listdir(base_dir_in)):
                        write_output(os.path.join(base_dir_out, split_name, self.output_pred), {'pred': results_pred[i][:]})
                        if self.mode == 'classification':
                            # NOTE(review): assumes exactly two classes (prob_0/prob_1).
                            write_output(os.path.join(base_dir_out, split_name, self.output_proba), {'prob_0': results_proba[i][:, 0], 'prob_1': results_proba[i][:, 1]})
                        write_output(os.path.join(base_dir_out, split_name, self.output_test), {'y_true': results_test[i]})
                elif self.split_mode == 'file':
                    write_output(os.path.join(base_dir_out, self.output_pred), {'pred': results_pred[0][:]})
                    if self.mode == 'classification':
                        write_output(os.path.join(base_dir_out, self.output_proba), {'prob_0': results_proba[0][:, 0], 'prob_1': results_proba[0][:, 1]})
                    write_output(os.path.join(base_dir_out, self.output_test), {'y_true': results_test[0]})

                if self.master:
                    self.data_incoming.append('DONE')
                    state = state_finishing
                else:
                    self.data_outgoing = 'DONE'
                    self.status_available = True
                    break

            # GLOBAL PART

            if state == state_gather_1:
                # Wait until every client's sample count has arrived.
                if len(self.data_incoming) == len(self.clients):

                    # SECURITY NOTE: pickle.loads on peer data -- trusted network assumed.
                    client_data = []
                    for local_rfs in self.data_incoming:
                        client_data.append(pickle.loads(local_rfs))

                    self.data_incoming = []

                    total_samples = sum([cd['samples'] for cd in client_data])

                    self.total_samples = total_samples

                    # Broadcast the total so slaves can size their forests.
                    self.data_outgoing = pickle.dumps(total_samples)
                    self.status_available = True
                    state = state_train_local

                else:
                    print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

            if state == state_gather_2:
                # Merge all clients' local forests into one global forest per split.
                if len(self.data_incoming) == len(self.clients):

                    client_data = []
                    for local_rfs in self.data_incoming:
                        client_data.append(pickle.loads(local_rfs))

                    self.data_incoming = []

                    data_outgoing = []

                    for i in range(len(self.data_X_train)):
                        global_rf = None

                        # total_samples = 0
                        # for d in client_data:
                        #     total_samples += d[i]['samples']

                        for d in client_data:
                            drf = d[i]['rf']

                            # perc = d[i]['samples'] / total_samples
                            # trees = int(perc * self.estimators_total)

                            # Concatenate fitted trees: the first client's forest
                            # is the base, later forests append their estimators_.
                            if global_rf is None:
                                global_rf = drf
                                global_rf.estimators_ = drf.estimators_
                                # global_rf.estimators_ = random.sample(drf.estimators_, trees)
                                global_rf.n_estimators = drf.n_estimators
                            else:
                                global_rf.estimators_ += drf.estimators_
                                # global_rf.estimators_ += random.sample(drf.estimators_, trees)
                                global_rf.n_estimators += drf.n_estimators

                        data_outgoing.append(global_rf)

                    self.rfs = data_outgoing

                    self.data_outgoing = pickle.dumps(data_outgoing)
                    self.status_available = True
                    state = state_global_ready

                else:
                    print(f'Have {len(self.data_incoming)} of {len(self.clients)} so far, waiting...')

            if state == state_finishing:
                if len(self.data_incoming) == len(self.clients):
                    self.status_finished = True
                    break

            # LOCAL PART

            if state == state_wait_1:
                # Slave: wait for the master's total sample count.
                if len(self.data_incoming) > 0:
                    self.total_samples = pickle.loads(self.data_incoming[0])
                    self.data_incoming = []

                    state = state_train_local

            if state == state_wait_2:
                # Slave: wait for the merged global forests.
                if len(self.data_incoming) > 0:
                    self.rfs = pickle.loads(self.data_incoming[0])
                    self.data_incoming = []

                    state = state_global_ready

            # Poll once per second.
            time.sleep(1)
Example #12
0

######################################################################
# Sweep the number of trees and record mean/std 10-fold CV accuracy.
# The effective grid is 10 log-spaced integer values in [1, 1000];
# the original's three earlier assignments (a literal list, arange(100)
# and a linspace) were dead stores immediately overwritten and have
# been removed.
n_estimators_space = np.logspace(0, 3, 10, dtype=int)
rf_scores = []
rf_scores_std = []

# n_estimators is overwritten each round, so the constructor's 500 is
# never used for a fit.
rfclass = RandomForestClassifier(n_estimators=500)
# Compute scores over the range of tree counts (the loop variable was
# called 'alpha' in the original -- copied from a ridge exercise).
for n_trees in n_estimators_space:

    # Specify the tree count to use for this round
    rfclass.n_estimators = n_trees

    # Perform 10-fold CV
    rf_cv_scores = cross_val_score(rfclass,
                                   X_train,
                                   y_train,
                                   cv=10,
                                   scoring='accuracy')

    # Append the mean of the CV scores
    rf_scores.append(np.mean(rf_cv_scores))

    # Append the std of the CV scores
    rf_scores_std.append(np.std(rf_cv_scores))

# Display the plot