def computeRandomForestCrossValidation(args, dict_algorithms):
    """Run random-forest cross-validation and record the result.

    Builds a RandomForest from ``args`` and stores the value returned by
    its ``computeCrossValidation`` under the ``"random_forest"`` key of
    ``dict_algorithms``.  Progress messages are printed when ``args.debug``
    is truthy.
    """
    verbose = args.debug
    if verbose:
        print("Running random forest...", end='')
    forest = RandomForest(args)
    dict_algorithms["random_forest"] = forest.computeCrossValidation()
    if verbose:
        print("ok!")
def miss_forest_imputation(self, matrix_for_impute):
    """Impute the missing entries of ``matrix_for_impute`` iteratively.

    For each variable in ``self.vari``, a RandomForest is fitted on the
    rows where that variable is observed (using all other columns as
    predictors) and used to fill the rows where it is missing.  The sweep
    repeats until ``self.check_converge()`` is truthy or ``self.max_iter``
    is exceeded.  The accepted matrix is stored in ``self.result_matrix``.

    Note: this method returns None; the previous docstring's claim of
    "return self" did not match the code.

    Parameters
    ----------
    matrix_for_impute : array-like
        Data matrix with missing entries (variables in columns).
    """
    self.matrix_for_impute = matrix_for_impute
    self.raw_fill()  # builds self.initial_guess_matrix with a first fill-in
    self.previous_iter_matrix = np.copy(self.initial_guess_matrix)
    self.cur_iter_matrix = np.copy(self.initial_guess_matrix)

    n_vars = len(self.vart_)  # hoisted out of the loop: column count is invariant
    cur_iter = 1
    while True:
        if cur_iter > self.max_iter:
            # Iteration budget exhausted: keep the last accepted matrix.
            self.result_matrix = self.previous_iter_matrix
            return
        print("Iteration " + str(cur_iter))
        for var in self.vari:
            vt = self.vart_[var]  # variable type tag passed through to the forest
            cur_X = self.cur_iter_matrix
            cur_obsi = self.obsi[var]  # row indices where var is observed
            cur_misi = self.misi[var]  # row indices where var is missing
            if len(cur_misi) == 0:
                continue  # nothing to impute for this variable
            p_train = np.delete(np.arange(n_vars), var)  # every column except var
            X_train = cur_X[cur_obsi, :][:, p_train]
            y_train = cur_X[cur_obsi, :][:, var]
            X_test = cur_X[cur_misi, :][:, p_train]
            rf = RandomForest(self.params)
            imp = rf.fit_predict(X_train, y_train, X_test, vt)
            self.cur_iter_matrix[cur_misi, var] = imp
        if self.check_converge():
            # Converged: by convention the previous iteration's matrix wins.
            self.result_matrix = self.previous_iter_matrix
            return
        self.previous_iter_matrix = np.copy(self.cur_iter_matrix)
        cur_iter += 1
def miss_forest_imputation(self, matrix_for_impute):
    """Impute missing values by dispatching per-variable imputation jobs.

    Batch-cluster variant of MissForest: variables are grouped by
    ``self.split_var()``; for each group, per-job argument objects are
    pickled to disk, a shell script is written via ``self.handler`` and
    submitted with ``subprocess.call`` (presumably to a scheduler such as
    Slurm — TODO confirm against the handler implementation), and result
    pickles are polled until every job reports ``done``.  The accepted
    matrix ends up in ``self.result_matrix``; returns None.
    """
    self.matrix_for_impute = matrix_for_impute
    self.raw_fill()
    vari_node = self.split_var()  # groups of variable indices, one batch of jobs per group
    self.previous_iter_matrix = np.copy(self.initial_guess_matrix)
    self.cur_iter_matrix = np.copy(self.initial_guess_matrix)
    cur_iter = 1
    while True:
        if cur_iter > self.max_iter:
            # Iteration budget exhausted: keep the last accepted matrix.
            self.result_matrix = self.previous_iter_matrix
            return
        print("iteration " + str(cur_iter))
        for i in range(len(vari_node)):
            # Snapshot the current matrix so every job in this batch reads the same X.
            cur_X = self.cur_iter_matrix
            x_path = self.handler.tmp_X_file
            with open(x_path, 'wb') as tmp:
                pickle.dump(cur_X, tmp)
            for j in range(len(vari_node[i])):
                #Prepare the jobs
                cur_vari = vari_node[i][j]  # variable indices handled by job (i, j)
                cur_vart = []
                cur_obsi = []
                cur_misi = []
                for k in range(len(vari_node[i][j])):
                    cur_vart.append(self.vart_[cur_vari[k]])
                    cur_obsi.append(self.obsi[cur_vari[k]])
                    cur_misi.append(self.misi[cur_vari[k]])
                argument_path = self.handler.get_arguments_varidx_file(i, j)
                result_path = self.handler.get_results_varidx_file(i, j)
                rf = RandomForest(self.params)
                # Serialize the job's inputs; the worker unpickles these.
                with open(argument_path, 'wb') as tmp:
                    argument_object = MissForestImputationSlurmArgumentObject(rf, cur_vart, cur_vari, cur_obsi, cur_misi)
                    pickle.dump(argument_object, tmp)
                # Pre-write a not-done results file so the poller below has
                # something to read before the worker finishes.
                with open(result_path, 'wb') as tmp:
                    # argument_object.results.done = False
                    pickle.dump(argument_object.results, tmp)
                # write job.sh and submit
                command_shell = self.handler.get_command_shell(x_path, argument_path, result_path)
                command_shell = ' '.join(command_shell)
                with open(self.handler.shell_script_path, 'w') as tmp:
                    tmp.writelines('#!/bin/bash\n')
                    tmp.writelines(command_shell)
                command = self.handler.get_command(i, j, cur_iter)
                subprocess.call(command)
            # Poll every job in this batch until all result pickles say done.
            finish = False
            finished_ind = [False] * len(vari_node[i])
            # finished_count = 0
            while finish == False:
                time.sleep(0.1)
                finish = True
                for j in range(len(vari_node[i])):
                    if finished_ind[j] == True:
                        continue  # job (i, j) already harvested
                    cur_vari = vari_node[i][j]
                    cur_obsi = []
                    cur_misi = []
                    for k in range(len(vari_node[i][j])):
                        cur_obsi.append(self.obsi[cur_vari[k]])
                        cur_misi.append(self.misi[cur_vari[k]])
                    result_path = self.handler.get_results_varidx_file(i, j)
                    try:
                        with open(result_path, 'rb') as tmp:
                            cur_result = pickle.load(tmp)
                            if cur_result.done == False:
                                finish = False
                                break
                            else:
                                # Write this job's imputed values into the matrix.
                                for k in range(len(cur_vari)):
                                    self.cur_iter_matrix[cur_misi[k], cur_vari[k]] = cur_result.imp_list[k]
                                finished_ind[j] = True
                                # if finished_ind.count(True) > finished_count:
                                #     finished_count = finished_ind.count(True)
                                #     print(finished_count, "/", len(finished_ind), "finished!")
                    except Exception as e:
                        # The worker may be mid-write; treat any read/unpickle
                        # failure as "not ready yet" and keep polling.
                        finish = False
                        break
        if self.check_converge() == True:
            # Converged: by convention the previous iteration's matrix wins.
            self.result_matrix = self.previous_iter_matrix
            return
        #Update the previous_iter_matrix
        self.previous_iter_matrix = np.copy(self.cur_iter_matrix)
        cur_iter = cur_iter + 1
def test(Datasets, X, Y):
    """Compare a decision tree against a random forest on each dataset.

    For every (dataset, feature-columns, label-column) triple, runs ``d.n``
    stratified train/test experiments, trains a DecisionTree and a
    RandomForest on each split, and writes per-test accuracies
    (``<name>_accuracy.csv``) plus mean/std summaries
    (``<name>_results.csv``).

    Parameters
    ----------
    Datasets : iterable of pandas.DataFrame
        Datasets to evaluate; each must expose a ``name`` attribute used
        for the CSV filenames.
    X : iterable
        Feature-column specs, one per dataset, passed to the models.
    Y : iterable
        Label-column names, one per dataset.

    NOTE(review): relies on a module-level config object ``d``
    (``d.n``, ``d.pTrain``, ``d.num_trees``, ``d.max_samples``,
    ``d.max_X``) and on module-level ``DecisionTree``, ``RandomForest``,
    ``accuracy`` and ``pd`` — confirm these exist at call time.
    """
    for dataset, x, y in zip(Datasets, X, Y):
        aDT = []
        aRF = []
        index = []
        print(f"{dataset.name}")
        print()
        print()
        for j in range(d.n):
            print(f"Test {j+1}")
            print()
            trainset = []
            testset = []
            # Stratified split: sample a (1 - d.pTrain) fraction of each
            # class for the test set; the rest of the class trains.
            for label, count in dataset[y].value_counts().items():
                samples = dataset[dataset[y] == label]
                testsamples = samples.sample(int((1 - d.pTrain) * count))
                testset.append(testsamples)
                # https://stackoverflow.com/questions/28256761/select-pandas-rows-by-excluding-index-number
                trainsamples = samples[~samples.index.isin(testsamples.index)]
                trainset.append(trainsamples)
            # sample(frac=1) shuffles the concatenated rows.
            Train = pd.concat(trainset).sample(frac=1)
            Test = pd.concat(testset).sample(frac=1)
            print("DECISION TREE - Train")
            DT = DecisionTree()
            DT.train(Train, x, y)
            print("RANDOM FOREST - Train")
            RF = RandomForest(d.num_trees, d.max_samples, d.max_X)
            RF.train(Train, x, y)
            print()
            print("DECISION TREE - Predict")
            aDT.append(accuracy(Test[y], DT.predict(Test)))
            print("RANDOM FOREST - Predict")
            aRF.append(accuracy(Test[y], RF.predict(Test)))
            index.append(f"Test{j+1}")
            print()
            print()
        a = pd.DataFrame(zip(np.around(aDT, 4), np.around(aRF, 4)),
                         columns=["DT - Accuracy", "RF - Accuracy"],
                         index=index)
        a.to_csv(f"{dataset.name}_accuracy.csv")
        mLDT = np.around(np.mean(aDT), 4)
        sLDT = np.around(np.std(aDT), 4)
        mRF = np.around(np.mean(aRF), 4)
        sRF = np.around(np.std(aRF), 4)
        print()
        print("Results:")
        # Fixed typo in the original label ("DECISIONE TREE").
        print("DECISION TREE - Results:")
        print(f"Mean={mLDT} | STD={sLDT}")
        print()
        print("RANDOM FOREST - Results:")
        print(f"Mean={mRF} | STD={sRF}")
        results = pd.DataFrame([[mLDT, sLDT, mRF, sRF]],
                               columns=["DT - Mean", "DT - STD",
                                        "RF - Mean", "RF - STD"])
        results.to_csv(f"{dataset.name}_results.csv", index=False)
        print()
        print()
        print()
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from randomforest import RandomForest


def accuracy(y_true, y_predict):
    """Return the fraction of predictions that match the true labels."""
    acc = np.sum(y_true == y_predict) / len(y_true)
    return acc


def main():
    """Train a small random forest on the breast-cancer dataset and print
    its held-out accuracy (fixed split via random_state=22)."""
    data = datasets.load_breast_cancer()
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=22)
    clf = RandomForest(n_trees=3, max_depth=10)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(accuracy(y_test, predictions))


# Guard so importing this module no longer trains a model as a side effect.
if __name__ == "__main__":
    main()