Example #1
def computeRandomForestCrossValidation(args, dict_algorithms):
    """Run the random-forest cross-validation and store the result in dict_algorithms."""
    if args.debug:
        print("Running random forest...", end='')
    model = RandomForest(args)
    dict_algorithms["random_forest"] = model.computeCrossValidation()
    if args.debug:
        print("ok!")
Example #2
    def miss_forest_imputation(self, matrix_for_impute):
        """Impute matrix_for_impute iteratively and store the result in self.result_matrix."""
        self.matrix_for_impute = matrix_for_impute
        self.raw_fill()

        self.previous_iter_matrix = np.copy(self.initial_guess_matrix)
        self.cur_iter_matrix = np.copy(self.initial_guess_matrix)
        cur_iter = 1

        while True:
            if cur_iter > self.max_iter:
                self.result_matrix = self.previous_iter_matrix
                return
            print("Iteration " + str(cur_iter))

            for var in self.vari:
                p = len(self.vart_)
                vt = self.vart_[var]
                cur_X = self.cur_iter_matrix
                cur_obsi = self.obsi[var]             # rows where this variable is observed
                cur_misi = self.misi[var]             # rows where this variable is missing
                if len(cur_misi) == 0:
                    continue
                p_train = np.delete(np.arange(p), var)        # all other variables act as predictors
                X_train = cur_X[cur_obsi, :][:, p_train]
                y_train = cur_X[cur_obsi, :][:, var]
                X_test = cur_X[cur_misi, :][:, p_train]
                rf = RandomForest(self.params)
                imp = rf.fit_predict(X_train, y_train, X_test, vt)
                self.cur_iter_matrix[cur_misi, var] = imp     # overwrite only the missing cells

            if self.check_converge():
                self.result_matrix = self.previous_iter_matrix
                return
            else:
                self.previous_iter_matrix = np.copy(self.cur_iter_matrix)
                cur_iter += 1
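The loop above is one MissForest round: for each variable with missing entries, a forest is trained on the rows where that variable is observed and used to predict the missing rows, overwriting only those cells. A minimal self-contained sketch of that per-variable step, using scikit-learn's RandomForestRegressor in place of the snippet's RandomForest wrapper and a made-up matrix:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Toy matrix with missing values in column 2 (illustrative only)
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
X[::10, 2] = np.nan

var = 2
misi = np.where(np.isnan(X[:, var]))[0]           # rows missing this variable
obsi = np.where(~np.isnan(X[:, var]))[0]          # rows where it is observed
p_train = np.delete(np.arange(X.shape[1]), var)   # all other columns act as predictors

# Initial guess (the snippet's raw_fill()): fill missing cells with the column mean
X_filled = X.copy()
X_filled[misi, var] = np.nanmean(X[:, var])

# One MissForest-style update: fit on observed rows, predict the missing ones
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_filled[obsi][:, p_train], X_filled[obsi, var])
X_filled[misi, var] = rf.predict(X_filled[misi][:, p_train])

In the full method this step repeats over every variable until check_converge() reports convergence.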
Example #3
    def miss_forest_imputation(self, matrix_for_impute):
        """Impute the dataset as above, but fit each block of variables in a separate Slurm job."""
        self.matrix_for_impute = matrix_for_impute
        self.raw_fill()

        vari_node = self.split_var()
        self.previous_iter_matrix = np.copy(self.initial_guess_matrix)
        self.cur_iter_matrix = np.copy(self.initial_guess_matrix)
        cur_iter = 1
        
        while True:
            if cur_iter > self.max_iter:
                self.result_matrix = self.previous_iter_matrix
                return
            print("iteration " + str(cur_iter))
            
            for i in range(len(vari_node)):
                cur_X = self.cur_iter_matrix
                x_path = self.handler.tmp_X_file
                with open(x_path, 'wb') as tmp:
                    pickle.dump(cur_X, tmp)
                for j in range(len(vari_node[i])):
                    # Prepare the job for this block of variables
                    cur_vari = vari_node[i][j]
                    cur_vart = []
                    cur_obsi = []
                    cur_misi = []
                    for k in range(len(cur_vari)):
                        cur_vart.append(self.vart_[cur_vari[k]])
                        cur_obsi.append(self.obsi[cur_vari[k]])
                        cur_misi.append(self.misi[cur_vari[k]])

                    argument_path = self.handler.get_arguments_varidx_file(i, j)
                    result_path = self.handler.get_results_varidx_file(i, j)
                    rf = RandomForest(self.params)
                    with open(argument_path, 'wb') as tmp:
                        argument_object = MissForestImputationSlurmArgumentObject(rf, cur_vart, cur_vari, cur_obsi, cur_misi)
                        pickle.dump(argument_object, tmp)
                    with open(result_path, 'wb') as tmp:
                        # argument_object.results.done = False
                        pickle.dump(argument_object.results, tmp)
                    
                    # write job.sh and submit
                    command_shell = self.handler.get_command_shell(x_path, argument_path, result_path)
                    command_shell = ' '.join(command_shell)
                    with open(self.handler.shell_script_path, 'w') as tmp:
                        tmp.writelines('#!/bin/bash\n')
                        tmp.writelines(command_shell)
                    command = self.handler.get_command(i, j, cur_iter)
                    subprocess.call(command)
                
                finish = False
                finished_ind = [False] * len(vari_node[i])
                # finished_count = 0
                while not finish:
                    time.sleep(0.1)
                    finish = True
                    for j in range(len(vari_node[i])):
                        if finished_ind[j]:
                            continue
                            
                        cur_vari = vari_node[i][j]
                        cur_obsi = []
                        cur_misi = []
                        for k in range(len(cur_vari)):
                            cur_obsi.append(self.obsi[cur_vari[k]])
                            cur_misi.append(self.misi[cur_vari[k]])
                            
                        result_path = self.handler.get_results_varidx_file(i, j)
                        try:
                            with open(result_path, 'rb') as tmp:
                                cur_result = pickle.load(tmp)
                                if not cur_result.done:
                                    finish = False
                                    break
                                else:
                                    # Copy the imputed values back into the current matrix
                                    for k in range(len(cur_vari)):
                                        self.cur_iter_matrix[cur_misi[k], cur_vari[k]] = cur_result.imp_list[k]
                                    finished_ind[j] = True

                            # if finished_ind.count(True) > finished_count:
                            #     finished_count = finished_ind.count(True)
                            #     print(finished_count, "/", len(finished_ind), "finished!")

                        except Exception:
                            # The result file may still be half-written by the job; retry on the next poll
                            finish = False
                            break

            if self.check_converge():
                self.result_matrix = self.previous_iter_matrix
                return
                
            # Update the previous_iter_matrix
            self.previous_iter_matrix = np.copy(self.cur_iter_matrix)

            cur_iter += 1
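Example #3 distributes the same loop: the driver pickles the current matrix and one argument object per variable block, submits a Slurm job for each, and then polls the pickled result files until every job has set done to True. The sketch below strips that down to the bare file-based handoff, with hypothetical names and an in-process worker function standing in for the submitted job:

import os
import pickle
import tempfile

class ResultStub:
    """Placeholder for the result object a job writes back."""
    def __init__(self):
        self.done = False
        self.imp_list = []

def worker(x_path, argument_path, result_path):
    # What a submitted job would do: load inputs, compute, write results with done=True
    with open(x_path, 'rb') as f:
        X = pickle.load(f)
    with open(argument_path, 'rb') as f:
        args = pickle.load(f)
    result = ResultStub()
    result.imp_list = [a * 2 for a in args]   # dummy "imputation"
    result.done = True
    with open(result_path, 'wb') as f:
        pickle.dump(result, f)

# Driver side: write the inputs and a not-done result placeholder, "submit", then poll
tmpdir = tempfile.mkdtemp()
x_path = os.path.join(tmpdir, 'X.pkl')
argument_path = os.path.join(tmpdir, 'args.pkl')
result_path = os.path.join(tmpdir, 'result.pkl')

with open(x_path, 'wb') as f:
    pickle.dump([1.0, 2.0, 3.0], f)
with open(argument_path, 'wb') as f:
    pickle.dump([10, 20], f)
with open(result_path, 'wb') as f:
    pickle.dump(ResultStub(), f)

worker(x_path, argument_path, result_path)   # in the snippet this runs as a cluster job

while True:
    with open(result_path, 'rb') as f:
        result = pickle.load(f)
    if result.done:
        break
print(result.imp_list)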
Example #4
def test(Datasets, X, Y):
    for dataset, x, y in zip(Datasets, X, Y):
        aDT = []
        aRF = []
        index = []
        print(f"{dataset.name}")
        print()
        print()
        # d: external config (not shown) providing n, pTrain, num_trees, max_samples, max_X
        for j in range(d.n):
            print(f"Test {j+1}")
            print()
            trainset = []
            testset = []
            for label, count in dataset[y].value_counts().items():
                samples = dataset[dataset[y] == label]

                testsamples = samples.sample(int((1 - d.pTrain) * count))
                testset.append(testsamples)

                # https://stackoverflow.com/questions/28256761/select-pandas-rows-by-excluding-index-number
                trainsamples = samples[~samples.index.isin(testsamples.index)]
                trainset.append(trainsamples)

            Train = pd.concat(trainset).sample(frac=1)
            Test = pd.concat(testset).sample(frac=1)

            print("DECISION TREE - Train")
            DT = DecisionTree()
            DT.train(Train, x, y)


            print("RANDOM FOREST - Train")
            RF = RandomForest(d.num_trees, d.max_samples, d.max_X)
            RF.train(Train, x, y)
            print()

            print("DECISION TREE - Predict")
            aDT.append(accuracy(Test[y], DT.predict(Test)))
            print("RANDOM FOREST - Predict")
            aRF.append(accuracy(Test[y], RF.predict(Test)))

            index.append(f"Test{j+1}")
            print()
            print()

        a = pd.DataFrame(zip(np.around(aDT, 4), np.around(aRF, 4)), columns=["DT - Accuracy", "RF - Accuracy"], index=index)
        a.to_csv(f"{dataset.name}_accuracy.csv")

        mLDT = np.around(np.mean(aDT), 4)
        sLDT = np.around(np.std(aDT), 4)
        mRF = np.around(np.mean(aRF), 4)
        sRF = np.around(np.std(aRF), 4)

        print()
        print("Results:")
        print("DECISIONE TREE - Results:")
        print(f"Mean={mLDT} | STD={sLDT}")
        print()
        print("RANDOM FOREST - Results:")
        print(f"Mean={mRF} | STD={sRF}")

        results = pd.DataFrame([[mLDT, sLDT, mRF, sRF]], columns=["DT - Mean", "DT - STD", "RF - Mean", "RF - STD"])
        results.to_csv(f"{dataset.name}_results.csv", index=False)

        print()
        print()
        print()
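The per-label sampling inside test() is a manual stratified split: each class contributes the same train/test proportion. If scikit-learn is available, the same split can be expressed with train_test_split and its stratify argument; the DataFrame and column names below are purely illustrative:

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy stand-in for `dataset`, with "label" playing the role of the y column
df = pd.DataFrame({"f1": range(12), "label": ["a", "b", "c"] * 4})

# stratify keeps each label's proportion in both splits, like the value_counts() loop above
Train, Test = train_test_split(df, test_size=0.25, stratify=df["label"], random_state=0)
print(Test["label"].value_counts())   # one test row per class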
Example #5
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from randomforest import RandomForest

def accuracy(y_true, y_predict):
    acc = np.sum(y_true == y_predict) / len(y_true)
    return acc

data = datasets.load_breast_cancer()

X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

clf = RandomForest(n_trees=3, max_depth=10)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(accuracy(y_test, predictions))
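The randomforest module imported above is not shown on this page. As a rough idea of what a class exposing this RandomForest(n_trees=..., max_depth=...) / fit / predict interface might look like, here is a hedged sketch of bootstrap-aggregated decision trees built on scikit-learn; it is an assumption for illustration, not the actual implementation used by the example:

import numpy as np
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

class SimpleRandomForest:
    """Minimal bagging classifier with the same call pattern as the example above (hypothetical)."""
    def __init__(self, n_trees=3, max_depth=10):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        rng = np.random.default_rng(0)
        n = X.shape[0]
        self.trees = []
        for _ in range(self.n_trees):
            idx = rng.integers(0, n, n)   # bootstrap sample with replacement
            tree = DecisionTreeClassifier(max_depth=self.max_depth, max_features="sqrt")
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)

    def predict(self, X):
        votes = np.array([tree.predict(X) for tree in self.trees])   # shape (n_trees, n_samples)
        return np.array([Counter(col).most_common(1)[0][0] for col in votes.T])

Swapping this class in for the imported one (clf = SimpleRandomForest(n_trees=3, max_depth=10)) runs the breast-cancer example end to end with the same three calls.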