def predict_test_data():
    """Train a random forest, report accuracies and write test predictions.

    The forest is trained on the module-level ``training_data`` /
    ``training_labels``. Training and validation accuracy are printed, then
    the first ``TEST_SIZE`` rows of ``testing_data`` are classified and
    written to ``titanic_1.csv`` with an ``Id,Category`` header and 1-based
    ids.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    num_right = sum(
        1
        for i in range(num_training_points)
        if forest.predict(training_data[i]) == training_labels[i]
    )
    print("Training Accuracy: " + str(num_right / num_training_points))

    num_right = sum(
        1
        for i in range(num_validation_points)
        if forest.predict(validation_data[i]) == validation_labels[i]
    )
    print("Validation Accuracy: " + str(num_right / num_validation_points))

    # Predict with the forest trained above. The original called
    # ``tree.predict`` here, but no ``tree`` is created in this function.
    guesses = [int(forest.predict(testing_data[i])) for i in range(TEST_SIZE)]

    with open('titanic_1.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Id', 'Category'])
        # Submission ids are 1-based.
        writer.writerows([row_id, guess] for row_id, guess in enumerate(guesses, start=1))
def run_model():
    """Train a height-limited random forest on the hw7 data and print accuracy.

    Reads the training file into a DataFrame with columns ``0``, ``1`` and
    ``'y'``, builds a forest of ``T`` trees of height ``max_height`` on it,
    prints the elapsed training time, and prints the accuracy (in percent)
    on the training and testing sets returned by ``generate_data``.
    """
    # load data
    train_file = 'data/hw7_train.dat.txt'
    test_file = 'data/hw7_test.dat.txt'
    data_train = pd.read_csv(train_file, sep=' ', header=None, names=[0, 1, 'y'])
    X_train, Y_train = generate_data(train_file)
    X_test, Y_test = generate_data(test_file)

    # train model
    col_y = 'y'
    T = 30000
    max_height = 1
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    time_start = time.perf_counter()
    RF_Prune = RandomForest()
    RF_Prune.construct_forest(data_train, col_y, size=T, max_height=max_height)
    print("Using %.3f seconds" % (time.perf_counter() - time_start))

    # model accuracy
    print('\n--- Pruned Random forest model accuarcy ---')
    # Convert both sides to arrays so the comparison is element-wise even
    # when the labels arrive as a plain list.
    Y_train_pred = np.asarray([RF_Prune.predict(x) for x in np.array(X_train)])
    train_acc = np.sum(Y_train_pred == np.asarray(Y_train)) / len(Y_train) * 100
    print('Model accuracy on the training set: %.2f %%' % train_acc)

    Y_test_pred = np.asarray([RF_Prune.predict(x) for x in np.array(X_test)])
    test_acc = np.sum(Y_test_pred == np.asarray(Y_test)) / len(Y_test) * 100
    print('Accuracy on the testing set: %.2f %%\n' % test_acc)
def run_kfold(method, kf, X, y, text, transformer=None):
    """Run k-fold cross-validation and return the mean fold score.

    :param method: ``"rf"`` (project RandomForest, 1000 estimators),
        ``"lr"`` (RidgeClassifier, alpha=2) or ``"ex"``
        (ExtraTreesClassifier, 2000 estimators).
    :param kf: iterable of ``(train_index, test_index)`` pairs. Any iterable
        works, including generators, because folds are counted while
        iterating instead of via ``len(kf)``.
    :param X: feature matrix, indexed as ``X[index, :]``.
    :param y: label vector, indexed as ``y[index]``.
    :param text: label printed at the start of the run.
    :param transformer: optional object with ``fit``/``transform``; it is
        fitted on each fold's training split only.
    :return: sum of per-fold ``score(y_hat, y_test)`` divided by the number
        of folds.
    :raises ValueError: if ``method`` is not one of the supported names.
    :raises ZeroDivisionError: if ``kf`` yields no folds.
    """
    if method not in ("rf", "lr", "ex"):
        # Previously an unknown method surfaced as a NameError on ``clf``
        # in the middle of the first fold.
        raise ValueError("unsupported method: %r" % (method,))

    accuracy = 0
    n_folds = 0
    print("Running " + str(text))
    for train_index, test_index in kf:
        print("Starting fold " + str(n_folds))
        n_folds += 1
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]

        if transformer is not None:
            t = transformer.fit(X_train)
            X_train = t.transform(X_train)
            X_test = t.transform(X_test)

        if method == "rf":
            # The project RandomForest takes its data in the constructor.
            clf = RandomForest(X_train, y_train, n_estimators=1000)
            clf.fit()
        elif method == "lr":
            clf = linear_model.RidgeClassifier(alpha=2)
            clf.fit(X_train, y_train)
        else:  # "ex"
            clf = ExtraTreesClassifier(n_estimators=2000)
            clf.fit(X_train, y_train)

        y_hat = clf.predict(X_test)
        accuracy += score(y_hat, y_test)

    return accuracy * 1.0 / n_folds
def classify_with_random_forest():
    """Fit a 250-tree forest and print its training and validation accuracy.

    Uses the module-level training/validation arrays and point counts.
    """
    forest = RandomForest(num_trees=250, max_depth=7, categorical_vars=cat_set)
    forest.train(training_data, training_labels)

    # Count the training points whose prediction matches the label.
    train_hits = sum(
        1
        for idx in range(num_training_points)
        if forest.predict(training_data[idx]) == training_labels[idx]
    )
    print("Training Accuracy: " + str(train_hits / num_training_points))

    # Same measurement on the held-out validation points.
    validation_hits = sum(
        1
        for idx in range(num_validation_points)
        if forest.predict(validation_data[idx]) == validation_labels[idx]
    )
    print("Validation Accuracy: " + str(validation_hits / num_validation_points))
class BRAF(object):
    """Biased Random Forest: two forests whose predictions are averaged."""

    def __init__(self, S, p, k, weights, name='BRAF'):
        """
        :param S: Total size of the Biased Random Forest; split between the
            two forests as ``int(p * S)`` and ``int((1 - p) * S)``
        :param p: Ratio between R1 and R2
        :param k: Number of nearest neighbours for the minority class
            (stored on the instance; not used inside this class)
        :param weights: Passed through to both RandomForest constructors
        :param name: Display name of this model
        """
        self.S = S
        self.p = p
        self.k = k
        self.name = name
        self.weights = weights

        # Initialize the forests
        self.R1 = RandomForest('R1_Forest', self.weights, int(self.p * self.S), True)
        self.R2 = RandomForest('R2_Forest', self.weights, int((1 - self.p) * self.S), True)

    def fit(self, data):
        """
        Fit both forests.

        :param data: Pair ``(T, Tc)`` of 2-D arrays whose last column is the
            label; T trains R1 (vanilla random forest) and Tc trains R2
            (biased forest). ``None`` only prints a message.
        :return: None; R1 and R2 are fitted in place
        """
        if data is not None:
            T, Tc = data
            print('fitting Biased Random Forest starts...')
            # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is
            # the documented drop-in replacement.
            self.R1.fit(T[:, 0:-1], T[:, -1].astype(int))
            self.R2.fit(Tc[:, 0:-1], Tc[:, -1].astype(int))
        else:
            print("Data Not Found. Please check the file name and directory")

    def predict(self, x_test):
        """
        :param x_test: Samples to predict
        :return: Mean of the R1 and R2 predictions
        """
        pred1 = self.R1.predict(x_test)
        pred2 = self.R2.predict(x_test)
        return (pred1 + pred2) / 2
def random_forests_classification(X, y, test_dat):
    """Train a random forest on (X, y) and write predictions for test_dat.

    The forest is built with 20 as its first argument, the rounded square
    root of the feature count as its second and the number of training rows
    as its third. Predictions are written to
    ``census_predictions_random_forest.csv`` as ``Id,Category`` rows with
    1-based ids; the prediction result is indexed as ``y_hat[i, 0]``.
    """
    classifier = RandomForest(20, round(math.sqrt(np.size(X, 1))), np.size(X, 0))
    # classifier = RandomForest(1, round(math.sqrt(np.size(X, 1))), 100, 45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)

    # ``with`` closes the file even if formatting a prediction raises.
    with open("census_predictions_random_forest.csv", 'w') as f:
        f.write("Id,Category\n")
        for i in range(np.size(test_dat, 0)):
            f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    print("DONE")
def best_params():
    """Try forests of 2 to 10 trees and return the best ``(n_trees, accuracy)``.

    Each candidate is fitted on the module-level training split and scored
    on the test split. Ties keep the smaller tree count because only a
    strictly better accuracy replaces the current best.
    """
    best_n_trees, best_acc = 0, 0
    for candidate in range(2, 11):
        model = RandomForest(n_trees=candidate)
        model.fit(X_train, Y_train)
        candidate_acc = accuracy(Y_test, model.predict(X_test))
        if candidate_acc > best_acc:
            best_acc, best_n_trees = candidate_acc, candidate
    return (best_n_trees, best_acc)
def graph_accuracy():
    """Plot validation accuracy against forest size (5 to 40 trees, step 5).

    One depth-10 forest is trained per size on the module-level training
    data. Each size is printed as soon as it has been evaluated.
    """
    tree_counts = []
    accuracies = []
    for n_trees in range(5, 41, 5):
        forest = RandomForest(num_trees=n_trees, max_depth=10, categorical_vars=cat_set)
        forest.train(training_data, training_labels)

        hits = sum(
            1
            for idx in range(num_validation_points)
            if forest.predict(validation_data[idx]) == validation_labels[idx]
        )
        accuracies.append(hits / num_validation_points)
        tree_counts.append(n_trees)

        # Progress output, flushed so it appears while training continues.
        print(n_trees)
        sys.stdout.flush()

    plt.figure()
    plt.plot(tree_counts, accuracies)
    plt.title("Census Accuracy For Random Forest")
    plt.ylabel("Accuracy Rate")
    plt.xlabel("Number of Trees")
    plt.show()
from sklearn.model_selection import train_test_split
from sklearn import datasets
from RandomForest import RandomForest

# Load the Iris dataset as a feature matrix and a label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 33% of the samples; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit the project's RandomForest with its default settings.
model = RandomForest()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Fraction of test samples whose predicted label equals the true label.
print('prediction score: {}'.format(sum(predictions == y_test) / len(y_test)))
class Ensemble(object):
    """Stacked ensemble of image classifiers.

    The "small" models (six texture-based ridge classifiers, a PCA + random
    forest and an RBM + ridge pipeline) are fitted in parallel threads by
    ``fit_small``. ``predict_small`` stacks the texture predictions
    column-wise; ``fit_big`` / ``predict_big`` train and apply a logistic
    regression on such a stacked matrix.
    """

    # (radius, points) of every texture model, in the column order returned
    # by predict_small().
    _TEXTURE_CONFIGS = ((10, 8), (5, 10), (7, 10), (9, 8), (4, 10), (20, 8))

    def __init__(self):
        self.pca_randomForest = None
        self.pca_randomForest_norm = None
        self.pca_randomForest_pca = None
        self.rbm_lr_rbm = None
        self.rbm_lr = None
        self.texture_10_8 = None
        self.texture_5_10 = None
        self.texture_7_10 = None
        self.texture_9_8 = None
        self.texture_4_10 = None
        self.texture_20_8 = None
        self.ensemble_logistic_regression = None
        self.edge_pca_lr = None
        self.pca_edge_norm = None
        self.pca_edge_pca = None
        self.ip = ImagesProcessor()

        # Predictions are stored on the instance because the worker threads
        # cannot hand them back by reference.
        self.pca_randomForest_y_hat = None
        self.rbm_lr_y_hat = None
        self.texture_10_8_y_hat = None
        self.texture_5_10_y_hat = None
        # These four were previously created only by the predict threads.
        self.texture_7_10_y_hat = None
        self.texture_9_8_y_hat = None
        self.texture_4_10_y_hat = None
        self.texture_20_8_y_hat = None

    def load(self):
        """Load the pickled texture classifiers and the stacking model.

        The PCA and RBM models are not loaded here.
        """
        self.texture_10_8 = self._load_classifier('./ridgeClassifier_10_8')
        self.texture_5_10 = self._load_classifier('./ridgeClassifier_5_10')
        self.texture_7_10 = self._load_classifier('./ridgeClassifier_7_10')
        self.texture_9_8 = self._load_classifier('./ridgeClassifier_9_8')
        self.texture_4_10 = self._load_classifier('./ridgeClassifier_4_10')
        self.texture_20_8 = self._load_classifier('./ridgeClassifier_20_8')
        self.ensemble_logistic_regression = self._load_classifier('ensemble_logistic_regression')

    def _load_classifier(self, path):
        """Unpickle and return the classifier stored at ``path``."""
        # NOTE(review): unpickling runs arbitrary code; only load trusted files.
        # NOTE(review): the file is opened in text mode as before; this must
        # match how the pickle was written - confirm against the dump code.
        with open(path, 'r') as f:
            return cPickle.load(f)

    @staticmethod
    def _start_daemon(target, args):
        """Start ``target(*args)`` in a daemon thread and return the thread."""
        worker = threading.Thread(target=target, args=args)
        worker.daemon = True
        worker.start()
        return worker

    def fit_small(self, images, y):
        """Fit every small model in its own thread and wait for all of them.

        Texture models train on a copy of the raw ``images``; the PCA and
        RBM models train on the rotated/cropped images produced by
        ``ImagesProcessor.transformImages``.
        """
        images_transformed, y_transformed = self.ip.transformImages(images, y, rotate=True, crop=True)

        workers = []
        for radius, points in self._TEXTURE_CONFIGS:
            fit_method = getattr(self, '_fit_small_texture%d_%d' % (radius, points))
            estimator = getattr(self, 'texture_%d_%d' % (radius, points))
            workers.append(self._start_daemon(fit_method, (images[:], y, estimator, radius, points, 2)))
        workers.append(self._start_daemon(self._fit_small_pc, (images_transformed[:], y_transformed)))
        workers.append(self._start_daemon(self._fit_small_rbm, (images_transformed[:], y_transformed)))

        for worker in workers:
            worker.join()

    def _fit_texture(self, attr_name, images, y, radius, points, alpha):
        """Fit a RidgeClassifier on texture features and store it in ``attr_name``."""
        start_time = time.time()
        print("TEXTURE %d %d" % (radius, points))
        ds = self.ip.getTextureFeature(images, radius, points)
        classifier = RidgeClassifier(ds, y, alpha=alpha)
        # Assign before fitting, as the per-model methods always did.
        setattr(self, attr_name, classifier)
        classifier.fit()
        print("COMPLETE TEXTURE %d %d --- %s seconds ---" % (radius, points, time.time() - start_time))

    # The per-model methods below are kept for backward compatibility. The
    # ``estimator`` argument is accepted but unused: the result is always
    # stored on the attribute named after the method.
    def _fit_small_texture10_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_10_8', images, y, radius, points, alpha)

    def _fit_small_texture5_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_5_10', images, y, radius, points, alpha)

    def _fit_small_texture7_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_7_10', images, y, radius, points, alpha)

    def _fit_small_texture9_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_9_8', images, y, radius, points, alpha)

    def _fit_small_texture4_10(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_4_10', images, y, radius, points, alpha)

    def _fit_small_texture20_8(self, images, y, estimator, radius, points, alpha):
        self._fit_texture('texture_20_8', images, y, radius, points, alpha)

    def _fit_small_pc(self, images, y):
        """Fit the PCA (150 components) + 2000-tree random forest model."""
        start_time = time.time()
        print("PCA RANDOM FOREST")
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        self.pca_randomForest_pca, self.pca_randomForest_norm, ds = self.ip.getPcaFeatures(ds, 150, Constants.IMAGES_SIZES)
        self.pca_randomForest = RandomForest(ds, y, n_estimators=2000)
        self.pca_randomForest.fit()
        print("COMPELTE PCA RANDOM FOREST --- %s seconds ---" % (time.time() - start_time))

    def _fit_small_rbm(self, ds, y):
        """Fit the BernoulliRBM + RidgeClassifier pipeline on scaled pixels."""
        start_time = time.time()
        print("RBM LR")
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        # NOTE(review): divides by (max + 0.0001) rather than (max - min);
        # kept as is, confirm this scaling is intended.
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_rbm = BernoulliRBM(random_state=0, verbose=True)
        self.rbm_lr_rbm.learning_rate = 0.01
        self.rbm_lr_rbm.n_iter = 5
        self.rbm_lr_rbm.n_components = 150
        logistic = linear_model.RidgeClassifier(alpha=2)
        self.rbm_lr = Pipeline(steps=[('rbm', self.rbm_lr_rbm), ('lr', logistic)])
        self.rbm_lr.fit(ds, y)
        print("COMPLETE RBM LR --- %s seconds ---" % (time.time() - start_time))

    def fit_big(self, ds, y):
        """Fit the stacking logistic regression on ``ds`` with labels ``y``."""
        self.ensemble_logistic_regression = linear_model.LogisticRegression()
        self.ensemble_logistic_regression.fit(ds, y)

    def predict_small(self, images):
        """Predict with every texture model in parallel.

        :return: array with one row per image and one column per texture
            model, in ``_TEXTURE_CONFIGS`` order. The PCA and RBM
            predictions are not part of the stacked output.
        """
        workers = [
            self._start_daemon(
                getattr(self, '_predict_small_texture_%d_%d' % (radius, points)),
                (images,),
            )
            for radius, points in self._TEXTURE_CONFIGS
        ]
        for worker in workers:
            worker.join()

        return np.vstack((self.texture_10_8_y_hat,
                          self.texture_5_10_y_hat,
                          self.texture_7_10_y_hat,
                          self.texture_9_8_y_hat,
                          self.texture_4_10_y_hat,
                          self.texture_20_8_y_hat)).T

    def _predict_small_rbm_lr(self, images):
        """Store the RBM pipeline's predictions in ``rbm_lr_y_hat``."""
        start_time = time.time()
        ds = images[:]
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = (ds - np.min(ds, 0)) / (np.max(ds, 0) + 0.0001)
        self.rbm_lr_y_hat = self.rbm_lr.predict(ds)
        print("Complete prediction RBM --- %s ---" % (time.time() - start_time))

    def _predict_small_pac_ranfomForest(self, images):
        """Store the PCA random forest's predictions in ``pca_randomForest_y_hat``."""
        start_time = time.time()
        ds = self.ip.getImagesWithGrayHistogramEqualized(images=images)
        ds = self.ip.getImagesAsDataset(ds, Constants.IMAGES_SIZES)
        ds = self.pca_randomForest_norm.transform(ds)
        ds = self.pca_randomForest_pca.transform(ds)
        self.pca_randomForest_y_hat = self.pca_randomForest.predict(ds)
        print("Complete prediction PCA --- %s ---" % (time.time() - start_time))

    def _predict_texture(self, images, radius, points):
        """Predict with ``texture_<radius>_<points>`` and store ``..._y_hat``."""
        name = 'texture_%d_%d' % (radius, points)
        start_time = time.time()
        ds = self.ip.getTextureFeature(images, radius, points)
        setattr(self, name + '_y_hat', getattr(self, name).predict(ds))
        print("Complete prediction Texture %d %d --- %s ---" % (radius, points, time.time() - start_time))

    def _predict_small_texture_10_8(self, images):
        self._predict_texture(images, 10, 8)

    def _predict_small_texture_5_10(self, images):
        self._predict_texture(images, 5, 10)

    def _predict_small_texture_7_10(self, images):
        self._predict_texture(images, 7, 10)

    def _predict_small_texture_9_8(self, images):
        self._predict_texture(images, 9, 8)

    def _predict_small_texture_4_10(self, images):
        self._predict_texture(images, 4, 10)

    def _predict_small_texture_20_8(self, images):
        self._predict_texture(images, 20, 8)

    def predict_big(self, ds):
        """Return the stacking model's predictions for ``ds``."""
        return self.ensemble_logistic_regression.predict(ds)
# Features and labels of the already-loaded iris dataset.
X, y = iris.data, iris.target

# Random 85% / 15% train/test split over a shuffled index.
ratio_train_test = 0.85
num_samples, num_features = X.shape
idx = np.random.permutation(range(num_samples))
num_samples_train = int(num_samples * ratio_train_test)
idx_train, idx_test = idx[:num_samples_train], idx[num_samples_train:]
X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

# HYPER PARAMETERS
max_depth = 7
min_split_size = 5
ratio_samples = 0.2
num_trees = 30
num_features_node = int(np.sqrt(num_features))
coefficient = 'gini'
percentile = 90
values = None
min_std_deviation = 0

# The constructor takes the hyper-parameters positionally, in this order.
rf = RandomForest(
    max_depth,
    min_split_size,
    ratio_samples,
    num_trees,
    num_features_node,
    coefficient,
    percentile,
    values,
    min_std_deviation,
)
rf.train(X_train, y_train)
# This predict() receives the true labels as well.
rf.predict(X_test, y_test)
def main():
    """Train the project RandomForest on MNIST and log timing and accuracy.

    Loads the train/test split with ``load()``, fits a 10-tree forest,
    predicts the test set and logs the elapsed time in minutes, the
    accuracy, the predictions, the true labels and their element-wise
    difference. Any exception raised while training or evaluating is logged
    at CRITICAL level with its traceback; it is not re-raised.
    """
    # Only the MNIST loader is active. Earlier experiments (disabled) loaded
    # Iris via sklearn.datasets.load_iris() or Sonar via load_sonar() and
    # split them 80/20 with a random permutation of the sample indices.
    X_train, Y_train, X_test, Y_test = load()

    max_depth = 10  # maximum number of tree levels
    min_size_split = 5  # a node with fewer than 5 elements is not split
    ratio_samples = 0.8  # bagging
    num_trees = 10
    criterion = "Gini"
    # number of different features to consider at each node
    num_features_node = int(np.sqrt(X_train.shape[1]))

    num_samples_train = X_train.shape[0]
    num_samples_test = X_test.shape[0]
    logger.info("{} train and {} test samples".format(num_samples_train,
                                                      num_samples_test))
    try:
        start = timeit.default_timer()
        rf = RandomForest(max_depth, min_size_split, ratio_samples,
                          num_trees, num_features_node, criterion)
        # MNIST-specific candidate values.
        # NOTE(review): range(0, 156, 64) yields 0, 64, 128 - confirm the
        # upper bound 156 is intended.
        rf.values = range(0, 156, 64)
        rf.fit(X_train, Y_train)
        Ypred = rf.predict(X_test)
        stop = timeit.default_timer()
        execution_time = (stop - start) / 60.
        logger.info("Program Executed in " + str(execution_time) + " minutes.")

        num_correct_predictions = np.sum(Ypred == Y_test)
        accuracy = num_correct_predictions / float(len(Y_test))
        logger.info('accuracy {} %'.format(100 * np.round(accuracy, decimals=2)))
        logger.info("Ypred = {}".format(Ypred))
        logger.info("Y_test = {}".format(Y_test))
        # The label previously read "Y_test - Y_train", but the logged value
        # is the difference between true and predicted test labels.
        logger.info("Y_test - Ypred = {}".format(
            np.array([truth - predicted
                      for truth, predicted in zip(Y_test, Ypred)])))
    except Exception as e:
        # Top-level boundary: keep the run alive, but record the traceback
        # so the failure can be diagnosed.
        logger.critical("Failed on executing due to:\n{}".format(str(e)),
                        exc_info=True)
# Show the distinct class labels present in the data.
print('Class labels', np.unique(df_wine['Class label']))

# Print the first five rows of the dataframe, then every column name.
print(df_wine.head())
for columns in df_wine.columns:
    print(columns)

# Use the second through last columns as X and the first column as y.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Split into 70% training data and 30% test data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Decision tree alternative (disabled):
# tree = DecisionTree(criterion='entropy', max_depth=6, random_state=None)
# tree.fit(X_train, y_train, df_wine.columns[1:])
# y_pred = tree.predict(X_test)

# Random forest: fit() also receives the feature column names.
forest = RandomForest(
    criterion='gini',
    n_estimators=20,
    max_features='auto',
    max_depth=3,
    min_samples_split=2,
    random_state=None,
)
forest.fit(X_train, y_train, df_wine.columns[1:])
y_pred = forest.predict(X_test)

# Number of misclassified test samples out of the total.
print("Misclassified samples/total test samples: %d/%d"
      % ((y_test != y_pred).sum(), len(y_test)))
# print(y_test)
# print(y_pred)
# print("The first sample probability is ", tree.predict_proba(X_test[0, :]))
from DecisionTree import DecisionTree
from RandomForest import RandomForest

# Breast-cancer dataset, split 80/20 with a fixed seed.
data = datasets.load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
print('Shape:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Fit a single decision tree and a three-tree random forest.
dt = DecisionTree(min_samples_split=3, max_depth=10, n_features=20)
dt.fit(X_train, y_train)
rf = RandomForest(n_trees=3, min_samples_split=3, max_depth=10, n_features=15)
rf.fit(X_train, y_train)
print('Parameters:', dt.min_samples_split, dt.max_depth, dt.n_features)

# Score both models on the held-out split.
y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print("Accuracy score of Decision Tree:", acc_dt)
print("Accuracy score of Random Forest:", acc_rf)
print("F1 score of Decision Tree:", f1_dt)
print("F1 score of Random Forest:", f1_rf)
# Author's note: the random forest overfits on this small dataset.
# Disabled threshold sweep, kept for reference:
# pres = []
# falls = []
# thresholds = np.linspace(0, 1, 101)
# for threshold in thresholds:
#     acc, pre, fall = cv_threshold(df_train_X, df_train_y, threshold=threshold)
#     accs.append(acc)
#     pres.append(pre)
#     falls.append(fall)
# fig, ax = plt.subplots(figsize=(12, 6))
# # ax.plot(thresholds, accs)
# ax.plot(pres, falls)
# ax.set_xlabel("threshold")
# ax.set_ylabel("pres")
# ax.set_title("precision")

# Predict test_X. The original note mentions a threshold of 0.53; no
# threshold is applied in the code below.
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (df_train_X, df_train_y, df_test_X)
)

# model = LogisticRegression()
# scores, arr = analyze_RF(X_train, y_train)
model = RandomForest(num_trees=120, num_features=3)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)
submit_prediction(y_hat)
class Classifier(object):
    """Build, train and rasterize one of several classifier types.

    The classifier type is one of the integer class constants below. Its
    hyper-parameters come from the ``parameters`` object and its training
    samples from the ``featurespace`` object.
    """

    # classifiers with parameters
    LogReg = 1
    Norm = 2
    GMM = 3
    kNN = 4
    LinReg = 5
    Perceptron = 6
    MLP = 7
    SVM = 8
    DecisionTree = 9
    RandomForest = 10

    # classifiers without parameters
    NaiveBayes = 100
    Gauss = 101

    def __init__(self, classifier, parameters, featurespace):
        super(Classifier, self).__init__()
        self.__classifier = classifier
        self.__parameters = parameters
        self.__featurespace = featurespace
        self.__clf = None  # trained model; None until initialize() succeeds

    def copy(self):
        """Return a new, untrained Classifier with the same configuration."""
        return Classifier(self.__classifier, self.__parameters, self.__featurespace)

    def initialize(self):
        """Fetch the current samples and train the configured classifier.

        With no labels the model is reset to None. An unknown classifier
        code prints a message and leaves the model untouched.
        """
        self.__samples, self.__labels = self.__featurespace.getSamples()
        if len(self.__labels) == 0:
            self.__clf = None
            return

        # Checked in order; the first matching code builds the model.
        builders = (
            (self.LogReg, self._buildLogReg),
            (self.Norm, self._buildNorm),
            (self.NaiveBayes, self._buildNaiveBayes),
            (self.Gauss, self._buildGauss),
            (self.GMM, self._buildGMM),
            (self.kNN, self._buildKNN),
            (self.LinReg, self._buildLinReg),
            (self.Perceptron, self._buildPerceptron),
            (self.MLP, self._buildMLP),
            (self.SVM, self._buildSVM),
            (self.DecisionTree, self._buildDecisionTree),
            (self.RandomForest, self._buildRandomForest),
        )
        for code, build in builders:
            if self.__classifier == code:
                self.__clf = build()
                self.__clf.fit(self.__samples, self.__labels)
                return
        print("unsupported classifier")

    def _buildLogReg(self):
        """Return an untrained linear logistic regression."""
        params = self.__parameters
        maxIter = params.getLogRegMaxNumIterations()
        learningRate = params.getLogRegLearningRate()
        return LinearLogisticRegression(learningRate=learningRate, maxIterations=maxIter)

    def _buildNorm(self):
        """Return an untrained norm classifier."""
        return NormClassifier(self.__parameters.getNormNorm())

    def _buildNaiveBayes(self):
        """Return a Gaussian classifier with samplesIndependent=True."""
        # self.__clf = naive_bayes.GaussianNB()
        return GaussianClassifier(samplesIndependent=True)

    def _buildGauss(self):
        """Return a Gaussian classifier with samplesIndependent=False."""
        return GaussianClassifier(samplesIndependent=False)

    def _buildGMM(self):
        """Return an untrained Gaussian-mixture classifier."""
        params = self.__parameters
        numComponents = params.getGmmNumComponentsPerClass()
        maxIterations = params.getGmmMaxNumIterations()
        return GMMClassifier(numComponents, maxIterations)

    def _buildKNN(self):
        """Return the scikit-learn or the own k-nearest-neighbour classifier."""
        params = self.__parameters
        algo = params.getKNNAlgorithm()
        k = params.getKNNNumberOfNeighbors()
        w = params.getKNNWeightFunction()
        if algo == 'scikit-learn':
            return neighbors.KNeighborsClassifier(k, weights=w)
        # 'own'
        if k == 1:
            return NearestNeighbor()
        return kNearestNeighbor(k)

    def _buildLinReg(self):
        """Return an untrained linear regression classifier."""
        params = self.__parameters
        lossFunc = params.getLinRegLossFunction()
        a = params.getLinRegLossFunctionParam()
        return LinearRegression(lossFunc, a, True)

    def _buildPerceptron(self):
        """Return an untrained perceptron."""
        params = self.__parameters
        maxIter = params.getPerceptronMaxNumIterations()
        learningRate = params.getPerceptronLearningRate()
        batchMode = params.getPerceptronBatchMode()
        return Perceptron(batchMode=batchMode, learningRate=learningRate, maxIterations=maxIter)

    def _buildMLP(self):
        """Return an untrained scikit-learn multi-layer perceptron."""
        params = self.__parameters
        layers = params.getMLPHiddenLayers()
        act = params.getMLPActivationFunction()
        algo = params.getMLPOptimizationAlgorithm()
        alpha = params.getMLPAlpha()
        rate = params.getMLPLearningRate()
        return sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=layers,
            activation=act,
            algorithm=algo,
            alpha=alpha,
            learning_rate=rate)

    def _buildSVM(self):
        """Return the SVM variant chosen by the parameters.

        Any algorithm name other than the four handled explicitly selects
        KernelSVM.
        """
        params = self.__parameters
        algorithm = params.getSVMAlgorithm()
        kernel = params.getSVMKernel()
        C = params.getSVMC()
        gamma = params.getSVMGamma()
        coef0 = params.getSVMCoef0()
        degree = params.getSVMDegree()
        if algorithm == 'LinearSVC':
            return svm.LinearSVC(C=C)
        if algorithm == 'SVC':
            return svm.SVC(kernel=kernel, C=C, gamma=gamma, coef0=coef0, degree=degree)
        if algorithm == 'HardMarginSVM':
            return HardMarginSVM()
        if algorithm == 'SoftMarginSVM':
            return SoftMarginSVM(C=C)
        return KernelSVM(C=C, gamma=gamma)

    def _buildDecisionTree(self):
        """Return the scikit-learn or the own decision tree."""
        params = self.__parameters
        algorithm = params.getDecisionTreeAlgorithm()
        criterion = params.getDecisionTreeCriterion()
        splitter = params.getDecisionTreeSplitter()
        maxDepth = params.getDecisionTreeMaxDepth()
        minSamplesSplit = params.getDecisionTreeMinSamplesSplit()
        minSamplesLeaf = params.getDecisionTreeMinSamplesLeaf()
        minWeightedFractionLeaf = params.getDecisionTreeMinWeightedFractionLeaf()
        maxLeafNodes = params.getDecisionTreeMaxLeafNodes()
        trials = params.getDecisionTreeNumTrialsPerSplit()
        if algorithm == 'sklearn':
            return tree.DecisionTreeClassifier(
                criterion=criterion,
                splitter=splitter,
                max_features=2,
                max_depth=maxDepth,
                min_samples_split=minSamplesSplit,
                min_samples_leaf=minSamplesLeaf,
                min_weight_fraction_leaf=minWeightedFractionLeaf,
                max_leaf_nodes=maxLeafNodes)
        return DecisionTree(maxDepth, minSamplesLeaf, trials)

    def _buildRandomForest(self):
        """Return the scikit-learn or the own random forest."""
        params = self.__parameters
        algorithm = params.getRandomForestAlgorithm()
        numTrees = params.getRandomForestNumTrees()
        criterion = params.getRandomForestCriterion()
        maxDepth = params.getRandomForestMaxDepth()
        minSamplesSplit = params.getRandomForestMinSamplesSplit()
        minSamplesLeaf = params.getRandomForestMinSamplesLeaf()
        minWeightedFractionLeaf = params.getRandomForestMinWeightedFractionLeaf()
        maxLeafNodes = params.getRandomForestMaxLeafNodes()
        trials = params.getRandomForestNumTrialsPerSplit()
        if algorithm == 'sklearn':
            return ensemble.RandomForestClassifier(
                n_estimators=numTrees,
                criterion=criterion,
                max_features=2,
                max_depth=maxDepth,
                min_samples_split=minSamplesSplit,
                min_samples_leaf=minSamplesLeaf,
                min_weight_fraction_leaf=minWeightedFractionLeaf,
                max_leaf_nodes=maxLeafNodes)
        return RandomForest(numTrees, maxDepth, minSamplesLeaf, trials)

    def runFeatureSpaceComputations(self):
        """Classify every pixel of the coordinate system and return a QImage.

        Returns None when no model is trained or when the model's predict()
        returns None.
        """
        if not self.__clf:
            return None

        x_min, y_min, x_max, y_max = self.__featurespace.coordinateSystem.getLimits()
        ppuX, ppuY = self.__featurespace.coordinateSystem.getPixelsPerUnit()

        # One sample per pixel: x ascending, y descending from y_max.
        x_values = numpy.arange(x_min, x_max, 1.0 / ppuX)
        y_values = numpy.arange(y_max, y_min, -(1.0 / ppuY))
        w = len(x_values)
        h = len(y_values)

        xx, yy = numpy.meshgrid(x_values, y_values)
        grid = numpy.c_[xx.ravel(), yy.ravel()]
        Z = self.__clf.predict(grid)
        if Z is None:
            return None

        Z = Z.astype(numpy.int64)
        # Replace each class index by the first value rgbForClass returns.
        for k in range(Parameters.NUMBER_SUPPORTED_CLASSES):
            col, _, _ = MyColors.rgbForClass(k)
            Z = numpy.where(Z == k, col, Z)
        Z = Z.astype(numpy.int32)

        img = QtGui.QImage(Z, w, h, QtGui.QImage.Format_RGB32)
        # img.save('test.png')
        return img
def main(cv=False, kaggle=True, num_Trees=10, verbose=False):
    """Train and evaluate (or submit) a random forest on ``hw4-data.csv``.

    The CSV's first line is skipped as a header; the last column is the
    integer label and the remaining columns are float features.

    :param cv: when equal to True, run 10-fold cross-validation (fold ``ii``
        tests on rows whose index satisfies ``index % 10 == ii``) and print
        each fold's accuracy and the average.
    :param kaggle: used only when ``cv`` is not True. When equal to True,
        fit on the full data and call ``generateSubmissionFile``; otherwise
        evaluate once on the fold with ``ii == 3``.
    :param num_Trees: number of trees passed to ``RandomForest``.
    :param verbose: verbosity flag passed to ``RandomForest``.
    """
    X = []
    y = []

    # Load data set
    with open("hw4-data.csv") as f:
        next(f, None)  # skip the header line
        for line in csv.reader(f, delimiter=","):
            X.append(line[:-1])
            y.append(line[-1])
    X = np.array(X, dtype=float)
    y = np.array(y, dtype=int)

    K = 10
    # ``== True`` is kept deliberately so that other truthy values such as
    # cv=5 still select the single-split path, as before.
    if cv == True:  # noqa: E712
        cv_accuracy = []
        for ii in range(K):
            X_train, y_train, X_test, y_test = _main_fold_split(X, y, K, ii)
            randomForest = _main_fit_timed(X_train, y_train, num_Trees, verbose)
            cv_accuracy.append(_main_test_accuracy(randomForest, X_test, y_test))
        print("average cv accuracy: %.4f" % np.mean(cv_accuracy))
    elif kaggle == True:  # noqa: E712
        # Use the full data for the submission model.
        randomForest = _main_fit_timed(X, y, num_Trees, verbose)
        generateSubmissionFile(myname, randomForest)
    else:
        X_train, y_train, X_test, y_test = _main_fold_split(X, y, K, 3)
        randomForest = _main_fit_timed(X_train, y_train, num_Trees, verbose)
        _main_test_accuracy(randomForest, X_test, y_test)


def _main_fold_split(X, y, K, fold):
    """Return (X_train, y_train, X_test, y_test); test rows have index % K == fold."""
    is_test = np.arange(len(X)) % K == fold
    return X[~is_test], y[~is_test], X[is_test], y[is_test]


def _main_fit_timed(X_train, y_train, num_Trees, verbose):
    """Fit a RandomForest, print the elapsed fitting time and return the model."""
    randomForest = RandomForest(num_trees=num_Trees, verbose=verbose)
    t0 = time()
    randomForest.fit(X_train, y_train)
    t1 = time()
    print("time elapses = %.3f s" % (t1 - t0))
    return randomForest


def _main_test_accuracy(randomForest, X_test, y_test):
    """Print and return the fraction of test rows predicted correctly."""
    y_predicted = randomForest.predict(X_test)
    results = [prediction == truth
               for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    print("test accuracy: %.4f" % accuracy)
    return accuracy