def run(self):
    forest = RandomForest()
    boost = GradientBoost()
    train_x, train_y, test_x, test_y, feature_names = self.load(
        "./data/parker_sleeps.csv")

    forest.train(train_x, train_y, feature_names)
    f_accuracy, forest_predictions = forest.predict(test_x, test_y)
    print(" Results")
    print("--------- Random Forest Technique --------- \n\n")
    print("Feature number\tFeatures\tPrediction\tActual")
    for i in range(len(test_x)):
        # Report the test-set features, matching the predictions and actuals.
        print("{0}\t{1}\t{2}\t{3}".format(i, test_x.iloc[i].values,
                                          forest_predictions[i],
                                          test_y.iloc[i].values[0]))
    print("\nRandom Forest Accuracy (MSE) {:.4f}".format(f_accuracy))

    boost.train(train_x, train_y, feature_names)
    b_accuracy, gb_predictions = boost.predict(test_x, test_y)
    print("\n\n--------- Gradient Boost Technique --------- \n\n")
    print("Feature number\tFeatures\tPrediction\tActual")
    for i in range(len(test_x)):
        print("{0}\t{1}\t{2:.2f}\t{3}".format(i, test_x.iloc[i].values,
                                              gb_predictions[i],
                                              test_y.iloc[i].values[0]))
    print("\nGradient Boost Accuracy (MSE) {:.4f}".format(b_accuracy))
class Stacking:
    def fit(self, X, y):
        # Level-0 models: fit each base learner and collect its
        # training-set predictions.
        self.rf = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf.fit(X, y)
        y_rf = self.rf.predict(X)

        self.nb = NaiveBayes()
        self.nb.fit(X, y)
        y_nb = self.nb.predict(X)

        self.knn = KNN(k=3)
        self.knn.fit(X, y)
        y_knn = self.knn.predict(X)

        # Level-1 model: a decision tree trained on the base predictions.
        newX = np.array([y_rf, y_nb, y_knn]).transpose()
        model = DecisionTree(max_depth=np.inf,
                             stump_class=DecisionStumpErrorRate)
        self.model = model
        model.fit(newX, y)

    def predict(self, X):
        y_rf = self.rf.predict(X)
        y_nb = self.nb.predict(X)
        y_knn = self.knn.predict(X)
        x_test = np.array([y_rf, y_nb, y_knn]).transpose()
        return self.model.predict(x_test)
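# A minimal usage sketch for the Stacking class above. It assumes the
# custom RandomForest, NaiveBayes, KNN, and DecisionTree classes are
# importable; the synthetic dataset below is purely illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

stacker = Stacking()
stacker.fit(X_train, y_train)
y_pred = stacker.predict(X_test)
print("stacking accuracy:", np.mean(y_pred == y_test))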
def main(argv):
    if len(argv) != 3:
        sys.exit(f"Usage: python3 {argv[0]} <testing_file> <tree_json_dir>")
    rna_ids, _, test_x = parse_data.read_data(argv[1], skip_header=False,
                                              delimiter=",")
    json_files = glob.glob(f"{argv[2]}/*.json")
    forest = RandomForest()

    # The per-tree weights live alongside the tree files; load them separately.
    weights_filename = f"{argv[2]}/tree_weights.json"
    json_files.remove(weights_filename)
    with open(weights_filename, "r") as weights_file:
        weights = json.loads(weights_file.read())

    for filename in json_files:
        with open(filename, "r") as tree_file:
            tree = DecisionTree.from_json(tree_file.read())
        forest.add_tree(tree)
        forest.weights.append(weights[filename])

    for i, x in enumerate(test_x):
        prediction, confidence = forest.predict_with_confidence(x)
        # Report confidence as the probability of the positive class.
        if prediction == 0.0:
            confidence = 1 - confidence
        print(f"{rna_ids[i]},{confidence}")
def main(): """ N.B. Last DataFrame Column contains labels """ logger = logging.getLogger(__name__) logger.debug('read data') dframe_train = pd.read_excel(os.path.join(input_filepath, "train_data.xlsx"), index_col=0) logger.debug('train model') '''CREATE SINGLE TREE''' d_t = DecisionTree(metrics = 'entropy') #max_depth = 8 #trained_dt = dt.build_tree(dframe,header) #prediction = classify(small_train.values[0][:-1],t0) '''CREATE RANDOM FOREST WITH TREES d_t''' r_f = RandomForest(decision_tree_type=d_t, n_trees=20) r_f = r_f.build_forest(dframe_train, n_selected_features="best", sample_ratio =.8) '''GET MODEL ACCURACY ON VALIDATION DATA''' logger.debug('get model accuracy') dframe_val = pd.read_excel(os.path.join(input_filepath, "validate_data.xlsx"), index_col=0) predictions_validation = r_f.get_model_accuracy(dframe_val.columns.values.tolist(), dframe_val) #logger.debug('single prediction') #rf.classify_forest(dframe_val.columns.values.tolist(),dframe_val.values[0],forest) logger.debug('save model') save_model(output_filepath, "model_00.npy", r_f)
def run_random_forest(data, target_column):
    st.sidebar.title('Choose parameters for Random Forest')
    ts = st.sidebar.slider('Training size', min_value=0.0, max_value=1.0,
                           step=0.01, value=0.7)
    n_estimators = st.sidebar.number_input('n_estimators', min_value=1,
                                           max_value=1000, step=1)
    n_features = st.sidebar.number_input('n_features', min_value=1,
                                         max_value=len(data.columns) - 1,
                                         step=1, value=len(data.columns) - 1)
    bootstrap_size = st.sidebar.number_input('bootstrap_size', min_value=1,
                                             max_value=int(len(data) * ts),
                                             step=1, value=int(len(data) * ts))
    if st.sidebar.checkbox('Specify Depth'):
        max_depth = st.sidebar.number_input('max_depth', min_value=1,
                                            max_value=int(len(data) * ts),
                                            step=1)
    else:
        max_depth = None

    run_status = st.sidebar.button('Run Algorithm')
    if run_status:
        with st.spinner('Running...'):
            x_train, x_test, y_train, y_test = train_test_split(
                data.drop([target_column], axis=1), data[target_column],
                test_size=1 - ts)
            clf = RandomForest(n_estimators=n_estimators,
                               n_features=n_features,
                               max_depth=max_depth,
                               bootstrap_size=bootstrap_size)
            clf.fit(x_train, y_train)
            """
            ## :dart: Accuracy
            """
            st.subheader(accuracy_score(y_test, clf.predict(x_test)))
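# A minimal sketch of driving the Streamlit page above, assuming a CSV
# with a 'label' column; the file name and column name are hypothetical.
# Launch with: streamlit run app.py
import pandas as pd

df = pd.read_csv('dataset.csv')  # hypothetical input file
run_random_forest(df, target_column='label')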
def validate(self, folds, num_trees):
    attributes = folds[0][0]
    ok = 0
    error = 0
    # The attribute header is the first row of each fold; strip it once.
    for fold in folds:
        del fold[0]
    for i in range(len(folds)):
        folds_copy = copy.deepcopy(folds)
        test = copy.deepcopy(folds_copy[i])
        del folds_copy[i]
        training = sum(folds_copy, [])
        training.insert(0, attributes)
        forest = RandomForest(copy.deepcopy(training))
        trees = forest.get_forest(num_trees)
        for instance in test:
            # The class label is the last column of each instance.
            real_class = instance[len(instance) - 1]
            del instance[len(instance) - 1]
            prediction = forest.predict(trees, instance)
            if prediction == real_class:
                ok += 1
            else:
                error += 1
    print("Total: " + str(ok + error))
    print("Correct: " + str(ok))
    print("Wrong: " + str(error))
    print("Percentage: " + str(ok / (ok + error)))
def main(argv): """ entry point to the program """ if len(argv) != 3: sys.exit(f"Usage python3 {argv[0]} <testing_file> <tree_json_dir>") _, test_y, test_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",") json_files = glob.glob(f"{argv[2]}/*.json") forest = RandomForest() for filename in json_files: with open(filename, "r") as tree_file: tree = DecisionTree.from_json(tree_file.read()) forest.add_tree(tree) total_right = 0 for i, point in enumerate(test_x): expected = forest.predict(point) if test_y[i] == expected: total_right += 1 accuracy = total_right / len(test_y) print(f"Accuracy: {accuracy}")
def main():
    predictorRF = RandomForest()
    predictorSVM = SVM()
    predictorLogistic_Regression = Logistic_Regression()

    print('======== Random Forest ========')
    predictorRF.run()
    print('======== SVM ========')
    predictorSVM.run()
    print('======== Logistic Regression ========')
    predictorLogistic_Regression.run()
def main():
    columns, x_train, y_train, x_test, y_test = preprocessing()
    numeric_attrs = ['age', 'hours-per-week', 'capital-gain', 'capital-loss']
    all_x = np.vstack((x_train, x_test))

    random_forest_ID3 = RandomForest(columns[:-1], numeric_attrs,
                                     Criterion.ID3, all_x, 10)
    decision_tree_ID3 = DecisionTreeClassifier(columns[:-1], numeric_attrs,
                                               Criterion.ID3)
    random_forest_GINI = RandomForest(columns[:-1], numeric_attrs,
                                      Criterion.GINI, all_x, 10)
    decision_tree_GINI = DecisionTreeClassifier(columns[:-1], numeric_attrs,
                                                Criterion.GINI)
    decision_tree_ID3.set_attribute_values(all_x)
    decision_tree_GINI.set_attribute_values(all_x)

    validation = Validation(x_train, y_train, x_test, y_test)
    print('K-fold validation:\n\n')

    print('ID3 criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')
    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_ID3)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('GINI criterion:\n')
    print('Random forest:\n')
    score = validation.score_cross_val(3, random_forest_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')
    print('Decision tree:\n')
    score = validation.score_cross_val(3, decision_tree_GINI)
    print(f'Mean accuracy: {np.array(score[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(score[Measure.SPEC]).mean()}\n')

    print('Final model: Random Forest\n')
    print('Final results:\n')
    final_measure = validation.final_measure(random_forest_ID3)
    print(f'Mean accuracy: {np.array(final_measure[Measure.ACC]).mean()}\n')
    print(f'Mean specificity: {np.array(final_measure[Measure.SPEC]).mean()}\n')

    print('\n\nAn example decision tree trained on all the data is '
          'available at out/resultat.txt')
    # Print a decision tree trained on all the data for visualization,
    # even though it is not the best model.
    x_data = np.vstack((x_train, x_test))
    y_data = np.hstack((y_train, y_test))
    decision_tree_ID3.fit(x_data, y_data)
    write_to_file(decision_tree_ID3)
def main(cl_args=sys.argv[1:]):
    """Main wrapper to run the classification app."""
    args = parse_command_line_args(cl_args=cl_args)
    datafile = os.path.realpath(args["data_path"])
    testfile = os.path.realpath(args["test_path"])
    output_root = os.path.realpath(args["output_path"])
    random_seed = args["random_seed"]

    info_level = "INFO"
    if args["verbose"]:
        info_level = "DEBUG"
    # Configure the logger.
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        level=info_level)

    num_trees = args["num_trees"]
    if not os.path.exists(output_root):
        os.makedirs(output_root)

    # Annotated training data
    data = fileio(datafile)
    # Annotated test data
    testdata = fileio(testfile)

    # Invoke the unit tests for the decision tree.
    if args["run_tests"]:
        logger.info("Invoking test case for decision trees")
        suite = unittest.TestLoader().loadTestsFromTestCase(TestDecisionTree)
        unittest.TextTestRunner().run(suite)
        sys.exit()

    # Create a set of test points from the test data (drop the label column).
    test_points = []
    for line in testdata:
        test_points.append(line[0:len(line) - 1])

    # Single decision tree for the entire credit approval dataset.
    tree_credit = DecisionTree()
    logger.info('Commencing single Decision Tree for credit approval data')
    start_time = time.time()
    tree_credit.build_tree(data)
    end_time = time.time()
    logger.info('time_lapsed: {:0.4f}'.format(end_time - start_time))
    tree_credit.drawtree(jpeg=os.path.join(output_root, 'singletree.png'))

    # Random forest of decision trees for the credit approval dataset.
    logger.info('Commencing Random Forest for credit approval data')
    forest = RandomForest(num_trees)
    start_time = time.time()
    forest.build_forest(data, output_path=output_root, seed=random_seed)
    end_time = time.time()
    logger.info('Time to build forest: {:0.4f}'.format(end_time - start_time))
    evaluate(test_points, forest, testdata)
def run_randomforest(train_examples, train_labels, attributes,
                     test_examples, test_labels, n_trees):
    amin, bim = get_data(train_examples, test_labels)
    rforest = RandomForest(entropy, 2, n_trees, len(attributes))
    if bim > 0:
        rforest.train_dataset(train_examples, attributes, train_labels)
    else:
        clabel = None
    error = 0
    if amin == 0:
        preds, error = rforest.test_dataset(test_examples, test_labels)
    return error
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    print("X_train.shape:", X_train.shape)
    print("y_train.shape:", y_train.shape)

    n_features = X_train.shape[1]
    clf = RandomForest(n_estimators=100,
                       max_features=int(np.sqrt(n_features)))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Our RF Accuracy:", accuracy)

    # Compare against scikit-learn's implementation with the same settings.
    clf = ensemble.RandomForestClassifier(n_estimators=100,
                                          max_features=int(np.sqrt(n_features)))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("sklearn RF Accuracy:", accuracy)
def test_random_forest(self):
    iris = load_iris()
    X = iris['data']
    y = iris['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    model = RandomForest(num_trees=5, max_depth=5)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    score = accuracy_score(y_test, preds)
    self.assertGreater(score, 0.9)
def cross_validation(dataset, attributes, percentage_train, folds, ntrees,
                     nattributes=-1, depth_limit=None):
    """
    :param dataset: the full dataset
    :param attributes: the attributes
    :param percentage_train: float, fraction of instances that goes to the
        train partition
    :param folds: int, number of holdouts to execute
    :param ntrees: int, number of trees in each ensemble
    :param nattributes: int, number of sampled attributes (if -1, use
        sqrt(total))
    :param depth_limit: optional maximum tree depth
    :return: mean and standard deviation of accuracy, precision, and recall
    """
    if nattributes == -1:
        nattributes = int(math.sqrt(len(dataset.columns)))
    accuracies = []
    precisions = []
    recalls = []
    for fold in range(1, folds + 1):
        train_dataset, test_dataset = holdout(dataset, percentage_train)
        rf = RandomForest(train_dataset, attributes, ntrees, nattributes,
                          depth_limit)
        accuracy, precision, recall = test_RF(rf, test_dataset)
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
    return (mean(accuracies), stdev(accuracies), mean(precisions),
            stdev(precisions), mean(recalls), stdev(recalls))
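# A minimal sketch of calling cross_validation, assuming a pandas DataFrame
# whose columns are the attributes plus a label column; the file name and
# 'label' column are hypothetical placeholders.
import pandas as pd

df = pd.read_csv('dataset.csv')                  # hypothetical input file
attrs = [c for c in df.columns if c != 'label']  # hypothetical label column
results = cross_validation(df, attrs, percentage_train=0.8, folds=5, ntrees=10)
print('accuracy %.3f (+/- %.3f)' % results[:2])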
def __init__(self, train_size=0.7):
    self.predictors = [
        MyLogisticRegression(scale_data=True),
        RandomForest(6, 3, 200),
        SVC()]
    # Meta-model that blends the base predictors' outputs.
    self.blender = MyKNN(type='classifier', use_weights=False)  # DecisionTree(2, 1)
    self.train_size = train_size
def evaluate_performance(trials=100):
    filename = '../data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = data[:, 0]
    n, d = X.shape

    accuracy_list = np.zeros(trials)
    precision_list = np.zeros(trials)
    recall_list = np.zeros(trials)
    for trial in range(trials):
        print('Trial #', trial)

        # Random shuffle
        idx = np.arange(n)
        np.random.seed(int(time() / 150))
        np.random.shuffle(idx)
        X = X[idx]
        Y = y[idx]

        # Split into train and test samples (80/20)
        train = int(0.8 * n)
        Xtrain = X[:train, :]
        Xtest = X[train:, :]
        Ytrain = Y[:train]
        Ytest = Y[train:]

        clf = RandomForest(n_trees=30, max_depth=100, ratio_per_tree=0.7,
                           ratio_features=0.3)
        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        accuracy_list[trial] = accuracy_score(Ytest, pred)
        precision_list[trial] = precision(Ytest, pred)
        recall_list[trial] = recall(Ytest, pred)

    # Rows: accuracy, precision, recall; columns: mean, stddev.
    stats = np.zeros((3, 3))
    stats[0, 0] = np.mean(accuracy_list)
    stats[0, 1] = np.std(accuracy_list)
    stats[1, 0] = np.mean(precision_list)
    stats[1, 1] = np.std(precision_list)
    stats[2, 0] = np.mean(recall_list)
    stats[2, 1] = np.std(recall_list)
    return stats
def run(self) -> None:
    """Creates a random forest model, trains it, and saves it."""
    model = RandomForest(self.window_args, self.forest_args)
    save_model(self.name, model)
    self.signals.float_result.emit(model.accuracy)
def fit(self, X, y):
    # Instantiate the input models.
    rf = RandomForest(num_trees=15)
    knn = KNN(k=3)
    nb = NaiveBayes(num_classes=2)

    # Random Forest: fit and predict.
    rf.create_splits(X)
    rf.fit(X, y)
    rf_pred = rf.predict(X)

    # K-Nearest Neighbors: fit and predict.
    knn.fit(X, y)
    knn_pred = knn.predict(X)

    # Naive Bayes: fit and predict.
    nb.fit(X, y)
    nb_pred = nb.predict(X)

    # Use the input models' predictions as inputs for the meta-classifier.
    meta_input = np.hstack((rf_pred.reshape((rf_pred.size, 1)),
                            knn_pred.reshape((knn_pred.size, 1)),
                            nb_pred.reshape((nb_pred.size, 1))))

    # Use a decision tree as the meta-classifier.
    dt = DecisionTree(max_depth=np.inf)
    dt.fit(meta_input, y)

    self.rf = rf
    self.knn = knn
    self.nb = nb
    self.meta_classifier = dt
def run_forest(n_trees=400, n_features_per_tree=70, n_rows_power=0.5):
    start_time = time.time()
    forest = RandomForest(trainset, labels, n_trees, n_features_per_tree,
                          n_rows_power)
    forest.train()
    score = []
    for test, (i, bm) in zip(testset, benchmarkset):
        result = forest.classify(test)
        # classify() returns a Counter of votes; take the majority.
        result = result.most_common(1)[0][0]
        print('{:.1f}s -- No.{} should be: <{}> cal: {}'.format(
            time.time() - start_time, i, bm, result))
        if result == bm:
            score.append('+')
        else:
            score.append('-')
        with open('result_by_self_train[28000].csv', 'a') as f:
            f.write('{},{},<{}>\n'.format(i, result, bm))
    # print('classify done {}, {:.1%}'.format(Counter(score), Counter(score)['+'] / len(score)))
    return Counter(score)['+'] / len(score)
def fit_emotion(self, emotion_number, X, y):
    # Convert the multi-class labels into a binary target for this emotion.
    binary_y = binarize_y(y, emotion_number)
    tree = Tree()
    if self.random_forest:
        tree = RandomForest(num_of_trees=self.num_of_trees)
    tree.fit(self.predictors, X, binary_y)
    return tree
def init_model(model_type, delta, area_width):
    if model_type == 'LR':
        return LogisticRegression(delta, area_width)
    elif model_type == 'DT':
        return DecisionTree(delta)
    elif model_type == 'RF':
        return RandomForest(delta)
    else:
        raise SolverException('Invalid model type: ' + Fore.MAGENTA +
                              model_type + Fore.RESET)
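# A minimal sketch of the dispatch above; the delta and area_width values
# are illustrative placeholders, and any unknown model code raises
# SolverException.
model = init_model('RF', delta=0.1, area_width=5)
try:
    init_model('??', delta=0.1, area_width=5)
except SolverException as exc:
    print(exc)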
def run(self):
    logger.info('Select tickers')
    tickers = self.choose_ticker.random_tickers(self.number_of_tickers)
    logger.info(f'The selected tickers are {tickers}')
    # for ticker in ['MCHP']:
    for ticker in tickers:
        logger.info(f'Starting with ticker {ticker}')
        data = self.data_loader.load(ticker)
        self.graph_maker.plot_adjusted_prices(ticker, data)
        data = self.technical_analysis.calculate(data)
        logger.info('Technical analysis graphs')
        (train, test, validation,
         train_graph, test_graph, validation_graph) = self.data_loader.transform(
            data, number_of_past_points=self.number_of_past_points)
        self.graph_maker.plot_train_test_val(
            ticker, train_graph, test_graph, validation_graph)
        for position, model_name in enumerate(
                [element.get('model') for element in self.models_and_parameters]):
            if model_name == 'feed_forward_neural_net':
                model = FeedForwardNN(
                    dimension_of_first_layer=self.number_of_past_points
                    * train[0][0].shape[1],
                    ticker=ticker,
                    overfitting_threshold=self.overfitting_threshold,
                )
            if model_name == 'random_forest':
                model = RandomForest(
                    ticker=ticker,
                    overfitting_threshold=self.overfitting_threshold)
            if model_name == 'xgboost':
                model = XGBoost(
                    ticker=ticker,
                    overfitting_threshold=self.overfitting_threshold)
            if model_name == 'Arima':
                model = Arima(
                    ticker=ticker,
                    overfitting_threshold=self.overfitting_threshold)
            if model_name == 'regressors':
                model = Regressors(
                    ticker=ticker,
                    overfitting_threshold=self.overfitting_threshold)
            (best_parameters, mse, trend_ratio, prediction, true_values,
             there_is_a_best_prediction) = model.run(
                train=train[::-1], val=validation[::-1], test=test[::-1],
                model_parameters=self.models_and_parameters[position])
            logger.info(
                f'The best scenario for a {model_name} is {best_parameters}, '
                f'mse: {mse}, ratio of trend {trend_ratio * 100}')
            if there_is_a_best_prediction:
                self.graph_maker.plot_test_results(true_values, prediction,
                                                   ticker, mse, model_name)
            else:
                logger.info(
                    f'No model could be fitted for {model_name} due to the '
                    f'overfitting threshold of {self.overfitting_threshold}')
def test_random_forest():
    goal_attr = 'play'
    attr = 'wind'
    attr_universe = ['strong', 'weak']
    attr_2 = 'weather'
    attr_2_universe = ['sunny', 'cloudy', 'rainy']
    attr_3 = 'temperature'
    attr_3_universe = ['cold', 'norm', 'hot']
    attr_4 = 'humidity'
    attr_4_universe = ['norm', 'high']
    df = {
        goal_attr: [0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
        attr: ['weak', 'strong', 'weak', 'weak', 'weak', 'strong', 'strong',
               'weak', 'weak', 'weak', 'strong', 'strong', 'weak'],
        attr_2: ['sunny', 'sunny', 'cloudy', 'rainy', 'rainy', 'rainy',
                 'cloudy', 'sunny', 'sunny', 'rainy', 'sunny', 'cloudy',
                 'cloudy'],
        attr_3: ['hot', 'hot', 'hot', 'norm', 'cold', 'cold', 'cold', 'norm',
                 'cold', 'norm', 'norm', 'norm', 'hot'],
        attr_4: ['high', 'high', 'high', 'high', 'norm', 'norm', 'norm',
                 'high', 'norm', 'norm', 'norm', 'high', 'norm']
    }
    df = pd.DataFrame(df)
    forest = RandomForest(shanon_gain)
    forest.train(10, 1, 4, df, {goal_attr: [0, 1]}, {
        attr: attr_universe,
        attr_2: attr_2_universe,
        attr_3: attr_3_universe,
        attr_4: attr_4_universe})
    case = {
        attr: 'strong',
        attr_2: 'rainy',
        attr_3: 'norm',
        attr_4: 'high'
    }
    result = forest.predict(case)
    expected = 0
    assert result == expected
class TestModelMethods(unittest.TestCase):
    '''Test the methods of the random forest model wrapper.'''

    def setUp(self):
        '''Test runner setup.'''
        self.rf = RandomForest()
        self.rf.model_file = 'testModel.pth'

    def testDataVariables(self):
        '''Test whether feature and target variables are defined in the model.'''
        self.assertGreater(len(self.rf.features), 0)
        self.assertGreater(len(self.rf.target), 0)

    def testDataLoading(self):
        '''Test the data loading.'''
        for dataset in ['train', 'val', 'test', 'full']:
            self.assertGreater(len(self.rf.data[dataset]), 0)
            self.assertGreater(len(self.rf.samples[dataset]), 0)
            self.assertGreater(len(self.rf.labels[dataset]), 0)

    def testSaveExistingModel(self):
        '''Test the save method for a blank model.'''
        self.rf.model = RandomForestRegressor()
        self.assertTrue(self.rf._saveModel())

    def testSaveNonExistingModel(self):
        '''Test the save method for a non-existing model.'''
        self.rf.model = None
        self.assertFalse(self.rf._saveModel())

    def testLoadExistingModel(self):
        '''Test the load method for a previously saved model.'''
        self.rf._saveModel()
        self.assertTrue(self.rf._loadModel())

    def testLoadNonExistingModel(self):
        '''Test the load method for a non-existing model path.'''
        self.rf.model_file = 'non-existing/path.x'
        self.assertFalse(self.rf._loadModel())
def testRF():
    train_x, train_y, dev_x, dev_y, test_x, test_y = util.loadTreeData()
    RF = RandomForest(min_leaf_size=50, max_depth=10, num_trees=10)
    RF.fit(train_x, train_y)
    preds = RF.predict(train_x)
    train_rf_rmse = util.findRMSE(preds, train_y)
    preds = RF.predict(test_x)
    test_rf_rmse = util.findRMSE(preds, test_y)
    print('RF RMSE: \t', test_rf_rmse)
    return train_rf_rmse, test_rf_rmse
def main():
    parser = argparse.ArgumentParser(description='Random Forest parser')
    parser.add_argument('--opt', help='test-benchmark or test-dataset.',
                        required=True)
    parser.add_argument('--dataset', help='The dataset filename.',
                        default='', required=False)
    parser.add_argument('--target_attribute',
                        help='Target attribute to be predicted.',
                        default='', required=False)
    parser.add_argument('--n_trees',
                        help='The number of trees. The default is 5.',
                        default=5, type=int, required=False)
    parser.add_argument('--n_attributes',
                        help='The number of attributes. The default is the '
                             'square root of the total attributes.',
                        default=-1, type=int, required=False)
    parser.add_argument('--k_folds',
                        help='The number of folds for cross validation. '
                             'The default is 5.',
                        default=5, type=int, required=False)
    parser.add_argument('--r',
                        help='The number of repetitions for repeated cross '
                             'validation. The default is 1.',
                        default=1, type=int, required=False)
    args = parser.parse_args()

    if args.opt == 'test-benchmark':
        test_benchmark_categorical()
        test_benchmark_numerical()
    if args.opt == 'test-dataset':
        if args.dataset == '' or not os.path.isfile(DATA_PATH + args.dataset):
            print('Dataset not found.')
            return
        try:
            with open(DATA_PATH + args.dataset[:-3] + 'json', 'r') as filetypes:
                types = json.load(filetypes)
        except (FileNotFoundError, json.JSONDecodeError):
            print('Dataset types not found, automatic types will be used.')
            types = {}
        data = pd.read_csv(
            DATA_PATH + args.dataset,
            delimiter='\t' if args.dataset[-3:] == 'tsv' else ',',
            dtype=types
        )
        if args.target_attribute not in data.columns:
            print("Target attribute doesn't exist on dataset.")
            return
        n_trees = args.n_trees
        n_random_attributes = args.n_attributes
        if n_random_attributes == -1:
            # Default to the square root of the number of attributes.
            n_random_attributes = int((len(data.columns) - 1) ** 0.5)
        cv = CrossValidator(
            RandomForest(n_trees, args.target_attribute, n_random_attributes)
        )
        cv.cross_validate(data, args.k_folds, args.r)
        print('\nGlobal accuracy: %.3f (%.3f)' % (cv.accuracy, cv.accuracy_std))
def construct_RF(self):
    # Build one forest on the full data and a second on the "difficult"
    # region, then pool their trees.
    full_dataset = self.df
    difficult_area = self.create_difficult_area()
    critical_dataset = self.df.iloc[difficult_area]
    rf1 = RandomForest(full_dataset, 5, self.s * (1 - self.p))
    trees_1 = rf1.construct_trees()
    rf2 = RandomForest(critical_dataset, 5, self.s * self.p)
    trees_2 = rf2.construct_trees()
    trees = trees_1 + trees_2
    return trees
def main():
    X = list()
    y = list()
    XX = list()  # Contains both data features and data labels
    numerical_cols = {0, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20}  # indices of numeric attributes (columns)

    # Loading the data set
    print("reading input data")
    with open("data.csv") as f:
        next(f, None)
        for line in csv.reader(f, delimiter=","):
            xline = []
            for i in range(len(line)):
                if i in numerical_cols:
                    xline.append(ast.literal_eval(line[i]))
                else:
                    xline.append(line[i])
            X.append(xline[:-1])
            y.append(xline[-1])
            XX.append(xline[:])

    # VERY IMPORTANT: minimum forest_size should be 10
    forest_size = 10

    # Initializing a random forest.
    randomForest = RandomForest(forest_size)

    # Creating the bootstrapping datasets
    print("creating the bootstrap datasets")
    randomForest.bootstrapping(XX)

    # Building the trees in the forest
    print("fitting the forest")
    randomForest.fitting()

    # Calculating an unbiased error estimation of the random forest
    # based on the out-of-bag (OOB) error estimate.
    y_predicted = randomForest.voting(X)

    # Comparing predicted and true labels
    results = [prediction == truth
               for prediction, truth in zip(y_predicted, y)]

    # Accuracy
    accuracy = float(results.count(True)) / float(len(results))
    print("accuracy: %.4f" % accuracy)
    print("OOB estimate: %.4f" % (1 - accuracy))
def train_model(model_name, X_train, Y_train, X_val, Y_val):
    """
    Trains a supervised classifier using the training data provided, and
    scores it on the validation dataset.

    Params:
        - model_name: a string naming a model type
        - X_train, Y_train: training data
        - X_val, Y_val: validation data
    Returns:
        - model: a trained supervised classifier, to be used for testing
    """
    if model_name == 'svm':
        model = SVM()
    elif model_name == "random_forest":
        max_depth = 2
        model = RandomForest(max_depth)
    elif model_name == "neural_network":
        out_size = 2
        hidden_size = 30
        in_size = X_train.shape[1]
        model = NeuralNetwork(in_size, hidden_size, out_size)
    elif model_name == "knn":
        n_neighbors = 50
        model = KNearestNeighbors(n_neighbors)
    else:
        return "Error: Model not yet implemented..."

    print("Training " + model_name + "...")
    train_score = model.train(X_train, Y_train)
    valid_score = model.score(X_val, Y_val)
    print("Training Accuracy: %s" % train_score)
    print("Validation Accuracy: %s" % valid_score)
    return model
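# A minimal sketch of selecting and training a model by name. The random
# arrays are illustrative stand-ins for real data, and the model classes
# are assumed importable from the surrounding project.
import numpy as np

X_train = np.random.rand(100, 5)
Y_train = np.random.randint(0, 2, 100)
X_val = np.random.rand(20, 5)
Y_val = np.random.randint(0, 2, 20)
model = train_model('random_forest', X_train, Y_train, X_val, Y_val)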
booster.store_classification_result(out_file)
print('Extreme Gradient Boosting Completed. Labels saved. Moving Forward')

############## PART 3 #################
# 1. Xgboost - Gradient Boosting Machine
print('Beginning Extreme Gradient Boosting...')
out_file = './data_final/part_3/xgboost.csv'
'''Same code as above'''
booster.store_classification_result(out_file)
print('Extreme Gradient Boosting Completed. Labels saved. Moving Forward')

# 2. Random Forest - Entropy Index
print('Beginning Random Forest Classification (ENTROPY BASED) ...')
out_file = './data_final/part_3/rf_entropy.csv'
Kfold = 10
forest = RandomForest(Kfold, dataDir, 'entropy')
train_acc = forest.get_train_accuracy()
print("Train Accuracy Random Forest (Entropy): %f \n" % train_acc)
forest.store_classification_result(out_file)
print('Random Forest Classification Completed. Labels saved. Moving Forward.')

# 3. Random Forest - Gini Index
print('Beginning Random Forest Classification (GINI BASED) ...')
out_file = './data_final/part_3/rf_gini.csv'
Kfold = 10
forest = RandomForest(Kfold, dataDir, 'gini')
train_acc = forest.get_train_accuracy()
print("Train Accuracy Random Forest (Gini): %f \n" % train_acc)
forest.store_classification_result(out_file)
print('Random Forest Classification Completed. Labels saved. All done.')
def apply_random_forest(filename):
    rf = RandomForest(filename, estimators=500)
    # rf.split_data()
    rf.start_data()
    rf.train_random_forest()
    rf.test_random_forest()
    # Swap the train/test folds and evaluate again.
    rf.switch_folds()
    rf.train_random_forest()
    rf.test_random_forest()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Rescale labels for Adaboost to {-1, 1}
rescaled_y_train = 2 * y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2 * y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf=8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print("Training:")
print("\tAdaboost")
adaboost.fit(X_train, rescaled_y_train)
print("\tNaive Bayes")
naive_bayes.fit(X_train, y_train)
print("\tLogistic Regression")
logistic_regression.fit(X_train, y_train)
print("\tMultilayer Perceptron")
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
print("\tPerceptron")
def main():
    ##### Part 1: Vanilla Logistic Regression ##### Private Score: 50%
    # USE SHOBHIT's CODE

    ##### Part 2: XGBoost
    '''
    Kfold = 5
    dataDir = "data"
    booster = Xgboost(Kfold, dataDir)
    train_acc = booster.get_train_accuracy()
    print("Train Accuracy :%f \n" % train_acc)
    booster.store_classification_result()
    '''

    ##### Part 3: Ensembler
    print('Beginning Ensembling....')
    train_file = 'data/train.csv'
    test_file = 'data/test.csv'
    dataDir = 'data'

    # 1. Logistic Regressor - balanced - Sparse Greedy Features
    print('Beginning Logistic Regression ...')
    stupid_train_out_file = './data/results/lr_unbalanced_sparse_train.csv'
    out_file = './data/ensemble_results/lr_unbalanced_sparse.csv'
    clf = LogRes(train_file, test_file, stupid_train_out_file, out_file)
    clf.feature_selection(64, [3])
    print('Logistic Regression Completed. Labels saved. Moving Forward.')

    # 2. Random Forest - Entropy Index
    print('Beginning Random Forest Classification (ENTROPY BASED) ...')
    out_file = './data/ensemble_results/rf_entropy.csv'
    Kfold = 10
    forest = RandomForest(Kfold, dataDir, 'entropy')
    train_acc = forest.get_train_accuracy()
    print("Train Accuracy Random Forest (Entropy): %f \n" % train_acc)
    forest.store_classification_result(out_file)
    print('Random Forest Classification Completed. Labels saved. Moving Forward.')

    '''
    # 3. Random Forest - Gini Index
    print('Beginning Random Forest Classification (GINI BASED) ...')
    out_file = './data/ensemble_results/rf_gini.csv'
    Kfold = 10
    forest = RandomForest(Kfold, dataDir, 'gini')
    train_acc = forest.get_train_accuracy()
    print("Train Accuracy Random Forest (Gini): %f \n" % train_acc)
    forest.store_classification_result(out_file)
    print('Random Forest Classification Completed. Labels saved. Moving Forward.')

    # 4. Extra Trees - Entropy Index
    print('Beginning Extremely Randomized RF i.e. Extra Trees (ENTROPY BASED)...')
    out_file = './data/ensemble_results/extra_forest_entropy.csv'
    Kfold = 10
    trees = ExtraTrees(Kfold, dataDir, 'entropy')
    train_acc = trees.get_train_accuracy()
    print("Train Accuracy Extra Trees (Entropy): %f \n" % train_acc)
    trees.store_classification_result(out_file)
    print('Extra Trees (ENTROPY) Completed. Labels saved. Moving Forward.')

    # 5. Extra Trees - Gini Index
    print('Beginning Extremely Randomized RF i.e. Extra Trees (GINI BASED)...')
    out_file = './data/ensemble_results/extra_forest_gini.csv'
    Kfold = 10
    trees = ExtraTrees(Kfold, dataDir, 'gini')
    train_acc = trees.get_train_accuracy()
    print("Train Accuracy Extra Trees (Gini): %f \n" % train_acc)
    trees.store_classification_result(out_file)
    print('Extra Trees (GINI) Completed. Labels saved. Moving Forward.')
    '''

    # 6. Xgboost - Gradient Boosting Machine
    print('Beginning Extreme Gradient Boosting...')
    out_file = './data/ensemble_results/xgboost.csv'
    Kfold = 7
    booster = Xgboost(Kfold, dataDir)
    train_acc = booster.get_train_accuracy()
    print("Train Accuracy :%f \n" % train_acc)
    booster.store_classification_result(out_file)
    print('Extreme Gradient Boosting Completed. Labels saved')

    print('All models trained and test labels saved')
    print('All done... Exiting...')
    print('Exited')
# Load data
(cur_states, actions, rewards, next_states, users, action_index,
 user_index, valid_feats) = load_data(feat_path, target_action,
                                      num_users=num_users,
                                      num_users_ratio=num_users_ratio,
                                      demography=demography)
print(cur_states.shape, next_states.shape, actions.shape, rewards.shape,
      users.shape, len(action_index), len(user_index), sum(valid_feats))
# dim: cur_states, next_states = num_instances x num_features
# dim: actions, rewards = num_instances

num_feats = cur_states.shape[1]
num_actions = len(action_index)
action_list = [0 for x in range(num_actions)]
for a, i in action_index.items():
    action_list[i] = a
s0 = np.zeros((1, num_feats))

# Fit a random forest Q-function approximator, then iteratively refit it
# against the Bellman target (reward plus discounted best next-state Q).
approximator = RandomForest(num_estimators=num_estimators,
                            num_actions=num_actions)
approximator.train(cur_states, actions, rewards)

for iter in range(num_iters):
    print("---------------------------------------------------------------"
          "\nIteration", iter)
    start_time = time.time()
    qs = approximator.predict(next_states)  # dim: num_actions x num_instances
    max_qs = np.amax(qs, axis=0)  # dim: num_instances
    max_qs[actions == action_index[target_action]] = 0
    approximator.train(cur_states, actions, rewards + discount * max_qs)
    if debug_action_cnt:
        max_as = np.argmax(qs, axis=0)
        action_cnt = defaultdict(int)
        for a in max_as:
            action_cnt[action_list[a]] += 1
        print("action count: ", action_cnt)
Copyright Brian Dolhansky 2014
[email protected]
"""
from random_forest import RandomForest
from sklearn.datasets import fetch_openml
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print("Loading data...")
# fetch_mldata was removed from scikit-learn; fetch_openml is the closest
# current equivalent. MNIST labels arrive as strings, hence the cast.
mnist = fetch_openml('mnist_784', version=1, as_frame=False,
                     data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    mnist.data, mnist.target.astype(np.uint8))
train_target = integral_to_indicator(train_target)
test_target_integral = integral_to_indicator(test_target)
print("Done!")

np.seterr(all='ignore')
print("Training random forest...")
rf = RandomForest(21, 10, 10, boot_percent=0.3, feat_percent=0.1, debug=False)
rf.train(train_data, train_target)
print("Done training!")

print("Testing...")
yhat = rf.test(test_data, test_target_integral)
err = (np.sum(yhat != test_target[:, None]).astype(float)) / test_target.shape[0]
print("Error rate: {0}".format(err))
print("Done!")