def main(cl_args=None):
    """Main wrapper to run the classification app.

    Parses the command line, configures logging, loads the training and
    test files, then builds a single decision tree and a random forest on
    the training data and evaluates the forest against the test data.

    Args:
        cl_args: Command-line arguments without the program name. When
            ``None`` (the default), ``sys.argv[1:]`` is read at call time
            rather than being frozen at import time.

    Raises:
        SystemExit: When ``run_tests`` is set; the exit status is 0 if the
            decision-tree unit tests pass and 1 otherwise.
    """
    if cl_args is None:
        cl_args = sys.argv[1:]

    args = parse_command_line_args(cl_args=cl_args)
    datafile = os.path.realpath(args["data_path"])
    testfile = os.path.realpath(args["test_path"])
    output_root = os.path.realpath(args["output_path"])
    random_seed = args["random_seed"]
    num_trees = args["num_trees"]

    # Configure the logger; --verbose switches from INFO to DEBUG.
    info_level = "DEBUG" if args["verbose"] else "INFO"
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        level=info_level)

    if not os.path.exists(output_root):
        os.makedirs(output_root)

    # Annotated training data.
    data = fileio(datafile)
    # Annotated test data.
    testdata = fileio(testfile)

    # Invoking unit test for decision tree.
    if args["run_tests"]:
        logger.info("Invoking test case for decision trees")
        suite = unittest.TestLoader().loadTestsFromTestCase(TestDecisionTree)
        result = unittest.TextTestRunner().run(suite)
        # Propagate test failures through the exit status so that shells
        # and CI jobs do not see a failing run as a success.
        sys.exit(0 if result.wasSuccessful() else 1)

    # Creating a set of test points from test data: every element of each
    # row except the last one.
    # NOTE(review): presumably the last element is the label -- confirm
    # against fileio's output format.
    test_points = [line[:-1] for line in testdata]

    # Single decision tree for entire credit approval dataset.
    tree_credit = DecisionTree()
    logger.info('Commencing single Decision Tree for credit approval data')
    start_time = time.time()
    tree_credit.build_tree(data)
    end_time = time.time()
    logger.info('time_lapsed: {:0.4f}'.format(end_time - start_time))
    tree_credit.drawtree(jpeg=os.path.join(output_root, 'singletree.png'))

    # Random forest of decision trees for the credit approval dataset.
    logger.info('Commencing single Random Forest for credit approval data')
    forest = RandomForest(num_trees)
    start_time = time.time()
    forest.build_forest(data, output_path=output_root, seed=random_seed)
    end_time = time.time()
    logger.info('Time to build forest: {:0.4f}'.format(end_time - start_time))

    evaluate(test_points, forest, testdata)
def main():
    """Train a random forest on the training sheet, score it, and save it.

    Reads ``train_data.xlsx`` and ``validate_data.xlsx`` from
    ``input_filepath``, builds a 20-tree random forest of entropy-based
    decision trees, runs ``get_model_accuracy`` on the validation frame and
    saves the forest as ``model_00.npy`` under ``output_filepath``.

    N.B. Last DataFrame Column contains labels

    NOTE(review): ``input_filepath`` and ``output_filepath`` are not
    parameters or locals of this function; they are expected to be bound
    in an enclosing/module scope -- confirm where they are defined.
    """
    logger = logging.getLogger(__name__)

    logger.debug('read data')
    dframe_train = pd.read_excel(
        os.path.join(input_filepath, "train_data.xlsx"), index_col=0)

    logger.debug('train model')
    # Create single tree: the template decision tree used by the forest.
    # NOTE(review): an earlier revision carried a commented-out
    # `max_depth = 8` here; no depth is passed, so the class default applies.
    d_t = DecisionTree(metrics='entropy')

    # Create random forest with trees d_t.
    r_f = RandomForest(decision_tree_type=d_t, n_trees=20)
    # build_forest's return value replaces the forest object, so it must
    # return the object that is later scored and saved.
    r_f = r_f.build_forest(
        dframe_train, n_selected_features="best", sample_ratio=.8)

    # Get model accuracy on validation data.
    logger.debug('get model accuracy')
    dframe_val = pd.read_excel(
        os.path.join(input_filepath, "validate_data.xlsx"), index_col=0)
    # The result was never used by the original code; the call is kept for
    # whatever reporting get_model_accuracy performs itself.
    r_f.get_model_accuracy(dframe_val.columns.values.tolist(), dframe_val)

    logger.debug('save model')
    save_model(output_filepath, "model_00.npy", r_f)