Beispiel #1
0
def main(cl_args=None):
    """Run the classification app: one decision tree, then a random forest.

    Parses the command line, configures logging, makes sure the output
    directory exists, and then either runs the decision-tree unit tests
    (and exits) or trains a single tree and a forest on the training data
    and evaluates the forest on the test data.

    Args:
        cl_args: List of command-line argument strings. When ``None`` (the
            default) ``sys.argv[1:]`` is read at call time, so changes made
            to ``sys.argv`` after this module was imported are honoured.

    Raises:
        SystemExit: When the ``run_tests`` option is set. The exit status is
            0 if every test passed and 1 otherwise.
    """
    if cl_args is None:
        cl_args = sys.argv[1:]
    args = parse_command_line_args(cl_args=cl_args)

    datafile = os.path.realpath(args["data_path"])
    testfile = os.path.realpath(args["test_path"])
    output_root = os.path.realpath(args["output_path"])
    random_seed = args["random_seed"]
    num_trees = args["num_trees"]

    # Configure the logger; the verbose flag switches INFO to DEBUG.
    info_level = "DEBUG" if args["verbose"] else "INFO"
    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
                        level=info_level)

    if not os.path.exists(output_root):
        os.makedirs(output_root)

    # Unit tests for the decision tree. Handled before any dataset is read
    # because the test run does not use the loaded data.
    if args["run_tests"]:
        logger.info("Invoking test case for decision trees")
        suite = unittest.TestLoader().loadTestsFromTestCase(TestDecisionTree)
        result = unittest.TextTestRunner().run(suite)
        # Report failures through the process exit status instead of always
        # exiting with 0.
        sys.exit(0 if result.wasSuccessful() else 1)

    # Annotated training data.
    data = fileio(datafile)
    # Annotated test data.
    testdata = fileio(testfile)

    # Test points are the test rows with their final element removed
    # (presumably the label column -- confirm against fileio's output).
    test_points = [line[:-1] for line in testdata]

    # Single decision tree for the entire credit approval dataset.
    tree_credit = DecisionTree()
    logger.info('Commencing single Decision Tree for credit approval data')
    start_time = time.time()
    tree_credit.build_tree(data)
    end_time = time.time()
    logger.info('time_lapsed: {:0.4f}'.format(end_time - start_time))
    tree_credit.drawtree(jpeg=os.path.join(output_root, 'singletree.png'))

    # Random forest of decision trees for the credit approval dataset.
    logger.info('Commencing single Random Forest for credit approval data')
    forest = RandomForest(num_trees)
    start_time = time.time()
    forest.build_forest(data, output_path=output_root, seed=random_seed)
    end_time = time.time()
    logger.info('Time to build forest: {:0.4f}'.format(end_time - start_time))

    evaluate(test_points, forest, testdata)
Beispiel #2
0
def main():
    """Train a random forest on the training sheet, score it, and save it.

    Reads ``train_data.xlsx`` and ``validate_data.xlsx`` from
    ``input_filepath`` and writes the fitted model under ``output_filepath``.

    N.B. The last DataFrame column contains the labels.
    """
    log = logging.getLogger(__name__)

    log.debug('read data')
    train_path = os.path.join(input_filepath, "train_data.xlsx")
    train_frame = pd.read_excel(train_path, index_col=0)

    log.debug('train model')
    # Template decision tree handed to the forest.
    base_tree = DecisionTree(metrics='entropy')

    # Random forest built from trees of the template's type.
    forest = RandomForest(decision_tree_type=base_tree, n_trees=20)
    # build_forest's return value replaces the forest object from here on.
    forest = forest.build_forest(
        train_frame,
        n_selected_features="best",
        sample_ratio=.8,
    )

    # Model accuracy on the validation data.
    log.debug('get model accuracy')
    validation_path = os.path.join(input_filepath, "validate_data.xlsx")
    validation_frame = pd.read_excel(validation_path, index_col=0)
    column_names = validation_frame.columns.values.tolist()
    forest.get_model_accuracy(column_names, validation_frame)

    log.debug('save model')
    save_model(output_filepath, "model_00.npy", forest)