def runRandomForest(trainDataParam, testDataParam):
    """Grid-search RandomForestClassifier hyper-parameters via cross-validation.

    For every combination of n_estimators, criterion, max_depth,
    max_leaf_nodes, bootstrap and min_weight_fraction_leaf, a fresh
    classifier is built and scored with 10-fold cross-validation.

    Parameters
    ----------
    trainDataParam, testDataParam :
        Training / test splits forwarded unchanged to
        perform_cross_validation (their format is defined by that helper,
        which is not visible in this chunk).

    Returns
    -------
    dict
        Maps an underscore-joined parameter-combo string (with a trailing
        '_') to the result tuple returned by perform_cross_validation.
    """
    import itertools  # local import so the file's top-level import block stays untouched

    res_combo_dict = {}

    # Hyper-parameter grid.  min_weight_fraction_leaf may not exceed 0.5
    # (scikit-learn constraint noted in the original comment), hence the
    # list stops at 0.5.
    n_estimators_list = [75, 80, 85]
    criterion_list = ['gini', 'entropy']
    max_depth_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                     55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
    max_leaf_nodes_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
                          55, 60, 65, 70, 75, 80, 85, 90, 95, 100, None]
    bootstrap_list = [True, False]
    min_weight_fraction_leaf_list = [0.1, 0.2, 0.3, 0.4, 0.5]

    # itertools.product replaces the original six-level nested for-loop;
    # iteration order over combinations is identical.
    for eti, crit, max_depth_, max_leaf, bootstrap_, mwfratleaf in itertools.product(
            n_estimators_list, criterion_list, max_depth_list,
            max_leaf_nodes_list, bootstrap_list, min_weight_fraction_leaf_list):
        # Display the combination under test.
        print("##########")
        print("n_estimators={}, criterion={}, max_dept={}, max_leaf_nodes={}".format(
            eti, crit, max_depth_, max_leaf))
        print("bootstrap={}, min-wt-frac={}".format(bootstrap_, mwfratleaf))

        # Result-dict key: every value joined with '_' plus a trailing '_'
        # (identical to the original two-part string concatenation).
        key_for_dict = "_".join(
            str(v) for v in (eti, crit, max_depth_, max_leaf, bootstrap_, mwfratleaf)) + "_"

        # Time model construction + cross-validation together.
        # NOTE(review): the source chunk is whitespace-mangled, so the exact
        # extent of the `with` body is a best-effort reconstruction -- confirm
        # against the pristine file.
        with IO_.duration():
            theRndForestModel = RandomForestClassifier(
                n_estimators=eti, criterion=crit, max_depth=max_depth_,
                min_weight_fraction_leaf=mwfratleaf, max_leaf_nodes=max_leaf,
                bootstrap=bootstrap_)
            res_tuple = perform_cross_validation(
                theRndForestModel, trainDataParam, testDataParam, 10)
        res_combo_dict[key_for_dict] = res_tuple
        print("##########")
    return res_combo_dict
# NOTE(review): this chunk was whitespace-mangled onto one physical line; the
# formatting below is reconstructed.  It opens mid-statement:
# `max_leaf_nodes=10000)` is the tail of a model-constructor call whose
# opening (and its enclosing function definition) lies outside this view, so
# the statements up to the A12 comparisons presumably sit inside that unseen
# function/loop -- TODO confirm indentation against the pristine file.
max_leaf_nodes=10000)
# Score the second model (presumably the default-parameter baseline -- verify
# against the unseen caller) and record its MAE and wall-clock time.
mae_for_param_combo_2 = perform_cross_validation( the_Model_2, trainingData, testData, cv_param)[1]
t2 = time.time()
time_for_param_comb_2 = t2 - t1
mae_list_2.append(mae_for_param_combo_2)
time_list_2.append(time_for_param_comb_2)
# Effect-size comparison of the two result lists; doSlowA12 presumably
# computes the Vargha-Delaney A12 measure -- confirm in a12_utility.
mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2)
time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1)
# Python 2 print statements (message typos kept verbatim: runtime strings).
print "MAE comaprison: is default worse than 'best combo' ?", mae_a12_
print "time comaprison: is 'best' combo slower than default ?", time_a12_

# --- module-level driver: run each learner's experiment on the dataset ---
datasetFileName = "13_NonZeroDataset_Aggolo.csv"
iterations = 10000
cv_param = 5
print "========== Random Forest =========="
with IO_.duration(): runRFTest(datasetFileName, iterations, cv_param)
print "========== KNN =========="
with IO_.duration(): runknnTest(datasetFileName, iterations, cv_param)
print "========== SVM =========="
with IO_.duration(): runsvmTest(datasetFileName, iterations, cv_param)
print "========== CART =========="
with IO_.duration(): runCARTTest(datasetFileName, iterations, cv_param)
# NOTE(review): whitespace-mangled chunk, formatting reconstructed.  It is a
# near-duplicate of the preceding chunk and starts mid-function: the `def` of
# the routine containing these statements is outside this view, so the
# indentation of the leading statements is a best-effort guess -- TODO confirm.
# Score the second model (presumably the default-parameter baseline) and
# record its MAE and wall-clock time.
mae_for_param_combo_2 = perform_cross_validation(the_Model_2, trainingData, testData, cv_param)[1]
t2 = time.time()
time_for_param_comb_2 = t2 - t1
mae_list_2.append(mae_for_param_combo_2)
time_list_2.append(time_for_param_comb_2)
# Effect-size comparison; doSlowA12 presumably computes the Vargha-Delaney
# A12 measure -- confirm in a12_utility.
mae_a12_ = a12_utility.doSlowA12(mae_list_1, mae_list_2)
time_a12_ = a12_utility.doSlowA12(time_list_2, time_list_1)
# Python 2 print statements (message typos kept verbatim: runtime strings).
print "MAE comaprison: is default worse than 'best combo' ?", mae_a12_
print "time comaprison: is 'best' combo slower than default ?", time_a12_

# --- module-level driver: run each learner's experiment on the dataset ---
datasetFileName="13_NonZeroDataset_Aggolo.csv"
iterations=10000
cv_param = 5
print "========== Random Forest =========="
with IO_.duration(): runRFTest(datasetFileName, iterations, cv_param)
print "========== KNN =========="
with IO_.duration(): runknnTest(datasetFileName, iterations, cv_param)
print "========== SVM =========="
with IO_.duration(): runsvmTest(datasetFileName, iterations, cv_param)
print "========== CART =========="
with IO_.duration(): runCARTTest(datasetFileName, iterations, cv_param)
def runRandomForest(trainDataParam, testDataParam):
    """Exhaustively grid-search RandomForestClassifier over ten hyper-parameters.

    Every combination of n_estimators, criterion, max_features, max_depth,
    max_leaf_nodes, bootstrap, min_samples_split, oob_score,
    min_weight_fraction_leaf and warm_start is built and scored with
    2-fold cross-validation (n_jobs=-1 uses all cores; not part of the
    experiment, per the original comment).

    Parameters
    ----------
    trainDataParam, testDataParam :
        Training / test splits forwarded unchanged to
        perform_cross_validation (format defined by that helper, not
        visible here).

    Returns
    -------
    dict
        Maps an underscore-joined parameter-combo string to the result
        tuple returned by perform_cross_validation.
    """
    import itertools  # local import so the file's top-level import block stays untouched

    res_combo_dict = {}

    # Hyper-parameter grid.  Per the original comments: max_leaf_nodes and
    # min_samples_split should stay <= 549 (only 549 legit samples in the
    # dataset); min_weight_fraction_leaf must lie in [0.0, 0.5].
    # NOTE(review): min_samples_split=1 is rejected by modern scikit-learn
    # (must be >= 2); kept as-is since changing it would alter the explored
    # grid -- confirm against the sklearn version in use.
    n_estimators_list = [500]
    criterion_list = ['gini', 'entropy']
    max_features_list = ['auto', 'sqrt', 'log2', None]
    max_depth_list = [5, 15, 50, None]
    max_leaf_nodes_list = [None, 25, 50, 75]
    bootstrap_list = [True, False]
    min_samples_split_list = [1, 25, 50, 100]
    oob_score_list = [True, False]
    min_weight_fraction_leaf_list = [0.0, 0.2, 0.3, 0.4]
    warm_start_list = [True, False]

    # Outer seven parameters; itertools.product replaces the original
    # nested for-loops with identical iteration order.
    for eti, crit, maxfeat, max_depth_, max_leaf, bootstrap_, min_sample in itertools.product(
            n_estimators_list, criterion_list, max_features_list,
            max_depth_list, max_leaf_nodes_list, bootstrap_list,
            min_samples_split_list):
        # BUGFIX: out-of-bag scoring requires bootstrap=True, so restrict
        # the oob options to [False] when bootstrap is off.  The original
        # mutated oob_score_list itself to [False, False], which (a) ran
        # every bootstrap=False combination twice under the same dict key
        # and (b) permanently clobbered the list, so oob_score=True was
        # never explored again once bootstrap=False had been seen.
        oob_options = oob_score_list if bootstrap_ else [False]

        for oob_, mwfratleaf, warm_start_ in itertools.product(
                oob_options, min_weight_fraction_leaf_list, warm_start_list):
            # Display the combination under test.
            print("##########")
            print("n_estimators={}, criterion={}, max_features={}, max_dept={}, max_leaf_nodes={}".format(
                eti, crit, maxfeat, max_depth_, max_leaf))
            print("bootstrap={}, min-sample-split={}, oob_score={}, min-wt-frac={}, warm-start={}".format(
                bootstrap_, min_sample, oob_, mwfratleaf, warm_start_))

            # Result-dict key: all ten values joined with '_' (identical to
            # the original two-part string concatenation, no trailing '_').
            key_for_dict = "_".join(str(v) for v in (
                eti, crit, maxfeat, max_depth_, max_leaf,
                bootstrap_, min_sample, oob_, mwfratleaf, warm_start_))

            # Time model construction + cross-validation together.
            # NOTE(review): the source chunk is whitespace-mangled, so the
            # exact extent of the `with` body is a best-effort
            # reconstruction -- confirm against the pristine file.
            with IO_.duration():
                theRndForestModel = RandomForestClassifier(
                    n_estimators=eti, criterion=crit, max_depth=max_depth_,
                    min_samples_split=min_sample, max_features=maxfeat,
                    min_weight_fraction_leaf=mwfratleaf,
                    max_leaf_nodes=max_leaf, bootstrap=bootstrap_,
                    oob_score=oob_, n_jobs=-1, warm_start=warm_start_)
                res_tuple = perform_cross_validation(
                    theRndForestModel, trainDataParam, testDataParam, 2)
            res_combo_dict[key_for_dict] = res_tuple
            print("##########")
    return res_combo_dict