def recursive_tree_exploration(feature_set, past_accuracy, past_model):
    last_added = feature_set[-1]
    new_feature_set = copy.copy(feature_set)
    # Append a placeholder (0) so the pop/append pair inside the loop can
    # overwrite the last position on every iteration.
    new_feature_set.append(0)
    max_accuracy = past_accuracy
    best_set = feature_set
    best_model = past_model
    # Only try features with a higher index than the last one added, so each
    # subset is explored exactly once.
    for feature in xrange(last_added + 1, feature_amount):
        new_feature_set.pop()
        new_feature_set.append(feature)
        new_x_norm, y, _, _ = getDataSubSet(new_feature_set)
        model, calc_best_model, calc_accuracy = manual_cross_validation(
            new_x_norm, y, models, names, True)
        if calc_accuracy > max_accuracy:
            max_accuracy = calc_accuracy
            # Copy; otherwise best_set would keep mutating along with
            # new_feature_set on later iterations.
            best_set = copy.copy(new_feature_set)
            best_model = calc_best_model
        # Recurse to explore the supersets of the current candidate set.
        calc_accuracy, calc_best_set, calc_best_model = recursive_tree_exploration(
            new_feature_set, max_accuracy, calc_best_model)
        if calc_accuracy > max_accuracy:
            max_accuracy = calc_accuracy
            best_set = calc_best_set
            best_model = calc_best_model
    print best_set
    print "BEST SET ACCURACY: " + str(max_accuracy)
    print "BEST SET MODEL: " + str(best_model)
    return max_accuracy, best_set, best_model
def tree_selection_heuristic():
    # The feature set that is going to be evaluated; [0] is a placeholder
    # that the pop/append pair overwrites on every iteration.
    feature_set = [0]
    max_accuracy = 0.0
    best_set = []
    best_model = None
    # For every one-feature set, apply the recursive search over its supersets.
    for i in xrange(1, feature_amount):
        feature_set.pop()
        feature_set.append(i)
        new_x_norm, y, _, _ = getDataSubSet(feature_set)
        # Initial evaluation for the recursive call
        model, calc_best_model, calc_accuracy = manual_cross_validation(
            new_x_norm, y, models, names, True)
        # Recursive call
        calc_accuracy, calc_best_set, calc_best_model = recursive_tree_exploration(
            feature_set, calc_accuracy, calc_best_model)
        if calc_accuracy > max_accuracy:
            max_accuracy = calc_accuracy
            best_set = calc_best_set
            best_model = calc_best_model
    print "-----------------------------------"
    print "---------BEST FEATURE SET----------"
    print best_set
    print "-----------------------------------"
    print "-------------ACCURACY--------------"
    print max_accuracy
    return best_model
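# A minimal usage sketch, assuming the module-level names the two functions
# above rely on (feature_amount, models, names, getDataSubSet,
# manual_cross_validation) are defined as in the rest of this repo. The
# search is a depth-first walk of the feature-subset tree: every recursive
# step only adds features with a higher index than the last one added, so
# each subset is visited at most once.
import copy  # required by recursive_tree_exploration

best_model = tree_selection_heuristic()
print best_model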
# n: number of candidate features; `set`: list of their column indices
# (both defined above; note that the name `set` shadows the builtin).
size = pow(2, n)
# Vars to select the best suited model
bestModel = None
bestName = "none"
bestMean = 0.0
bestSet = []
# Enumerate every non-empty subset of the n candidate features: bit jj of
# the counter ii decides whether feature jj belongs to the current subset.
for ii in xrange(size):
    subset = []
    for jj in xrange(n):
        if (ii & (1 << jj)) > 0:
            subset.append(set[jj])
    print subset
    if len(subset) > 0:
        try:
            X, y, X_test, Y_test = getDataSubSet(subset)
            scaler = preprocessing.MinMaxScaler()
            scaler.fit(X)
            x_norm = scaler.transform(X)
            # Initialize models
            forest = RandomForestClassifier(n_estimators=100, max_depth=20,
                                            random_state=111)
            gdBoost = GradientBoostingClassifier(random_state=111)
            mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                                hidden_layer_sizes=(10, 2), random_state=111)
            models = [forest, gdBoost, mlp]
            names = ["Random Forest", "Gradient Boosting",
                     "MultiLayer Perceptrons"]
            ## Using all the features of the current subset
            model, name, mean = manual_cross_validation(x_norm, y, models,
                                                        names, True)
            if mean > bestMean:
                bestMean = mean
                bestModel = model
                bestName = name
                bestSet = subset
        except Exception as e:
            print e
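# Why the bitmask walk visits every subset exactly once: each integer
# 0 <= ii < 2^n has a unique binary representation, and bit jj selects
# feature jj. A toy run with n = 3 (names here are illustrative only):
toy = ['a', 'b', 'c']
for ii in xrange(pow(2, len(toy))):
    print [toy[jj] for jj in xrange(len(toy)) if ii & (1 << jj)]
# Prints [], ['a'], ['b'], ['a', 'b'], ['c'], ...: all 2^3 = 8 subsets.
# The cost grows as 2^n; with n = 14 that is already 16383 non-empty
# subsets, each triggering a full cross-validation of three models.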
import json

from sklearn import metrics as m
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2

from GetDataSet import getDataSubSet
from Validation import manual_cross_validation

X, y, X_test, Y_test = getDataSubSet(
    [3, 5, 34, 35, 38, 39, 40, 42, 51, 52, 56, 60, 62, 99])

## Count the 'Easy'-labeled instances on each split. This keeps track of the
# class distribution of the dataset and the parameters of the experiment.
easyCount = 0
for i in xrange(len(Y_test)):
    if Y_test[i] == "Easy":
        easyCount += 1
print("Ratio of Easy over all on testing set: %0.2f"
      % ((easyCount + 0.0) / len(Y_test)))

easyCount = 0
for i in xrange(len(y)):
    if y[i] == "Easy":
        easyCount += 1
print("Ratio of Easy over all on training set: %0.2f"
      % ((easyCount + 0.0) / len(y)))
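# The feature-selection imports above (SelectKBest, chi2, VarianceThreshold,
# RFECV) suggest filter-style selection as an alternative to the search
# heuristics. A minimal sketch, assuming the features are non-negative after
# MinMax scaling (chi2 requires non-negative inputs); k=10 is an arbitrary
# choice for illustration:
scaler = preprocessing.MinMaxScaler()
x_norm = scaler.fit_transform(X)
selector = SelectKBest(chi2, k=10)
x_selected = selector.fit_transform(x_norm, y)
print x_selected.shape  # (n_samples, 10)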
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier

from GetDataSet import getDataSubSet

# Candidate lists defining the different sets of features that will be used
#features = [1,2,3,4,5,6,7,44]+[x for x in xrange(8,15)]+[31,32,33,34,45,50]+[c for c in xrange(35,44)]+[56,57,58,60,65,67,68,84]
#features = [3, 5, 34, 35, 38, 39, 40, 42, 51, 52, 56, 60, 62, 99]
features = [x for x in range(100, 300)]
print features

# Vars to select the best suited model
bestModel = None
bestName = "none"
bestMean = 0.0
bestSet = []

X, y, X_test, Y_test = getDataSubSet(features)
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
x_norm = scaler.transform(X)

# Initialize models
#forest = RandomForestClassifier(n_estimators=9100, max_depth=300, random_state=111)
gdBoost = GradientBoostingClassifier(random_state=111)
#mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 2), random_state=111)
#models = [forest, gdBoost, mlp]
#names = ["Random Forest", "Gradient Boosting", "MultiLayer Perceptrons"]

## Using all the features
#model, name, mean = manual_cross_validation(x_norm, y, models, names, True)
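# SMOTE is imported above but not yet wired in. A minimal sketch of how the
# training split could be rebalanced before cross-validation, given the class
# skew toward "Easy" measured earlier; fit_resample is the modern imblearn
# API (older releases called it fit_sample):
smote = SMOTE(random_state=111)
x_resampled, y_resampled = smote.fit_resample(x_norm, y)
# Only the training data is resampled; X_test / Y_test stay untouched so the
# evaluation still reflects the real class distribution.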