def genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR=0.95, l=1):
    """Evolve a population of decision trees and return the best one found.

    The population is seeded from a random forest: each estimator is converted
    to a ``ClassificationTree``, locally optimized with TAO, and then improved
    for ``n_iter`` generations via mutation + TAO refinement, keeping a trial
    whenever its regularized loss beats its parent's.

    Parameters
    ----------
    n_trees : int       -- population size (forest ``n_estimators``).
    n_iter : int        -- number of evolutionary generations.
    depth : int         -- maximum tree depth.
    X, labels           -- training data and targets.
    oblique : bool      -- whether the trees use oblique (linear) splits.
    X_valid, y_valid    -- validation data used inside ``regularized_loss``.
    CR : float          -- crossover/mutation rate passed to ``mutation``.
    l : float           -- regularization weight for ``regularized_loss``.

    Returns
    -------
    (best_tree, best_loss) : the tree with the lowest regularized loss seen
    during evolution and that loss. ``best_tree`` is ``None`` if ``n_iter == 0``
    (the final diagnostic prints would then fail).
    """
    # Seed population: fit a forest and convert each estimator.
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                 random_state=0, min_samples_leaf=4)
    clf.fit(X, labels)

    trees = []
    for cart_tree in clf.estimators_:
        T = ClassificationTree(oblique=oblique)
        T.initialize_from_CART(X, labels, cart_tree)
        tao_opt(T, X, labels)  # local TAO refinement of the seed tree
        trees.append(T)
        ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), T.tree[0], oblique)

    best_loss = np.inf
    best_tree = None
    for i in range(n_iter):
        print("Iter: ", i)
        # Iterate by index: the winner of each parent/trial comparison must be
        # written back into ``trees`` so later generations evolve from it.
        for j, tree in enumerate(trees):
            trial = mutation(tree, trees, CR, X, labels, depth)
            tao_opt(trial, X, labels)
            trial_loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid,
                                          range(len(labels)), oblique, l=l)
            loss = regularized_loss(tree.tree[0], X, labels, X_valid, y_valid,
                                    range(len(labels)), oblique, l=l)
            if trial_loss < loss:
                # BUG FIX: the original did ``tree = trial``, which only rebound
                # the loop variable — the improved trial was silently discarded.
                # Persist it in the population and carry its loss forward so the
                # best-tracking below compares the accepted candidate, not the
                # stale parent.
                trees[j] = trial
                tree = trial
                loss = trial_loss
            if loss < best_loss:
                best_loss = loss
                best_tree = tree

    print("best loss: ", best_loss)
    print("loss train best: ", 1 - ClassificationTree.misclassification_loss(
        best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("loss valid: ", 1 - ClassificationTree.misclassification_loss(
        best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    print("ritorno loss train best: ", 1 - ClassificationTree.misclassification_loss(
        best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("ritono loss valid: ", 1 - ClassificationTree.misclassification_loss(
        best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    return best_tree, best_loss
# NOTE(review): flattened script fragment. The ``'''`` below opens a
# triple-quoted block that does not close anywhere in this view, and the next
# chunk begins mid-expression ("node %s." % ...), so everything from the
# ``'''`` onward appears to be commented-out experiment code whose closing
# quote lies outside this excerpt. Left byte-identical — TODO confirm against
# the full file before touching this region.
X_valid = data[valid_id:]
y_valid = label[valid_id:]
''' X_train, X_valid, y_train, y_valid = train_test_split(data, label, stratify=label, test_size=0.2) clf = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=10) clf.fit(X_train, y_train) T = ClassificationTree(oblique=False) T.initialize_from_CART(X_train, y_train, clf) T.compute_prob(X_train, y_train) cart_auc_train += T.auc(X_train, y_train) cart_auc_valid += T.auc(X_valid, y_valid) #tao_train_score+=1-T.misclassification_loss(X_train, y_train, T.tree[0]) #print ("score before: ", tao_train_score) #x = data[8] #print (T.predict_label(x.reshape((1, -1)), 0)) #print (clf.predict(x.reshape((1, -1)))) #print ("x--->", x) #print(T.get_path_to(x, 0)) #T.print_tree_structure() #print ("T acc -> ", 1-T.misclassification_loss(data, label, T.tree[0])) #print ("clf acc -> ", clf.score(data, label)) #node_id = 4
"node %s." % ( actual_node.depth * "\t", actual_node.id, actual_node.parent_id, actual_node.left_node_id, actual_node.feature, actual_node.threshold, actual_node.right_node_id, )) stack.append(actual_node.left_node) stack.append(actual_node.right_node) spear_train += 1 - zero_one_loss(node, X, labels) / len(labels) spear_valid += 1 - zero_one_loss(node, X_valid, y_valid) / len(y_valid) clf = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=4) clf.fit(X, labels) clf_train += clf.score(X, labels) clf_valid += clf.score(X_valid, y_valid) L = ClassificationTree(oblique=False) L.initialize_from_CART(X, labels, clf) L.print_tree_structure() print("clf train: ", clf_train / 30) print("spearman train: ", spear_train / 30) print("clf valid: ", clf_valid / 30) print("spearman valid: ", spear_valid / 30)
def test(n_runs, ls_train, ls_test, svm_train, svm_test, random_train, random_test, cart_train, tao_train, global_train, cart_test, tao_test, global_test):
    """Run ``n_runs`` benchmark repetitions on the cancer dataset, comparing
    CART, TAO, SVM, random forest, and the genetic tree optimizer.

    Each ``*_train`` / ``*_test`` argument is a caller-owned list that this
    function appends one accuracy score to per run (the ``ls_*`` lists are
    currently unused because the LocalSearch section is commented out).
    Returns nothing; results accumulate in the passed-in lists.

    NOTE(review): reconstructed from a flattened one-line source — statement
    boundaries around ``#`` comments were inferred. Code tokens are unchanged.
    """
    for run in range(n_runs):
        # Fixed experiment configuration for every run.
        depth = 3
        oblique = False
        n_trees = 200
        n_iter = 5
        # assumes cancer_train.npy / cancer_label.npy exist in the CWD — TODO confirm
        data = np.load('cancer_train.npy')
        y = np.load('cancer_label.npy')
        print ("Run -> ", run)
        # Fresh random shuffle each run, then a 50/25/25 train/valid/test split.
        idx = np.random.permutation(len(data))
        data = data[idx]
        y = y[idx]
        train_split = 0.50
        valid_split = 0.75
        #data = dataset.data[idx]
        #label = dataset.target[idx]
        train_id = int(len(data)*train_split)
        valid_id = int(len(data)*valid_split)
        X = data[0:train_id]
        labels = y[0:train_id]
        X_valid = data[train_id:valid_id]
        y_valid = y[train_id:valid_id]
        X_test = data[valid_id:]
        y_test = y[valid_id:]
        #CART
        clf = DecisionTreeClassifier(random_state=0, max_depth=depth, min_samples_leaf=4)
        clf.fit(X, labels)
        #TAO -- refine the CART tree with alternating optimization
        T = ClassificationTree(oblique = oblique)
        T.initialize_from_CART(X, labels, clf)
        tao = TAO(T)
        tao.evolve(X, labels)
        T.print_tree_structure()
        #LS -- LocalSearch baseline, currently disabled
        '''
        L = ClassificationTree(oblique = oblique)
        L.initialize_from_CART(X, labels, clf)
        ls = LocalSearch(L)
        ls.evolve(X, labels, alfa=1000000, max_iteration=10)
        '''
        #SVM
        svm = LinearSVC(tol=1e-6, max_iter=10000, dual=False)
        svm.fit(X, labels)
        #RandomForest
        random_for = RandomForestClassifier(n_estimators = n_trees, max_depth=depth, random_state=0, min_samples_leaf= 4)
        random_for.fit(X, labels)
        #Genetic -- CR=0 and l=0 disable crossover and regularization here
        best_t, best_loss = genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR = 0, l = 0)
        #best_t.print_tree_structure()
        best_t.print_tree_structure()
        #Train Score -- accuracy is 1 - misclassification loss
        cart_train.append(clf.score(X, labels))
        #ls_train.append(1-ClassificationTree.misclassification_loss(L.tree[0], X, labels, range(len(labels)), oblique))
        tao_train.append(1-ClassificationTree.misclassification_loss(T.tree[0], X, labels, range(len(labels)), oblique))
        global_train.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X, labels, range(len(labels)), oblique))
        svm_train.append(svm.score(X, labels))
        random_train.append(random_for.score(X, labels))
        #Test Score
        cart_test.append(clf.score(X_test, y_test))
        #ls_test.append(1-ClassificationTree.misclassification_loss(L.tree[0], X_test, y_test, range(len(y_test)), oblique))
        tao_test.append(1-ClassificationTree.misclassification_loss(T.tree[0], X_test, y_test, range(len(y_test)), oblique))
        global_test.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X_test, y_test, range(len(y_test)), oblique))
        svm_test.append(svm.score(X_test, y_test))
        random_test.append(random_for.score(X_test, y_test))