def best_svm_split(self, node, X, y, alfa, complexity):
    """Evaluate an oblique (linear SVM) split for ``node``.

    A copy of ``node`` is obtained through ``TreeNode.copy_node`` and a
    ``LinearSVC`` is fitted on the points listed in its ``data_idxs``. The
    learned hyperplane is stored on the copy; if the copy is a leaf, two
    children are created for it as well.

    Args:
        node: Node to optimize. Its ``data_idxs`` select the rows of ``X``
            and ``y`` used to fit the SVM.
        X: Feature matrix, indexed as ``X[data_idxs]``.
        y: Labels, indexed as ``y[data_idxs]``.
        alfa: Weight of the complexity penalty added to the loss.
        complexity: Complexity term; one is added to it when a leaf is
            turned into an internal node.

    Returns:
        A ``(new_node, loss)`` tuple where ``loss`` is the
        misclassification loss of ``new_node`` on its own points plus
        ``alfa * (complexity + rho)``. ``loss`` is ``np.inf`` when no SVM
        can be fitted (no points, or every point has the same label).
    """
    T = self.classification_tree
    # Build a stand-in subtree rooted at a copy of the node, so the
    # candidate split is assembled on the copy.
    new_node = TreeNode.copy_node(node)
    data = X[new_node.data_idxs]
    label = y[new_node.data_idxs]

    # Nothing to separate: the node is empty or holds a single class.
    if len(data) == 0 or all(lbl == label[0] for lbl in label):
        return new_node, np.inf

    # One-vs-rest linear SVM. scikit-learn only supports the L1 penalty
    # with the primal formulation, hence dual=False.
    clf = LinearSVC(tol=1e-6, C=10, max_iter=10000, loss='squared_hinge',
                    penalty='l1', dual=False)
    clf.fit(data, label)

    # TODO: decide how to pick the best hyperplane when several are
    # trained. With more than two classes coef_ has one row per class and
    # the reshape below raises ValueError.
    weights = clf.coef_.reshape(len(X[0]))
    intercept = clf.intercept_

    # Store the hyperplane on the copy before (possibly) growing children,
    # mirroring how best_parallel_split sets feature/threshold first.
    new_node.weights = weights
    new_node.intercept = intercept

    rho = 0
    if new_node.is_leaf:
        # The children must be attached to the copy that is scored and
        # returned below. Attaching them to the original ``node`` would
        # leave the returned copy a leaf while still charging rho = 1.
        ClassificationTree.create_new_children(
            new_node, X, y, self.max_id, None, None,
            oblique=T.oblique, weights=weights, intercept=intercept)
        rho = 1

    loss = ClassificationTree.misclassification_loss(
        new_node, X, y, new_node.data_idxs, T.oblique)
    return new_node, loss + alfa * (complexity + rho)
def best_split(node, X, labels):
    """Find the best threshold for ``node`` along its current feature.

    The points in ``node.data_idxs`` are sorted by feature ``node.feature``
    and every candidate threshold is tried: one derived from the smallest
    value (``0.5 * min``), the midpoint of each pair of consecutive sorted
    values, and one derived from the largest value (``1.5 * max``). For
    each candidate the node's children are rebuilt with
    ``ClassificationTree.create_new_children`` and scored with
    ``zero_one_loss``. The first candidate with the strictly lowest loss
    wins.

    ``node`` is mutated while searching: on return its ``threshold`` and
    children correspond to the last candidate tried, not to the best one.
    Callers have to apply the returned values themselves.

    Args:
        node: Node whose ``feature``, ``data_idxs`` and ``id`` are read.
        X: Feature matrix, indexed as ``X[point_idx, feature]``.
        labels: Labels forwarded to the child-creation and loss helpers.

    Returns:
        A ``(best_threshold, best_left, best_right)`` tuple holding the
        winning threshold and the two child nodes built for it.

    Raises:
        IndexError: If ``node.data_idxs`` is empty.
    """
    j = node.feature
    # dict.fromkeys drops repeated indices while keeping first-seen order,
    # which is what the sort's stability then preserves on ties.
    point_idxs = dict.fromkeys(node.data_idxs)
    sorted_indexes = sorted(point_idxs, key=lambda idx: X[idx, j])
    feature_vals = [X[idx, j] for idx in sorted_indexes]

    # NOTE(review): 0.5 * min and 1.5 * max only fall outside the observed
    # range when the values are positive -- confirm the data is scaled
    # accordingly.
    candidates = [0.5 * feature_vals[0]]
    candidates.extend(
        0.5 * (lo + hi) for lo, hi in zip(feature_vals, feature_vals[1:]))
    candidates.append(1.5 * feature_vals[-1])

    error_best = np.inf
    for thresh in candidates:
        node.threshold = thresh
        ClassificationTree.create_new_children(
            node, X, labels, node.id, j, thresh, False)
        actual_loss = zero_one_loss(node, X, labels)
        # The first candidate (0.5 * min) takes part in this comparison
        # too, instead of having its loss discarded before the scan.
        if actual_loss < error_best:
            error_best = actual_loss
            best_t = thresh
            best_left = node.left_node
            best_right = node.right_node
    return best_t, best_left, best_right
def best_parallel_split(self, node, X, y, alfa, complexity):
    """Search every feature for the best axis-parallel split of ``node``.

    A copy of ``node`` is obtained through ``TreeNode.copy_node``. For each
    feature the node's points are sorted by that feature and every
    candidate threshold is tried: ``0.5 * min``, the midpoint of each pair
    of consecutive sorted values, and ``1.5 * max``. Each candidate is
    scored with ``ClassificationTree.misclassification_loss`` plus the
    penalty ``alfa * (complexity + rho)``, where ``rho`` is 1 when the
    node was a leaf and 0 otherwise. When the node was a leaf, children
    are created for every candidate before scoring.

    Args:
        node: Node to optimize; its ``data_idxs`` list the points used.
        X: Feature matrix, indexed as ``X[point_idx, feature]``.
        y: Labels forwarded to the child-creation and loss helpers.
        alfa: Weight of the complexity penalty.
        complexity: Complexity term the penalty is computed from.

    Returns:
        A ``(new_node, error_best)`` tuple. ``new_node`` carries the best
        feature and threshold found (and, if it was a leaf, children built
        for that split). ``error_best`` is the penalized loss of that
        split, or ``np.inf`` when no candidate was evaluated.

    Side effects:
        ``self.max_id`` grows by 2 when a leaf was turned into a split.
    """
    T = self.classification_tree
    # Build a stand-in subtree rooted at a copy of the node.
    new_node = TreeNode.copy_node(node)
    was_leaf = bool(new_node.is_leaf)
    rho = 1 if was_leaf else 0
    penalty = alfa * (complexity + rho)

    error_best = np.inf
    improve = False
    best_t = None
    best_feature = None

    if new_node.data_idxs:
        # dict.fromkeys drops repeated indices while keeping first-seen
        # order, which the stable sort below then preserves on ties.
        point_idxs = list(dict.fromkeys(new_node.data_idxs))
        n_points = len(point_idxs)

        for j in range(len(X[0])):
            # Sort the node's points by their j-th component.
            sorted_indexes = sorted(point_idxs, key=lambda idx: X[idx, j])
            new_node.feature = j

            # Try every possible split along this component. i == -1 is
            # the candidate derived from the smallest value; the elif
            # chain keeps it from being overwritten by the midpoint branch
            # (where index -1 would silently address the largest value).
            # NOTE(review): 0.5 * min and 1.5 * max only fall outside the
            # observed range when the values are positive -- confirm.
            for i in range(-1, n_points):
                if i < 0:
                    thresh = 0.5 * X[sorted_indexes[0], j]
                elif i < n_points - 1:
                    thresh = 0.5 * (X[sorted_indexes[i], j]
                                    + X[sorted_indexes[i + 1], j])
                else:
                    thresh = 1.5 * X[sorted_indexes[i], j]
                new_node.threshold = thresh

                # A former leaf needs child leaves for the candidate; per
                # the original note they are fitted by majority right away.
                if was_leaf:
                    ClassificationTree.create_new_children(
                        new_node, X, y, self.max_id, j, thresh,
                        oblique=T.oblique)

                actual_loss = ClassificationTree.misclassification_loss(
                    new_node, X, y, sorted_indexes, T.oblique) + penalty

                if actual_loss < error_best:
                    improve = True
                    error_best = actual_loss
                    best_t = thresh
                    best_feature = j

    if improve:
        new_node.threshold = best_t
        new_node.feature = best_feature
        if was_leaf:
            # The children currently attached belong to the last candidate
            # tried; rebuild them for the winning split so the returned
            # node matches error_best.
            ClassificationTree.create_new_children(
                new_node, X, y, self.max_id, best_feature, best_t,
                oblique=T.oblique)
            self.max_id += 2
    return new_node, error_best