Example #1
 def best_svm_split(self, node, X, y, alfa, complexity):
     T = self.classification_tree
     # Create a new fictitious subtree rooted at this node
     new_node = TreeNode.copy_node(node)
     # rho charges one extra unit of complexity if splitting this leaf adds a node
     rho = 0
     data = X[new_node.data_idxs]
     label = y[new_node.data_idxs]
     if len(data) > 0 and not all(i == label[0] for i in label):
         # Train a one-vs-rest multiclass linear SVM on the points reaching this node
         clf = LinearSVC(tol=1e-6,
                         C=10,
                         max_iter=10000,
                         loss='squared_hinge',
                         penalty='l1',
                         dual=False)
         clf.fit(data, label)
         # TODO: figure out how to pick the best coefficients among the trained
         # hyperplanes; in the binary case coef_ holds a single hyperplane
         weights = clf.coef_.reshape(len(X[0]))
         intercept = clf.intercept_
         if new_node.is_leaf:
             ClassificationTree.create_new_children(node,
                                                    X,
                                                    y,
                                                    self.max_id,
                                                    None,
                                                    None,
                                                    oblique=T.oblique,
                                                    weights=weights,
                                                    intercept=intercept)
             rho = 1
         else:
             new_node.weights = weights
             new_node.intercept = intercept
         return new_node, ClassificationTree.misclassification_loss(
             new_node, X, y, new_node.data_idxs,
             T.oblique) + alfa * (complexity + rho)
     return new_node, np.inf
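`best_svm_split` reuses the coefficients of an L1-penalized `LinearSVC` as an oblique split for the node. Below is a minimal self-contained sketch of that idea on synthetic data, assuming the binary case so `coef_` holds a single hyperplane; the routing test is illustrative and not the project's `ClassificationTree` API.

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = (X[:, 0] + X[:, 1] > 0).astype(int)   # linearly separable synthetic labels

# Same estimator settings as best_svm_split: sparse (L1) squared-hinge SVM
clf = LinearSVC(tol=1e-6, C=10, max_iter=10000,
                loss='squared_hinge', penalty='l1', dual=False)
clf.fit(X, y)

weights = clf.coef_.reshape(X.shape[1])   # binary case: one hyperplane
intercept = clf.intercept_[0]

# Oblique split: route a point left when w·x + b <= 0, right otherwise
goes_left = X @ weights + intercept <= 0
print("left:", int(goes_left.sum()), "right:", int((~goes_left).sum()))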
Example #2
def best_split(node, X, labels):
    error_best = np.inf
    j = node.feature

    vals = {}
    for point_idx in node.data_idxs:
        vals[point_idx] = X[point_idx, j]

    values = sorted(vals.items(), key=lambda x: x[1])
    sorted_indexes = [idx for idx, _ in values]

    # First candidate: a threshold below the smallest value of feature j
    thresh = 0.5 * X[sorted_indexes[0], j]
    node.threshold = thresh
    ClassificationTree.create_new_children(node, X, labels, node.id, j, thresh,
                                           False)

    actual_loss = zero_one_loss(node, X, labels)
    if actual_loss < error_best:
        error_best = actual_loss
        best_left = node.left_node
        best_right = node.right_node
        best_t = thresh
    # Loop over every value of that component and try all possible splits
    i = 0
    while i < len(sorted_indexes):
        if i < len(sorted_indexes) - 1:
            thresh = 0.5 * (X[sorted_indexes[i], j] +
                            X[sorted_indexes[i + 1], j])
        else:
            thresh = 1.5 * X[sorted_indexes[i], j]
        node.threshold = thresh
        ClassificationTree.create_new_children(node, X, labels, node.id, j,
                                               thresh, False)

        actual_loss = zero_one_loss(node, X, labels)
        if actual_loss < error_best:
            error_best = actual_loss
            best_left = node.left_node
            best_right = node.right_node
            best_t = thresh
        i += 1

    return best_t, best_left, best_right
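`best_split` enumerates every candidate threshold on one feature: the midpoint between each pair of consecutive sorted values, plus one threshold past the largest value. The following self-contained sketch runs the same enumeration with a simplified stand-in for `zero_one_loss` that labels each side of the split by its majority class; it is not the project's implementation.

import numpy as np

def split_zero_one_loss(x, y, thresh):
    # 0/1 loss of the split x <= thresh, each side predicting its majority class
    loss = 0
    for side in (y[x <= thresh], y[x > thresh]):
        if len(side) > 0:
            loss += np.sum(side != np.bincount(side).argmax())
    return loss

rng = np.random.default_rng(0)
x = rng.normal(size=50)            # one feature column
y = (x > 0.3).astype(int)

order = np.argsort(x)
best_t, best_loss = None, np.inf
for i in range(len(order)):
    if i < len(order) - 1:
        t = 0.5 * (x[order[i]] + x[order[i + 1]])   # midpoint of consecutive values
    else:
        t = 1.5 * x[order[i]]                        # threshold past the largest value
    loss = split_zero_one_loss(x, y, t)
    if loss < best_loss:
        best_loss, best_t = loss, t

print("best threshold:", best_t, "0/1 loss:", best_loss)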
Example #3
    def best_parallel_split(self, node, X, y, alfa, complexity):

        T = self.classification_tree

        # Create a new fictitious subtree rooted at this node
        new_node = TreeNode.copy_node(node)
        error_best = np.inf
        was_leaf = False
        improve = False
        # rho charges one extra unit of complexity when splitting this leaf adds a node
        rho = 0
        if new_node.is_leaf:
            was_leaf = True
            rho = 1

        if new_node.data_idxs:
            for j in range(len(X[0])):

                # Take all the j-th components of the dataset and sort them
                # in increasing order
                vals = {}
                for point_idx in new_node.data_idxs:
                    vals[point_idx] = X[point_idx, j]

                values = sorted(vals.items(), key=lambda x: x[1])
                sorted_indexes = [idx for idx, _ in values]

                new_node.feature = j

                # Loop over every value of this component and try all the
                # possible splits
                i = -1
                while i < len(sorted_indexes):

                    if i < 0:
                        # Candidate threshold below the smallest value of feature j
                        thresh = 0.5 * X[sorted_indexes[0], j]
                    elif i < len(sorted_indexes) - 1:
                        # Midpoint between two consecutive sorted values
                        thresh = 0.5 * (X[sorted_indexes[i], j] +
                                        X[sorted_indexes[i + 1], j])
                    else:
                        # Candidate threshold above the largest value
                        thresh = 1.5 * X[sorted_indexes[i], j]

                    new_node.threshold = thresh
                    # If the node being optimized was a leaf, we must create the child
                    # leaves; these are immediately optimized by majority class
                    if was_leaf:
                        ClassificationTree.create_new_children(
                            new_node,
                            X,
                            y,
                            self.max_id,
                            j,
                            thresh,
                            oblique=T.oblique)

                    actual_loss = ClassificationTree.misclassification_loss(
                        new_node, X, y, sorted_indexes,
                        T.oblique) + alfa * (complexity + rho)

                    if actual_loss < error_best:
                        improve = True
                        error_best = actual_loss
                        best_t = thresh
                        best_feature = j
                    i += 1

            #print ("error best: ", error_best)
            new_node.threshold = best_t
            new_node.feature = best_feature
            if was_leaf and improve:
                self.max_id += 2
        return new_node, error_best
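`best_parallel_split` scores every candidate split with the penalized objective misclassification loss + alfa * (complexity + rho), where rho = 1 when the optimized node was a leaf, so turning a leaf into a split pays one extra unit of tree complexity. The sketch below illustrates that trade-off with simplified stand-in losses (raw counts of misclassified points, not the project's misclassification_loss).

import numpy as np

def leaf_loss(y):
    # Misclassifications if the node stays a leaf predicting its majority class
    return int(np.sum(y != np.bincount(y).argmax()))

def split_loss(x, y, thresh):
    # Misclassifications if the node splits on x <= thresh, each child by majority class
    return sum(leaf_loss(side) for side in (y[x <= thresh], y[x > thresh]) if len(side) > 0)

rng = np.random.default_rng(1)
x = rng.normal(size=40)
y = (x > 0).astype(int)

alfa, complexity = 0.5, 3
rho = 1                                   # the node was a leaf: a split adds one node

objective_as_leaf = leaf_loss(y) + alfa * complexity
objective_best_split = min(split_loss(x, y, t) + alfa * (complexity + rho)
                           for t in np.sort(x))

print("objective if kept as leaf:", objective_as_leaf)
print("best objective if split  :", objective_best_split)   # accept the split only if lower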