コード例 #1
0
class SimBinarySVM:
    def __init__(self, gamma=0.1, C=1.0, verbose=False):
        """Store the hyper-parameters; nothing is trained here.

        Args:
            gamma: RBF kernel coefficient. Used for the precomputed Gram
                matrix and handed to every ``sklearn.svm.SVC`` built by
                ``train``.
            C: ``C`` argument handed to every ``sklearn.svm.SVC``.
            verbose: When true, the training steps print their CPU time.
        """
        self.gamma = gamma
        self.C = C
        self.verbose = verbose
        self.tools = None
        # Populated by train(). Declared up front so an untrained instance
        # exposes the attributes predict() reads instead of lacking them.
        self.tree = None
        self.label_to_int = None
        self.int_to_label = None

    # Kernel lookups are served from a Gram matrix computed once up front.
    def make_gram_matrix(self, vectors, gamma):
        """Precompute the RBF Gram matrix and return a lookup kernel.

        Args:
            vectors: Samples handed to
                ``sklearn.metrics.pairwise.rbf_kernel``; row ``i`` is the
                sample carrying index ``i``.
            gamma: RBF coefficient forwarded to ``rbf_kernel``.

        Returns:
            A function ``rbf(a, b)`` that reads the kernel value from the
            matrix. Both arguments must carry their row index as their first
            element; the remaining elements are ignored.
        """
        started = time.process_time() if self.verbose else None
        gram = sklearn.metrics.pairwise.rbf_kernel(vectors, gamma=gamma)
        if started is not None:
            print('gram matrix: %.4f' % (time.process_time() - started))

        def rbf(a, b):
            # element 0 of each vector is its row number in the Gram matrix
            row, col = int(a[0]), int(b[0])
            return gram[row, col]

        return rbf

    def _find_separability(self, training_classes):
        """Compute the pairwise separability of all training classes.

        Every sample is tagged with a running index (prepended as element 0)
        so the precomputed kernel from ``make_gram_matrix`` can look it up.
        ``self.tools`` is replaced by a ``Tools`` instance built on that
        kernel. The separability of two classes is their squared distance
        divided by the sum of their squared radii, both obtained from
        ``self.tools``.

        Args:
            training_classes: Mapping of class label to an iterable of
                samples; each sample must provide ``tolist()``.

        Returns:
            Tuple ``(separability, label_to_int, int_to_label)``: a square
            matrix with ``0`` on the diagonal, a dict from label to matrix
            index, and the list of labels ordered by matrix index.
        """
        # NOTE(review): the division below is undefined when both squared
        # radii are zero - confirm what Tools returns for one-sample classes.
        raw_rows = []
        tagged = {}
        for name, points in training_classes.items():
            tagged_rows = []
            for point in points:
                row = point.tolist()
                # the next free global index equals the rows collected so far
                tagged_rows.append([len(raw_rows)] + row)
                raw_rows.append(row)
            tagged[name] = numpy.array(tagged_rows)

        kernel = self.make_gram_matrix(numpy.array(raw_rows), self.gamma)
        self.tools = Tools(kernel)

        # squared radius of every class, computed once and reused per pair
        if self.verbose:
            start_time = time.process_time()
        sq_radiuses = {
            name: self.tools.squared_radius(name, rows)
            for name, rows in tagged.items()
        }
        if self.verbose:
            print('sq_radiuses: %.4f' % (time.process_time() - start_time))

        # labels <-> consecutive integers, in the mapping's iteration order
        int_to_label = list(training_classes.keys())
        label_to_int = {label: pos for pos, label in enumerate(int_to_label)}
        class_cnt = len(int_to_label)

        # symmetric matrix; only the upper triangle is actually computed
        if self.verbose:
            start_time = time.process_time()
        separability = numpy.empty((class_cnt, class_cnt))
        separability.fill(float('inf'))
        for int_a, a in enumerate(int_to_label):
            # a class is not separable from itself
            separability[int_a, int_a] = 0
            for int_b in range(int_a + 1, class_cnt):
                b = int_to_label[int_b]
                sq_dist = self.tools.squared_distance(
                    a, tagged[a], b, tagged[b])
                score = sq_dist / (sq_radiuses[a] + sq_radiuses[b])
                separability[int_a, int_b] = score
                separability[int_b, int_a] = score
        if self.verbose:
            print('separability: %.4f' % (time.process_time() - start_time))

        return separability, label_to_int, int_to_label

    def _construct_mst_graph(self, training_classes, separability):
        """Build the fully linked class graph and extract its MST.

        Args:
            training_classes: Mapping of class label to samples; only the
                number of classes is used here.
            separability: Square matrix; entry ``[i][j]`` becomes the weight
                of the link from node ``i`` to node ``j``.

        Returns:
            Tuple ``(mst_graph, mst_list)``: a graph holding only the MST
            links (added with ``double_link``) and the list returned by
            ``mst()``, sorted by descending weight (element 2 of each link).
        """
        node_cnt = len(training_classes)

        # every ordered pair is linked, the diagonal included
        full_mesh = graph.Graph(node_cnt)
        for src, weights in enumerate(separability):
            for dst, weight in enumerate(weights):
                full_mesh.link(src, dst, weight)

        # heaviest link first
        mst_list = full_mesh.mst()
        mst_list.sort(key=lambda edge: -edge[2])

        # second graph that carries nothing but the MST links
        mst_graph = graph.Graph(node_cnt)
        for edge in mst_list:
            mst_graph.double_link(edge[0], edge[1], edge[2])

        return mst_graph, mst_list

    def _construct_tree(self, mst_graph, mst_list):
        """Turn the MST into a binary tree by cutting its links one by one.

        The root holds everything reachable from node 0. For each link, in
        the order given by ``mst_list``, the link is removed from
        ``mst_graph`` and the tree node returned by ``tree.find`` for the
        link's first endpoint receives two children: the nodes still
        connected with the first endpoint (left) and with the second
        endpoint (right).

        Args:
            mst_graph: Graph holding the MST links; it is modified in place
                (each processed link is unlinked).
            mst_list: Links to cut; elements 0 and 1 are the endpoints.

        Returns:
            The populated ``binarytree.BinaryTree``.
        """
        tree = binarytree.BinaryTree()
        # assumes the graph is connected so node 0 reaches every class
        root_members = mst_graph.connected_with(0)
        tree.add_root(binarytree.BinaryTreeNode(root_members))

        for edge in mst_list:
            first, second = edge[0], edge[1]
            mst_graph.double_unlink(first, second)
            # tree node that currently holds this link's first endpoint
            host = tree.find(first)
            left_part = binarytree.BinaryTreeNode(
                mst_graph.connected_with(first))
            right_part = binarytree.BinaryTreeNode(
                mst_graph.connected_with(second))
            tree.add_left(host, left_part)
            tree.add_right(host, right_part)

        return tree

    def train(self, training_classes):
        """Build the class tree and fit one binary SVM per internal node.

        The separability matrix, the label mappings, the class count, the
        MST graph/list and the resulting tree are stored on the instance as
        ``separability``, ``label_to_int``, ``int_to_label``, ``class_cnt``,
        ``mst_graph``, ``mst_list`` and ``tree``. Each internal tree node
        receives an ``svm`` attribute trained to tell the classes of its left
        child (label 0) from all remaining classes of that node (label 1).

        Args:
            training_classes: Mapping of class label to samples; each value
                must provide ``tolist()``.

        Returns:
            The number of SVMs that were fitted.
        """
        separability, label_to_int, int_to_label = \
            self._find_separability(training_classes)
        self.separability = separability
        self.label_to_int = label_to_int
        self.int_to_label = int_to_label
        self.class_cnt = len(training_classes)

        mst_graph, mst_list = self._construct_mst_graph(
            training_classes, separability)
        self.mst_graph = mst_graph
        self.mst_list = mst_list

        # cutting the MST links in list order yields the binary tree
        self.tree = tree = self._construct_tree(mst_graph, mst_list)

        svm_cnt = 0

        def fit_node(current, universe):
            """Fit the SVM of ``current`` and recurse into both children."""
            nonlocal svm_cnt
            # a node without children cannot be separated any further
            if current.left is None and current.right is None:
                return

            # classes listed in the left child go left, the rest go right
            left_class = {}
            right_class = {}
            for class_name, class_samples in universe.items():
                if class_name in current.left.val:
                    left_class[class_name] = class_samples
                else:
                    right_class[class_name] = class_samples

            # left side is labelled 0, right side is labelled 1
            training = []
            label = []
            for side, group in enumerate((left_class, right_class)):
                for class_samples in group.values():
                    samples = class_samples.tolist()
                    training.extend(samples)
                    label.extend([side] * len(samples))

            svm = sklearn.svm.SVC(
                kernel='rbf', gamma=self.gamma, C=self.C,
            ).fit(numpy.array(training), numpy.array(label))
            svm_cnt += 1
            # the classifier lives on the node itself (attribute added here)
            current.svm = svm

            fit_node(current.left, left_class)
            fit_node(current.right, right_class)

        if self.verbose:
            start_time = time.process_time()
        # tree nodes hold integer class ids, so re-key the samples
        universe = {
            label_to_int[key]: val for key, val in training_classes.items()
        }
        fit_node(tree.root, universe)
        if self.verbose:
            print('train: %.4f' % (time.process_time() - start_time))
        return svm_cnt

    def predict(self, sample):
        """Classify one sample by walking the trained binary tree.

        Starting at ``self.tree.root``, the SVM stored on each internal node
        picks the branch: a predicted ``0`` descends left, anything else
        descends right. The walk stops at a node without children.

        Args:
            sample: Feature vector of a single sample. A 1-D vector is
                promoted to a one-row 2-D array before it reaches the node
                classifier; a one-row 2-D array is passed through unchanged.

        Returns:
            Tuple ``(label, iterations)``: ``label`` is ``self.int_to_label``
            indexed by the first entry of the reached leaf's ``val``;
            ``iterations`` is the number of classifiers consulted.
        """
        # scikit-learn estimators take (n_samples, n_features) input and
        # reject a bare 1-D vector, so present the sample as a single row
        row = numpy.atleast_2d(sample)

        iterations = 0
        current = self.tree.root
        while not (current.left is None and current.right is None):
            prediction = current.svm.predict(row)
            iterations += 1
            current = current.left if prediction[0] == 0 else current.right

        return self.int_to_label[current.val[0]], iterations

    def test(self, testing_classes):
        total = 0
        errors = 0
        total_itr = 0

        for class_name, tests in testing_classes.items():
            for test in tests:
                total += 1
                prediction, iterations = self.predict(test)
                total_itr += iterations
                if prediction != class_name:
                    errors += 1

        return total, errors, total_itr

    def cross_validate(self, folds, training_classes):
        """Estimate the error rate with ``folds``-fold cross validation.

        Samples are numbered consecutively across all classes (restarting at
        zero for every fold); sample ``no`` is held out for testing in fold
        ``no % folds`` and used for training in every other fold. Each fold
        retrains the model through ``self.train`` and evaluates it through
        ``self.test``.

        Args:
            folds: Number of folds.
            training_classes: Mapping of class label to an iterable of
                samples. Any iterable works: the per-fold subsets are
                converted with ``numpy.array`` before they reach ``train``
                and ``test``.

        Returns:
            Tuple ``(cross_validation_error, acc_total, acc_errors)``: the
            accumulated errors divided by the accumulated number of tested
            samples, followed by those two totals.

        Raises:
            ZeroDivisionError: If no sample was tested at all (for example
                ``folds`` is 0 or ``training_classes`` is empty).
        """
        acc_total = 0
        acc_errors = 0
        for i in range(folds):
            training = {}
            testing = {}
            # running sample number across all classes
            no = 0
            for class_name, class_samples in training_classes.items():
                kept = []
                held_out = []
                for sample in class_samples:
                    if no % folds != i:
                        kept.append(sample)
                    else:
                        # this sample is reserved for testing in fold i
                        held_out.append(sample)
                    no += 1
                training[class_name] = numpy.array(kept)
                testing[class_name] = numpy.array(held_out)

            # train on everything but the held-out part
            self.train(training)

            # evaluate on the held-out part
            test_result = self.test(testing)
            acc_total += test_result[0]
            acc_errors += test_result[1]

        # overall error rate across all folds
        cross_validation_error = acc_errors / acc_total
        return cross_validation_error, acc_total, acc_errors
コード例 #2
0
class SimMultiSVM:
    def __init__(self, gamma=0.1, C=1, verbose=False):
        """Store the hyper-parameters and declare the not-yet-trained state.

        Args:
            gamma: RBF kernel coefficient. Used for the precomputed Gram
                matrix and handed to every ``sklearn.svm.SVC``.
            C: ``C`` argument handed to every ``sklearn.svm.SVC``.
            verbose: When true, the training steps print their CPU time.
        """
        # hyper-parameters
        self.gamma, self.C, self.verbose = gamma, C, verbose
        # results of training, all set by train() / _find_separability()
        self.tree = self.tools = None
        self.label_to_int = self.int_to_label = None

    # The kernel is answered from a Gram matrix that is computed only once.
    def make_gram_matrix(self, vectors, gamma):
        """Precompute the RBF Gram matrix and return a lookup kernel.

        Args:
            vectors: Samples handed to
                ``sklearn.metrics.pairwise.rbf_kernel``; row ``i`` is the
                sample carrying index ``i``.
            gamma: RBF coefficient forwarded to ``rbf_kernel``.

        Returns:
            A function ``rbf(a, b)`` returning the matrix entry addressed by
            the first element of ``a`` (row) and of ``b`` (column).
        """
        verbose = self.verbose
        if verbose:
            clock_start = time.process_time()
        lookup = sklearn.metrics.pairwise.rbf_kernel(vectors, gamma=gamma)
        if verbose:
            print('gram matrix: %.4f' % (time.process_time() - clock_start))

        def rbf(a, b):
            # both vectors carry their Gram-matrix index as element 0
            return lookup[int(a[0]), int(b[0])]
        return rbf

    def _find_separability(self, training_classes):
        """Compute the pairwise separability of all training classes.

        Every sample is tagged with a running index (prepended as element 0)
        so the precomputed kernel from ``make_gram_matrix`` can look it up.
        ``self.tools`` is replaced by a ``Tools`` instance built on that
        kernel. The separability of two classes is their squared distance
        divided by the sum of their squared radii, both obtained from
        ``self.tools``.

        In verbose mode the two timed phases are reported as
        ``sq_radiuses`` and ``separability`` so they can be told apart from
        the ``train`` timing printed by ``train``.

        Args:
            training_classes: Mapping of class label to an iterable of
                samples; each sample must provide ``tolist()``.

        Returns:
            Tuple ``(separability, label_to_int, int_to_label)``: a square
            matrix with ``0`` on the diagonal, a dict from label to matrix
            index, and the list of labels ordered by matrix index.
        """
        # NOTE(review): the division below is undefined when both squared
        # radii are zero - confirm what Tools returns for one-sample classes.
        vectors = []
        training_classes_with_idx = {}
        for name, points in training_classes.items():
            this_class = []
            for point in points:
                vector = point.tolist()
                # the sample's global index is the count collected so far
                this_class.append([len(vectors)] + vector)
                vectors.append(vector)
            training_classes_with_idx[name] = numpy.array(this_class)

        kernel = self.make_gram_matrix(numpy.array(vectors), self.gamma)
        self.tools = Tools(kernel)

        # squared radius of each class, reused for every pair below
        if self.verbose:
            start_time = time.process_time()
        sq_radiuses = {
            name: self.tools.squared_radius(name, points)
            for name, points in training_classes_with_idx.items()
        }
        if self.verbose:
            print('sq_radiuses: %.4f' % (time.process_time() - start_time))

        # relabelling: labels <-> consecutive integers
        int_to_label = list(training_classes.keys())
        label_to_int = {label: i for i, label in enumerate(int_to_label)}
        class_cnt = len(int_to_label)

        # pairs never computed keep the default of very high separability
        if self.verbose:
            start_time = time.process_time()
        separability = numpy.empty((class_cnt, class_cnt))
        separability.fill(float('inf'))
        for int_a, a in enumerate(int_to_label):
            # should be no separability with itself
            separability[int_a, int_a] = 0
            # only the upper triangle is computed; the matrix is symmetric
            for int_b in range(int_a + 1, class_cnt):
                b = int_to_label[int_b]
                sq_dist = self.tools.squared_distance(
                    a,
                    training_classes_with_idx[a],
                    b,
                    training_classes_with_idx[b],
                )
                value = sq_dist / (sq_radiuses[a] + sq_radiuses[b])
                separability[int_a, int_b] = value
                separability[int_b, int_a] = value
        if self.verbose:
            print('separability: %.4f' % (time.process_time() - start_time))

        return separability, label_to_int, int_to_label

    def train(self, training_classes):
        """Build the multi-way class tree and fit one-vs-rest SVMs on it.

        Steps: compute the class separability matrix, link every pair of
        classes in a graph, take its MST, then split the MST recursively:
        within a group, every link whose weight is at least the group's
        average link weight is cut, and the resulting pieces become the
        children of the group's tree node. Afterwards every node that has
        children gets a list ``svms`` with one SVM per child, trained with
        that child's classes labelled 0 and all other classes of the node
        labelled 1.

        On return ``self.tree``, ``self.int_to_label`` and
        ``self.label_to_int`` hold the trained state.

        Args:
            training_classes: Mapping of class label to samples; each value
                must provide ``tolist()``.

        Returns:
            The number of SVMs that were fitted.
        """
        separability, label_to_int, int_to_label = \
            self._find_separability(training_classes)

        # fully linked mesh of all classes, weighted by separability
        class_cnt = len(training_classes)
        mesh = Graph(class_cnt)
        for i, row in enumerate(separability):
            for j, sep in enumerate(row):
                mesh.link(i, j, sep)

        # graph that holds only the MST links of the mesh
        mst_list = mesh.mst()
        mst_graph = Graph(class_cnt)
        for link in mst_list:
            mst_graph.double_link(link[0], link[1], link[2])

        tree = MultiTree()
        # root = every class; assumes all of them are connected with node 0
        tree.add_root(MultiTreeNode(mst_graph.connected_with(0)))

        def split_node(current):
            """Cut the heavy links of ``current``'s group and recurse."""
            sum_weight, edges = mst_graph.sum_weight(current.val[0])
            # no link left: the group cannot be split any further
            if not edges:
                return
            # heaviest link first
            edges.sort(key=lambda edge: -edge[2])
            avg_weight = sum_weight / len(edges)

            # the binary tree only tracks how the group falls apart while
            # links are cut; its leaves are the final pieces
            btree = BinaryTree()
            btree.add_root(BinaryTreeNode(current.val))
            for edge in edges:
                if edge[2] >= avg_weight:
                    mst_graph.double_unlink(edge[0], edge[1])
                    parent = btree.find(edge[0])
                    btree.add_left(parent, BinaryTreeNode(
                        mst_graph.connected_with(edge[0])))
                    btree.add_right(parent, BinaryTreeNode(
                        mst_graph.connected_with(edge[1])))
                # first link below the average, or the last link: every
                # remaining link is lighter, so the pieces are final
                if edge[2] < avg_weight or edge == edges[-1]:
                    for group in btree.leaves():
                        new_node = MultiTreeNode(group)
                        tree.add_child(current, new_node)
                        split_node(new_node)
                    break

        # NOTE: this timer covers the tree construction only; the label it
        # prints is kept unchanged.
        if self.verbose:
            start_time = time.process_time()
        split_node(tree.root)
        if self.verbose:
            print('train: %.4f' % (time.process_time() - start_time))

        svm_cnt = 0

        def fit_node(current, universe):
            """Fit one SVM per child of ``current``, then recurse."""
            nonlocal svm_cnt
            if current.children is None:
                return

            # the part of the universe that belongs to each child
            child_universes = [
                {name: samples for name, samples in universe.items()
                 if name in child.val}
                for child in current.children
            ]

            # Every one-vs-rest problem of this node uses the same samples
            # in the same order; only the labels differ per child.
            class_rows = [(class_int, samples.tolist())
                          for class_int, samples in universe.items()]
            training = numpy.array(
                [row for _, rows in class_rows for row in rows])

            current.svms = [None] * len(current.children)
            for i, child in enumerate(current.children):
                # classes of this child are marked 0, all others 1
                labels = numpy.array([
                    0 if class_int in child.val else 1
                    for class_int, rows in class_rows
                    for _ in rows
                ])
                current.svms[i] = sklearn.svm.SVC(
                    kernel='rbf', gamma=self.gamma, C=self.C,
                ).fit(training, labels)
                svm_cnt += 1
                fit_node(child, child_universes[i])

        # tree nodes hold integer class ids, so re-key the samples
        universe = {
            label_to_int[key]: val for key, val in training_classes.items()
        }
        fit_node(tree.root, universe)

        # expose the trained state
        self.tree = tree
        self.int_to_label = int_to_label
        self.label_to_int = label_to_int
        return svm_cnt

    def predict(self, sample):
        """Classify one sample by walking the trained multi-way tree.

        At every node that has children, each SVM in ``svms`` scores the
        sample with ``decision_function`` and the child whose SVM gives the
        smallest score is followed (the first one on ties). The SVMs are
        trained with the child's own classes labelled 0, so a lower score
        means a better match. The walk stops at a node whose ``children`` is
        ``None``.

        Args:
            sample: Feature vector of a single sample. A 1-D vector is
                promoted to a one-row 2-D array before it reaches the node
                classifiers; only the first row of a 2-D input is scored.

        Returns:
            Tuple ``(label, iterations)``: ``label`` is ``self.int_to_label``
            indexed by the first entry of the reached node's ``val``;
            ``iterations`` is the number of classifiers evaluated.
        """
        # scikit-learn estimators take (n_samples, n_features) input and
        # reject a bare 1-D vector, so present the sample as a single row
        row = numpy.atleast_2d(sample)

        iterations = 0
        current = self.tree.root
        while current.children is not None:
            # [0] turns the one-element result into a plain scalar score
            scores = [svm.decision_function(row)[0] for svm in current.svms]
            iterations += len(current.svms)
            best = min(range(len(scores)), key=scores.__getitem__)
            current = current.children[best]

        return self.int_to_label[current.val[0]], iterations

    def test(self, testing_classes):
        total = 0
        errors = 0
        total_itr = 0

        for class_name, tests in testing_classes.items():
            for test in tests:
                total += 1
                prediction, iterations = self.predict(test)
                total_itr += iterations
                if prediction != class_name:
                    errors += 1

        return total, errors, total_itr

    def cross_validate(self, folds, training_classes):
        """Estimate the error rate with ``folds``-fold cross validation.

        Samples are numbered consecutively across all classes (restarting at
        zero for every fold); a sample is held out for testing in the fold
        equal to its number modulo ``folds`` and used for training in all
        other folds. Each fold retrains through ``self.train`` and evaluates
        through ``self.test``.

        Args:
            folds: Number of folds.
            training_classes: Mapping of class label to an iterable of
                samples.

        Returns:
            Tuple ``(cross_validation_error, acc_total, acc_errors)``: the
            accumulated errors divided by the accumulated number of tested
            samples, followed by those two totals.

        Raises:
            ZeroDivisionError: If no sample was tested at all.
        """
        tested_sum = 0
        error_sum = 0

        for held_out in range(folds):
            train_split = {}
            test_split = {}
            position = 0  # running sample number across all classes
            for label, rows in training_classes.items():
                kept, withheld = [], []
                for row in rows:
                    # every folds-th sample, offset by the fold, is withheld
                    bucket = withheld if position % folds == held_out else kept
                    bucket.append(row)
                    position += 1
                train_split[label] = numpy.array(kept)
                test_split[label] = numpy.array(withheld)

            # fit on the kept part, score on the withheld part
            self.train(train_split)
            outcome = self.test(test_split)
            tested_sum += outcome[0]
            error_sum += outcome[1]

        # overall error rate across all folds
        return error_sum / tested_sum, tested_sum, error_sum