class TestOAOSVM(TestCase):
    """Tests for OAOSVM run against the satimage training subset.

    The dataset and the classifier are class attributes, so they are built
    once when the class body executes and are shared by every test method.
    """

    training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/satimage/sat-train-s.csv'
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)
    class_cnt = len(training_classes.keys())
    gamma = 0.1
    svm = OAOSVM(gamma=gamma)

    def test_train(self):
        """Training on the whole fixture must finish without raising."""
        self.svm.train(self.training_classes)

    def test_predict(self):
        """Every training sample must be predicted as its own class."""
        # One boolean per sample: True when the prediction is wrong.
        wrong_flags = []
        for expected_label, samples in self.training_classes.items():
            wrong_flags.extend(
                bool(self.svm.predict(sample) != expected_label)
                for sample in samples
            )
        total = len(wrong_flags)
        errors = wrong_flags.count(True)
        # printed only to give a feel for the error rate
        print('errors:', errors, ' total:', total)
        assert errors == 0

    def test_cross_validate(self):
        """10-fold cross validation is expected to return exactly 0."""
        folds = 10
        outcome = self.svm.cross_validate(folds, self.training_classes)
        # NOTE(review): the original comment says this check is only
        # indicative ("just to get the idea").
        assert outcome == 0
Exemple #2
0
class TestGroup(TestCase):
    """Placeholder test case; loads the satimage fixture but asserts nothing yet."""

    training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/satimage/sat-train-s.csv'
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)
    class_cnt = len(training_classes.keys())

    def test___init__(self):
        """Intentionally empty: no constructor checks have been written."""
        return None
class TestSimMultiSVM(TestCase):
    """Ordered tests for SimMultiSVM on the satimage training subset.

    The classifier is a class attribute shared by all tests, so later tests
    rely on state produced by earlier ones. The ``pytest.mark.run(after=...)``
    markers express that order; each ``after`` value must be the exact name
    of a test method in this class.
    """

    training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/satimage/sat-train-s.csv'
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)
    class_cnt = len(training_classes.keys())
    gamma = 0.1
    svm = SimMultiSVM(gamma=gamma)

    def test__find_separability(self):
        """The separability matrix is square and the label maps are inverses."""
        (self.svm.separability, self.svm.label_to_int,
         self.svm.int_to_label) = self.svm._find_separability(
             self.training_classes)

        # class_cnt x class_cnt matrix
        assert self.svm.separability.size == self.class_cnt * self.class_cnt
        assert self.svm.separability[0].size == self.class_cnt

        # The fixture is expected to contain exactly 6 classes.
        assert len(self.svm.label_to_int.keys()) == 6

        # int_to_label[idx] must map back to idx through label_to_int
        for idx, val in enumerate(self.svm.int_to_label):
            assert self.svm.label_to_int[val] == idx

    # Fixed: the marker used to name 'test__find_similarity', which does not
    # exist in this class.
    @pytest.mark.run(after='test__find_separability')
    def test_Train(self):
        """After training, every inner tree node has one SVM per child."""
        self.svm.train(self.training_classes)

        def runner(current):
            # Nodes without children end the recursion.
            if current.children is None:
                return

            assert len(current.svms) == len(current.children)
            for child in current.children:
                runner(child)

        runner(self.svm.tree.root)

    # Fixed: the marker used to name 'test_train'; the method is 'test_Train'.
    @pytest.mark.run(after='test_Train')
    def test_predict(self):
        """Every training sample must be predicted as its own class."""
        errors = 0
        total = 0
        for class_name, class_samples in self.training_classes.items():
            for sample in class_samples:
                total += 1
                if self.svm.predict(sample) != class_name:
                    # wrong prediction
                    errors += 1
        # printed only to give a feel for the error rate
        print('errors:', errors, ' total:', total)
        assert errors == 0

    @pytest.mark.run(after='test_predict')
    def test_cross_validate(self):
        """10-fold cross validation is expected to return exactly 0."""
        res = self.svm.cross_validate(10, self.training_classes)
        # NOTE(review): the original comment says this check is only
        # indicative ("just to get the idea").
        assert res == 0
class TestDataset(TestCase):
    """Checks Dataset.load and Dataset.split against the iris CSV fixture."""

    file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/iris.csv'
    dataset = Dataset.load(file)
    splitted = Dataset.split(dataset)

    def test_Load(self):
        """The first loaded feature row must have 4 entries."""
        first_row = self.dataset.features[0]
        assert len(first_row) == 4

    def test_Split(self):
        """Splitting yields 3 groups that together hold every loaded row."""
        groups = self.splitted
        assert len(groups.keys()) == 3

        member_total = 0
        for members in groups.values():
            member_total += len(members)
            # each member keeps its 4 entries
            for member in members:
                assert len(member) == 4
        assert member_total == len(self.dataset.features)
    def _find_separability(self, training_classes):
        """Build the pairwise separability matrix between classes.

        For two distinct classes ``a`` and ``b`` the separability is the
        squared distance returned by the ``Dataset.squared_distance_maker()``
        callable divided by the sum of the two classes' squared radii
        (``Dataset.squared_radius``). The diagonal is 0 and the matrix is
        symmetric.

        Args:
            training_classes: mapping from class label to that class's
                points, iterated in its own key order.

        Returns:
            A tuple ``(separability, label_to_int, int_to_label)`` where
            ``separability`` is a ``class_cnt x class_cnt`` numpy array,
            ``label_to_int`` maps each label to its row/column index and
            ``int_to_label`` is the list that maps indices back to labels.
        """
        find_squared_distance = Dataset.squared_distance_maker()

        # Materialise the labels once instead of rebuilding the key list on
        # every iteration of the pair loop below.
        labels = list(training_classes.keys())
        class_cnt = len(labels)

        sq_radiuses = {
            name: Dataset.squared_radius(points, self.kernel)
            for name, points in training_classes.items()
        }

        def find_separability(a, b):
            sq_ra = sq_radiuses[a]
            sq_rb = sq_radiuses[b]
            sq_dist = find_squared_distance(
                a,
                training_classes[a],
                b,
                training_classes[b],
                self.kernel,
            )
            return sq_dist / (sq_ra + sq_rb)

        # Index i corresponds to the i-th label in key order.
        int_to_label = list(labels)
        label_to_int = {label: i for i, label in enumerate(labels)}

        # default value is very high separability
        separability = numpy.full((class_cnt, class_cnt), float('inf'))
        for int_a, a in enumerate(labels):
            # should be no separability with itself
            separability[int_a, int_a] = 0
            # Visit each unordered pair once and mirror the value.
            for int_b in range(int_a + 1, class_cnt):
                value = find_separability(a, labels[int_b])
                separability[int_a, int_b] = value
                separability[int_b, int_a] = value

        return separability, label_to_int, int_to_label
Exemple #6
0
# Benchmark driver: loads each listed project and iterates over the SVM
# implementations to compare. NOTE(review): this fragment is truncated; the
# body of the inner loop beyond its first statement is not visible here.
print('creating svm and testing with supplied test data')

num_workers = multiprocessing.cpu_count()
print('workers: ', num_workers)

# Each entry is (project name, training file, testing file).
training_files = [
    ('satimage', 'satimage/sat-train-s.csv', 'satimage/sat-test.csv'),
]

for training in training_files:
    project_name = training[0]
    print('working on project: ', project_name)

    # load dataset
    training_file = training[1]
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)

    testing_file = training[2]
    testing_set = Dataset.load(testing_file)
    testing_classes = Dataset.split(testing_set)

    # presumably collects the best result per SVM type; the code that
    # fills it is not visible in this fragment
    best = {}

    # (display name, classifier class) pairs to evaluate
    for each in (
            ('OAO', OAOSVM),
            ('SimBinarySVM', SimBinarySVM),
            ('SimMultiSVM', SimMultiSVM),
    ):

        svm_type = each[0]
    ('letter', 'datasets/letter/letter-train.txt',
     'datasets/letter/letter-test.txt', lambda row: (row[1:], row[0])),
]

# Per-project setup: load and split the training and testing files.
# NOTE(review): this fragment is truncated; `training_files` is defined
# above the visible part and the classifier loop that follows is cut off.
for training in training_files:
    project_name = training[0]
    print('working on project: ', project_name)

    # load dataset
    # An optional 4th tuple element is passed to Dataset.load as `adapter`;
    # entries with only three elements use adapter=None.
    given_adapter = None
    if len(training) > 3:
        given_adapter = training[3]

    training_file = training[1]
    print('train: ', training_file)
    training_set = Dataset.load(training_file, adapter=given_adapter)
    training_classes = Dataset.split(training_set)

    testing_file = training[2]
    print('test:  ', testing_file)
    testing_set = Dataset.load(testing_file, adapter=given_adapter)
    testing_classes = Dataset.split(testing_set)

    # presumably per-classifier best and average results; the code that
    # fills them is not visible in this fragment
    best = {}
    avg = {}

    for each in (
        ('OAO', OAOSVM),
        ('OAA', OAASVM),
        ('SimMultiSVM', SimMultiSVM),
            # ('SimBinarySVM_ORI', SimBinarySVMORI),
Exemple #8
0
class TestSimBinarySVM(TestCase):
    """Ordered tests for SimBinarySVM on the satimage training subset.

    The classifier is a class attribute shared by all tests; each stage
    stores its result on ``self.svm`` so the next stage (ordered through the
    ``pytest.mark.run(after=...)`` markers) can pick it up.
    """

    training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/satimage/sat-train-s.csv'
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)
    class_cnt = len(training_classes.keys())
    gamma = 1e-6
    C = 0.01
    svm = SimBinarySVM(gamma=gamma, C=C)

    def test_find_separability(self):
        """The separability matrix is square and the label maps are inverses."""
        found = self.svm._find_separability(self.training_classes)
        (self.svm.separability,
         self.svm.label_to_int,
         self.svm.int_to_label) = found

        matrix = self.svm.separability
        assert matrix.size == self.class_cnt * self.class_cnt
        assert matrix[0].size == self.class_cnt

        # the fixture is expected to contain exactly 6 classes
        assert len(self.svm.label_to_int.keys()) == 6

        # int_to_label[position] must map back to position
        for position, label in enumerate(self.svm.int_to_label):
            assert self.svm.label_to_int[label] == position

    @pytest.mark.run(after='test_find_separability')
    def test_construct_mst_graph(self):
        """The MST has class_cnt - 1 edges and connects every class."""
        graph, edges = self.svm._construct_mst_graph(
            self.training_classes, self.svm.separability)
        self.svm.mst_graph = graph
        self.svm.mst_list = edges

        assert len(self.svm.mst_list) == self.class_cnt - 1
        assert len(self.svm.mst_graph.connected_with(0)) == self.class_cnt

        # Count the finite entries of the connection matrix.
        finite_entries = sum(
            1
            for row in self.svm.mst_graph.connection
            for dist in row
            if dist != float('inf')
        )

        # the graph is bidirectional, so every edge shows up twice
        assert finite_entries == (self.class_cnt - 1) * 2

    @pytest.mark.run(after='test_construct_mst_graph')
    def test_construct_tree(self):
        """Each inner node's value list is the union of its children's."""
        self.svm.tree = self.svm._construct_tree(
            self.svm.mst_graph, self.svm.mst_list)

        def check(node):
            if node.left is None and node.right is None:
                return

            combined = node.left.val + node.right.val
            assert len(node.val) == len(node.left.val) + len(node.right.val)
            assert set(node.val) == set(combined)

            check(node.left)
            check(node.right)

        check(self.svm.tree.root)

    @pytest.mark.run(after='test_construct_tree')
    def test_train(self):
        """After training, every inner tree node carries an SVM."""
        self.svm.train(self.training_classes)

        def check(node):
            if node.left is None and node.right is None:
                return

            assert node.svm
            check(node.left)
            check(node.right)

        check(self.svm.tree.root)

    @pytest.mark.run(after='test_train')
    def test_predict(self):
        """Every training sample must be predicted as its own class."""
        # One boolean per sample: True when the prediction is wrong.
        wrong_flags = []
        for expected_label, samples in self.training_classes.items():
            wrong_flags.extend(
                bool(self.svm.predict(sample) != expected_label)
                for sample in samples
            )
        total = len(wrong_flags)
        errors = wrong_flags.count(True)
        # printed only to give a feel for the error rate
        print('errors:', errors, ' total:', total)
        assert errors == 0

    @pytest.mark.run(after='test_predict')
    def test_cross_validate(self):
        """10-fold cross validation is expected to return exactly 0."""
        folds = 10
        outcome = self.svm.cross_validate(folds, self.training_classes)
        # NOTE(review): the original comment says this check is only
        # indicative ("just to get the idea").
        assert outcome == 0

    def test_make_gram_matrix(self):
        """The kernel from make_gram_matrix agrees with a direct RBF formula."""
        gamma = 0.1
        vectors = []
        training_classes_with_idx = {}
        next_idx = 0
        for name, points in self.training_classes.items():
            indexed_rows = []
            for point in points:
                vector = point.tolist()
                vectors.append(vector)
                # prefix the running sample index to the feature values
                indexed_rows.append([next_idx] + vector)
                next_idx += 1
            training_classes_with_idx[name] = numpy.array(indexed_rows)

        kernel = self.svm.make_gram_matrix(numpy.array(vectors), gamma)

        def original_kernel(a, b):
            distance = numpy.linalg.norm(a - b)
            return numpy.exp(-gamma * distance**2)

        for samples in training_classes_with_idx.values():
            # Pair each row with a randomly chosen row of the same class.
            shuffled = samples[:].tolist()
            random.shuffle(shuffled)
            shuffled = numpy.array(shuffled)

            for left, right in zip(samples, shuffled):
                # column 0 is the sample index; the features start at 1
                expected = original_kernel(left[1:], right[1:])
                assert abs(kernel(left, right) - expected) < 1e-5
Exemple #9
0
class TestSimBinarySVMORI(TestCase):
    """Ordered tests for SimBinarySVMORI on the satimage training subset.

    The classifier is a class attribute shared by all tests. The
    ``pytest.mark.run(after=...)`` markers express the intended order; each
    ``after`` value must be the exact name of a test method in this class.
    """

    training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/satimage/sat-train-s.csv'
    training_set = Dataset.load(training_file)
    training_classes = Dataset.split(training_set)
    class_cnt = len(training_classes.keys())
    gamma = 1e-6
    C = 0.01
    svm = SimBinarySVMORI(gamma=gamma, C=C)

    def test_create_mapping(self):
        """Creating the label mappings must not raise."""
        self.label_to_int, self.int_to_label = self.svm._create_mapping(
            self.training_classes)

    @pytest.mark.run(after='test_create_mapping')
    def test_create_tree(self):
        """Each group's universe equals the union of its children's."""
        # The mapping is rebuilt here rather than read from the previous test.
        self.label_to_int, self.int_to_label = self.svm._create_mapping(
            self.training_classes)
        self.group_mgr = self.svm._create_tree(self.training_classes,
                                               self.label_to_int)

        def runner(current):
            # Groups without children end the recursion.
            if current.children is None:
                return

            child_universe = []
            for child in current.children:
                child_universe += list(child.universe.keys())
            assert set(current.universe.keys()) == set(child_universe)

            for child in current.children:
                runner(child)

        # Start from the first group held by the manager.
        runner(next(iter(self.group_mgr.groups.values())))

    # Fixed: the marker used to name 'test_construct_tree', which does not
    # exist in this class; the tree test is 'test_create_tree'.
    @pytest.mark.run(after='test_create_tree')
    def test_train(self):
        """After training, every group with children carries an SVM."""
        group_mgr = self.svm.train(self.training_classes)

        def runner(current):
            if current.children is None:
                return

            assert current.svm
            for child in current.children:
                runner(child)

        runner(next(iter(group_mgr.groups.values())))

    @pytest.mark.run(after='test_train')
    def test_predict(self):
        """Every training sample must be predicted as its own class."""
        # Train again so the test does not rely on earlier tests; the return
        # value is not needed here.
        self.svm.train(self.training_classes)
        errors = 0
        total = 0
        for class_name, class_samples in self.training_classes.items():
            for sample in class_samples:
                total += 1
                if self.svm.predict(sample) != class_name:
                    # wrong prediction
                    errors += 1
        # printed only to give a feel for the error rate
        print('errors:', errors, ' total:', total)
        assert errors == 0

    @pytest.mark.run(after='test_predict')
    def test_cross_validate(self):
        """10-fold cross validation is expected to return exactly 0."""
        # Train first; the return value is not needed here.
        self.svm.train(self.training_classes)
        # 10 folds validation
        res = self.svm.cross_validate(10, self.training_classes)
        # NOTE(review): the original comment says this check is only
        # indicative ("just to get the idea").
        assert res == 0

    def test_make_gram_matrix(self):
        """The kernel from make_gram_matrix agrees with a direct RBF formula."""
        gamma = 0.1
        vectors = []
        training_classes_with_idx = {}
        idx = 0
        for name, points in self.training_classes.items():
            this_class = []
            for point in points:
                # give it an index
                vector = point.tolist()
                vector_with_idx = [idx] + vector
                idx += 1
                vectors.append(vector)
                this_class.append(vector_with_idx)
            training_classes_with_idx[name] = numpy.array(this_class)

        vectors = numpy.array(vectors)
        kernel = self.svm.make_gram_matrix(vectors, gamma)

        def original_kernel(a, b):
            # numpy is already used at module level by this method, so the
            # former function-local import was redundant.
            return numpy.exp(-gamma * numpy.linalg.norm(a - b)**2)

        for class_name, samples in training_classes_with_idx.items():
            a = samples
            # Pair each row with a randomly chosen row of the same class.
            b = a[:].tolist()
            random.shuffle(b)
            b = numpy.array(b)

            for i in range(a.shape[0]):
                # column 0 is the sample index; the features start at 1
                assert abs(
                    kernel(a[i], b[i]) -
                    original_kernel(a[i][1:], b[i][1:])) < 1e-5
import time
from treesvm.dataset import Dataset
from treesvm import SimBinarySVM

__author__ = 'phizaz'

def timer(func):
    """Call ``func()`` once and return the process time it consumed.

    The measurement uses ``time.process_time()`` before and after the call.
    The return value of ``func`` is discarded.
    """
    began = time.process_time()
    func()
    finished = time.process_time()
    return finished - began

# Timing script: trains one SimBinarySVM on the letter dataset and prints
# the process time spent in training and in testing.
# The adapter receives a row and returns (row[1:], row[0]).
# ('letter', 'datasets/letter/letter-train.txt', 'datasets/letter/letter-test.txt', lambda row: (row[1:], row[0]))
training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/treesvm/datasets/letter/letter-train.txt'
# training_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/simbinarysvm/generated/generated.csv'
training_set = Dataset.load(training_file, adapter=lambda row: (row[1:], row[0]))
training_classes = Dataset.split(training_set)

testing_file = '/Users/phizaz/Dropbox/waseda-internship/svm-implementations/treesvm/datasets/letter/letter-test.txt'
testing_set = Dataset.load(testing_file, adapter=lambda row: (row[1:], row[0]))
testing_classes = Dataset.split(testing_set)

svm = SimBinarySVM(gamma=0.001, C=10, verbose=True)
def train():
    """Train the module-level ``svm`` on ``training_classes``."""
    svm.train(training_classes)
print('training: %.4f' % (timer(train)))

# Filled in by test(); timer() discards the callable's return value, so the
# result is handed back through this module-level name.
result = None
def test():
    """Run ``svm.test`` on ``testing_classes`` and store it in ``result``."""
    global result
    result = svm.test(testing_classes)
print('testing: %.4f' % (timer(test)))
    # ('pendigits', 'datasets/pendigits/pendigits.tra', 'datasets/pendigits/pendigits.tes', lambda row: (row[:-1], row[-1])),
    ("letter", "datasets/letter/letter-train.txt", "datasets/letter/letter-test.txt", lambda row: (row[1:], row[0]))
]

# Per-project setup: load and split the training and testing files.
# NOTE(review): this fragment is truncated; `training_files` is defined
# above the visible part and the classifier loop that follows is cut off.
for training in training_files:
    project_name = training[0]
    print("working on project: ", project_name)

    # load dataset
    # An optional 4th tuple element is passed to Dataset.load as `adapter`;
    # entries with only three elements use adapter=None.
    given_adapter = None
    if len(training) > 3:
        given_adapter = training[3]

    training_file = training[1]
    print("train: ", training_file)
    training_set = Dataset.load(training_file, adapter=given_adapter)
    training_classes = Dataset.split(training_set)

    testing_file = training[2]
    print("test:  ", testing_file)
    testing_set = Dataset.load(testing_file, adapter=given_adapter)
    testing_classes = Dataset.split(testing_set)

    # presumably per-classifier best and average results; the code that
    # fills them is not visible in this fragment
    best = {}
    avg = {}

    for each in (
        ("OAO", OAOSVM),
        ("OAA", OAASVM),
        ("SimMultiSVM", SimMultiSVM),
        # ('SimBinarySVM_ORI', SimBinarySVMORI),