Exemple #1
0
class SampleTrainValTestSet(unittest.TestCase):
    """Tests for ``Sampler.custom_distribution`` on the skewed CSV fixture.

    Every test draws from a ``Sampler`` built on ``BlockDesigner.remainder()``
    and checks either the size or the per-label make-up of the result.
    """

    # Second argument passed to ``custom_distribution`` in every test; the
    # returned sample count is asserted to be a multiple of it.
    BATCH_SIZE = 128

    def setUp(self):
        """Create a fresh BlockDesigner and Sampler for each test."""
        f, self.true_proportions = create_skewed_CSV()
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)
        self.samp = Sampler(self.bd.remainder())

    def _group_by_label(self, X, y):
        """Return a dict mapping each label in *y* to its entries of *X*.

        Only labels that actually occur in *y* get a key.
        """
        grouped = {klass: [] for klass in set(y)}
        # Indexing X (rather than zipping) keeps the IndexError that a
        # too-short X would have raised before.
        for i, klass in enumerate(y):
            grouped[klass].append(X[i])
        return grouped

    def _assert_proportions_close(self, proportions, expected, n_classes):
        """Assert *n_classes* entries differ by < PROPORTION_ERROR_MARGIN."""
        within_margin = abs(proportions - expected) < PROPORTION_ERROR_MARGIN
        # Counting the matches (instead of all()) also pins the number of
        # compared entries, as the original expression did.
        self.assertEqual(sum(within_margin), n_classes)

    def test_all_classes(self):
        """X and y have equal length, a multiple of the batch size."""
        for test_klass in range(self.bd.K):
            X, y = self.samp.custom_distribution(test_klass, self.BATCH_SIZE)
            self.assertEqual(len(y), len(X))
            self.assertEqual(len(X) % self.BATCH_SIZE, 0)

    def test_cycles_through_all_data(self):
        """Two consecutive draws together hold the expected distinct items."""
        X, _ = self.samp.custom_distribution(0, self.BATCH_SIZE)
        X2, _ = self.samp.custom_distribution(0, self.BATCH_SIZE)
        self.assertEqual(
            len(set(X + X2)), sum(ACTUAL_TRAIN_DR_PROPORTIONS))

    def test_custom_distribution(self):
        """An explicit per-class count list reproduces the true proportions."""
        X, y = self.samp.custom_distribution(
            0, self.BATCH_SIZE, [94, 9, 19, 3, 3])
        grouped = self._group_by_label(X, y)
        self._assert_proportions_close(
            get_proportions(grouped), self.true_proportions, self.K)

    def test_skipping_classes(self):
        """Zero counts leave only two labels, each at half of the draw."""
        X, y = self.samp.custom_distribution(
            0, self.BATCH_SIZE, [64, 64, 0, 0, 0])
        grouped = self._group_by_label(X, y)
        self._assert_proportions_close(
            get_proportions(grouped), numpy.array([0.5, 0.5]), 2)
Exemple #2
0
class SampleTrainValTestSet(unittest.TestCase):
    """Tests for ``Sampler.custom_distribution`` on the skewed CSV fixture.

    Every test draws from a ``Sampler`` built on ``BlockDesigner.remainder()``
    and checks either the size or the per-label make-up of the result.
    """

    # Second argument passed to ``custom_distribution`` in every test; the
    # returned sample count is asserted to be a multiple of it.
    BATCH_SIZE = 128

    def setUp(self):
        """Create a fresh BlockDesigner and Sampler for each test."""
        f, self.true_proportions = create_skewed_CSV()
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)
        self.samp = Sampler(self.bd.remainder())

    def _group_by_label(self, X, y):
        """Return a dict mapping each label in *y* to its entries of *X*.

        Only labels that actually occur in *y* get a key.
        """
        grouped = {klass: [] for klass in set(y)}
        # Indexing X (rather than zipping) keeps the IndexError that a
        # too-short X would have raised before.
        for i, klass in enumerate(y):
            grouped[klass].append(X[i])
        return grouped

    def _assert_proportions_close(self, proportions, expected, n_classes):
        """Assert *n_classes* entries differ by < PROPORTION_ERROR_MARGIN."""
        within_margin = abs(proportions - expected) < PROPORTION_ERROR_MARGIN
        # Counting the matches (instead of all()) also pins the number of
        # compared entries, as the original expression did.
        self.assertEqual(sum(within_margin), n_classes)

    def test_all_classes(self):
        """X and y have equal length, a multiple of the batch size."""
        for test_klass in range(self.bd.K):
            X, y = self.samp.custom_distribution(test_klass, self.BATCH_SIZE)
            self.assertEqual(len(y), len(X))
            self.assertEqual(len(X) % self.BATCH_SIZE, 0)

    def test_cycles_through_all_data(self):
        """Two consecutive draws together hold the expected distinct items."""
        X, _ = self.samp.custom_distribution(0, self.BATCH_SIZE)
        X2, _ = self.samp.custom_distribution(0, self.BATCH_SIZE)
        self.assertEqual(
            len(set(X + X2)), sum(ACTUAL_TRAIN_DR_PROPORTIONS))

    def test_custom_distribution(self):
        """An explicit per-class count list reproduces the true proportions."""
        X, y = self.samp.custom_distribution(
            0, self.BATCH_SIZE, [94, 9, 19, 3, 3])
        grouped = self._group_by_label(X, y)
        self._assert_proportions_close(
            get_proportions(grouped), self.true_proportions, self.K)

    def test_skipping_classes(self):
        """Zero counts leave only two labels, each at half of the draw."""
        X, y = self.samp.custom_distribution(
            0, self.BATCH_SIZE, [64, 64, 0, 0, 0])
        grouped = self._group_by_label(X, y)
        self._assert_proportions_close(
            get_proportions(grouped), numpy.array([0.5, 0.5]), 2)
Exemple #3
0
class CreateTrainValTestSet(unittest.TestCase):
    """Tests for carving blocks out of the skewed CSV with BlockDesigner.

    The tests break blocks off a ``BlockDesigner`` and check that block
    sizes add up and that per-class proportions stay within the configured
    error margins.
    """

    # Sizes passed to the BlockDesigner calls below.
    BATCH_SIZE = 128
    VALID_SIZE = 4864
    TEST_SIZE = 1024

    def setUp(self):
        """Create a fresh BlockDesigner over the skewed CSV for each test."""
        f, self.true_proportions = create_skewed_CSV()
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)

    def get_counts(self, dataset):
        """Return the per-class entry counts of *dataset* as a numpy array.

        *dataset* is indexed by class number; counts are listed from class
        ``self.K - 1`` down to class ``0``.
        """
        return numpy.array(
            [len(dataset[klass]) for klass in reversed(range(self.K))])

    def _ideal_batch_counts(self):
        """Return ``int(BATCH_SIZE * p)`` for each true proportion ``p``."""
        return numpy.array(
            [int(self.BATCH_SIZE * p) for p in self.true_proportions])

    def _assert_batch_balanced(self, batch, ideal_counts):
        """Assert *batch* is full-sized and close to *ideal_counts*."""
        counts = self.get_counts(batch)
        self.assertEqual(sum(counts), self.BATCH_SIZE)
        self.assertLess(
            sum(abs(counts - ideal_counts)), SAMPLE_COUNT_ERROR_MARGIN)

    def _assert_true_proportions(self, dataset, label):
        """Assert all K proportions of *dataset* match the true ones."""
        deviation = abs(get_proportions(dataset) - self.true_proportions)
        # Counting matches (instead of all()) also pins the number of
        # compared entries to K, as the original expression did.
        self.assertEqual(
            sum(deviation < PROPORTION_ERROR_MARGIN), self.K,
            "%s proportions deviate from the true proportions" % label)

    def test_instantiating_and_splitting_multiple_times(self):
        """Two designers built from the same remainder both give good batches."""
        # Return value unused; the call is kept because it precedes
        # remainder() and size() on the same designer.
        self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()
        train_batches_to_take = self.bd.size() // self.BATCH_SIZE

        bd2 = BlockDesigner(train_dataset)
        batches2 = bd2.break_off_multiple_blocks(
            train_batches_to_take, self.BATCH_SIZE)
        bd3 = BlockDesigner(train_dataset)
        batches3 = bd3.break_off_multiple_blocks(
            train_batches_to_take, self.BATCH_SIZE)

        ideal_counts = self._ideal_batch_counts()
        # Index-based access: only len() and [] are known to be supported
        # by the returned batch container.
        for i in range(len(batches2)):
            self._assert_batch_balanced(batches2[i], ideal_counts)
            self._assert_batch_balanced(batches3[i], ideal_counts)

    def test_small_blocks_for_consistency(self):
        """Splitting the validation block into batches uses it up entirely."""
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)

        bd2 = BlockDesigner(valid_dataset)
        batches = bd2.break_off_multiple_blocks(
            self.VALID_SIZE // self.BATCH_SIZE, self.BATCH_SIZE)

        ideal_counts = self._ideal_batch_counts()

        self.assertEqual(bd2.size(), 0)
        for i in range(len(batches)):
            self._assert_batch_balanced(batches[i], ideal_counts)

    def test_no_test_set(self):
        """Validation block plus remainder cover the data proportionally."""
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()

        total = sum(
            self.get_counts(valid_dataset) + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)

        self._assert_true_proportions(valid_dataset, "valid")
        self._assert_true_proportions(train_dataset, "train")

    def test_all_sets(self):
        """Test, validation and remainder cover the data proportionally."""
        test_dataset = self.bd.break_off_block(self.TEST_SIZE)
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()

        total = sum(
            self.get_counts(test_dataset) +
            self.get_counts(valid_dataset) +
            self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)

        self._assert_true_proportions(test_dataset, "test")
        self._assert_true_proportions(valid_dataset, "valid")
        self._assert_true_proportions(train_dataset, "train")
class CreateTrainValTestSet(unittest.TestCase):
    """Tests for carving blocks out of the skewed CSV with BlockDesigner.

    The tests break blocks off a ``BlockDesigner`` and check that block
    sizes add up and that per-class proportions stay within the configured
    error margins.
    """

    # Sizes passed to the BlockDesigner calls below.
    BATCH_SIZE = 128
    VALID_SIZE = 4864
    TEST_SIZE = 1024

    def setUp(self):
        """Create a fresh BlockDesigner over the skewed CSV for each test."""
        f, self.true_proportions = create_skewed_CSV()
        self.K = len(self.true_proportions)
        self.bd = BlockDesigner(f, self.K)

    def get_counts(self, dataset):
        """Return the per-class entry counts of *dataset* as a numpy array.

        *dataset* is indexed by class number; counts are listed from class
        ``self.K - 1`` down to class ``0``.
        """
        return numpy.array(
            [len(dataset[klass]) for klass in reversed(range(self.K))])

    def _ideal_batch_counts(self):
        """Return ``int(BATCH_SIZE * p)`` for each true proportion ``p``."""
        return numpy.array(
            [int(self.BATCH_SIZE * p) for p in self.true_proportions])

    def _assert_batch_balanced(self, batch, ideal_counts):
        """Assert *batch* is full-sized and close to *ideal_counts*."""
        counts = self.get_counts(batch)
        self.assertEqual(sum(counts), self.BATCH_SIZE)
        self.assertLess(
            sum(abs(counts - ideal_counts)), SAMPLE_COUNT_ERROR_MARGIN)

    def _assert_true_proportions(self, dataset, label):
        """Assert all K proportions of *dataset* match the true ones."""
        deviation = abs(get_proportions(dataset) - self.true_proportions)
        # Counting matches (instead of all()) also pins the number of
        # compared entries to K, as the original expression did.
        self.assertEqual(
            sum(deviation < PROPORTION_ERROR_MARGIN), self.K,
            "%s proportions deviate from the true proportions" % label)

    def test_instantiating_and_splitting_multiple_times(self):
        """Two designers built from the same remainder both give good batches."""
        # Return value unused; the call is kept because it precedes
        # remainder() and size() on the same designer.
        self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()
        train_batches_to_take = self.bd.size() // self.BATCH_SIZE

        bd2 = BlockDesigner(train_dataset)
        batches2 = bd2.break_off_multiple_blocks(
            train_batches_to_take, self.BATCH_SIZE)
        bd3 = BlockDesigner(train_dataset)
        batches3 = bd3.break_off_multiple_blocks(
            train_batches_to_take, self.BATCH_SIZE)

        ideal_counts = self._ideal_batch_counts()
        # Index-based access: only len() and [] are known to be supported
        # by the returned batch container.
        for i in range(len(batches2)):
            self._assert_batch_balanced(batches2[i], ideal_counts)
            self._assert_batch_balanced(batches3[i], ideal_counts)

    def test_small_blocks_for_consistency(self):
        """Splitting the validation block into batches uses it up entirely."""
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)

        bd2 = BlockDesigner(valid_dataset)
        batches = bd2.break_off_multiple_blocks(
            self.VALID_SIZE // self.BATCH_SIZE, self.BATCH_SIZE)

        ideal_counts = self._ideal_batch_counts()

        self.assertEqual(bd2.size(), 0)
        for i in range(len(batches)):
            self._assert_batch_balanced(batches[i], ideal_counts)

    def test_no_test_set(self):
        """Validation block plus remainder cover the data proportionally."""
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()

        total = sum(
            self.get_counts(valid_dataset) + self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)

        self._assert_true_proportions(valid_dataset, "valid")
        self._assert_true_proportions(train_dataset, "train")

    def test_all_sets(self):
        """Test, validation and remainder cover the data proportionally."""
        test_dataset = self.bd.break_off_block(self.TEST_SIZE)
        valid_dataset = self.bd.break_off_block(self.VALID_SIZE)
        train_dataset = self.bd.remainder()

        total = sum(
            self.get_counts(test_dataset) +
            self.get_counts(valid_dataset) +
            self.get_counts(train_dataset))
        self.assertEqual(total, self.bd.init_size)

        self._assert_true_proportions(test_dataset, "test")
        self._assert_true_proportions(valid_dataset, "valid")
        self._assert_true_proportions(train_dataset, "train")