Ejemplo n.º 1
0
class TestCData(unittest.TestCase):
    """
    Dear Categorical Dataframe,

    I would like you to:
    + hold categorical data for me.
    + partition the data to learning and testing cases
    + be able to generate weights based on the representation ratio of different classes
    + transform (whiten, autoencode, standardize) the independent variables
     and adjust the <inputs_required> accordingly.
     These transformations should fitted only on the learning data!
    + dummycode/embed the categorical variable:
     create the one-hot vector representations of categories OR
     embed the categorical variable into N-space,
     adjust <outputs_required> accordingly,
     and be able to translate the network output back to human readable class names
    + be able to reset transformations and embeddings if this is desirable
     without the loss of information.
    + create a learning table from the data
    + generate random batches from the data
    - Handle multiple labels and be able to average similarily labelled samples
    """
    def setUp(self):
        self.X_, self.y_, headers = parse_csv(etalonroot + "/input.csv")

        self.data = CData((self.X_, self.y_), cross_val=0)

    def test_initialization_on_etalon_with_given_parameters(self):
        new_data = CData((self.X_, self.y_), cross_val=0.5)
        self.assertEqual(
            new_data.embedding, "onehot",
            "<embedding> property is faulty after initialization!")
        self.assertEqual(
            len(new_data.categories), 3,
            "Invalid determination of categories! (got {})".format(
                new_data.categories))
        self.assertEqual(new_data.crossval, 0.5,
                         "Wrong <crossval> value in data!")
        self.assertEqual(new_data.N, 5)
        self.assertEqual(new_data.N, new_data.learning.shape[0],
                         "Validation data splitting went wrong @ learning!")
        self.assertEqual(new_data.n_testing, 5)
        self.assertEqual(new_data.n_testing, new_data.testing.shape[0],
                         "Validation data splitting went wrong @ testing!")

    def test_reset(self):
        data2 = CData((self.X_, self.y_), cross_val=0)
        er = "Difference detected in data shapes"

        self.data.crossval = 3
        self.data.embedding = 10
        self.data.transformation = ("pca", 1)

        self.data.reset_data(shuff=False)
        sm1, sm2 = np.sum(self.data.data), np.sum(data2.data)

        self.assertEqual(self.data.learning.shape, (7, 3), msg=er)
        self.assertEqual(
            sm1,
            sm2,
            msg="The sums of learning data differ by {}!\n{}\n{}".format(
                abs(sm1 - sm2), sm1, sm2))

    def test_core_data_is_readonly(self):
        with self.assertRaises(ValueError):
            self.data.data[0][0] = 2.0

    def test_setter_sets_crossval_getter_right(self):
        self.data.crossval = 5
        self.assertEqual(self.data.crossval, 0.5,
                         "Wrong <crossval> value in data!")
        self.assertEqual(self.data.N, 5)
        self.assertEqual(self.data.N, self.data.learning.shape[0],
                         "Validation data splitting went wrong @ learning!")
        self.assertEqual(self.data.n_testing, 5)
        self.assertEqual(self.data.n_testing, self.data.testing.shape[0],
                         "Validation data splitting went wrong @ testing!")

    def test_weights_sum_to_N(self):
        w = self.data.sample_weights
        self.assertEqual(round(w.sum()), self.data.N)

    def test_concatenate_produces_right_shape(self):
        newdata = CData((self.X_, self.y_), cross_val=0)
        newdata.concatenate(self.data)
        self.assertEqual(newdata.N, 20,
                         "Split after concatenation went wrong!")
        self.assertEqual(newdata.data.shape, (20, 3),
                         "Shapes went haywire after concatenation!")

    def test_batches_from_generator_are_shaped_and_distributed_right(self):
        i = 0
        self.data.crossval = 2
        self.data.transformation = ("ae", 15)
        self.data.embedding = 3
        for X, y, w in self.data.batchgen(2, weigh=True):
            self.assertEqual(X.shape, (2, 15))
            self.assertEqual(y.shape, (2, 3))
            self.assertEqual(w.shape, (2, ))
            i += 1
        self.assertEqual(
            i,
            4,
            msg="Number of batches differ. Got {} expected {}".format(i, 4))

    def test_neurons_required_property_on_untransformed_data(self):
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        self.assertIsInstance(inshape, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (3, ))
        self.assertEqual(outputs, (3, ))

    def test_neurons_required_proprety_after_heavy_transformation_then_resetting(
            self):
        self.data.embedding = 10
        self.data.transformation = ("pca", 2)
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        self.assertIsInstance(outputs, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (2, ),
                         "Wrong input shape after transformation/embedding!")
        self.assertEqual(outputs, (10, ),
                         "Wrong output shape after transformation/embedding!")
        self.data.reset_data(shuff=False)
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        self.assertIsInstance(outputs, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (3, ), "Wrong input shape after resetting!")
        self.assertEqual(outputs, (3, ), "Wrong output shape after resetting!")
Ejemplo n.º 2
0
from csxdata import CData, roots

from brainforge import BackpropNetwork
from brainforge.layers import ConvLayer, PoolLayer, Flatten, DenseLayer, Activation
from brainforge.optimization import RMSprop

data = CData(roots["misc"] + "mnist.pkl.gz", cross_val=10000, fold=True)
ins, ous = data.neurons_required
net = BackpropNetwork(input_shape=ins, layerstack=[
    ConvLayer(3, 8, 8, compiled=False),
    PoolLayer(3, compiled=False), Activation("tanh"),
    Flatten(), DenseLayer(60, activation="tanh"),
    DenseLayer(ous, activation="softmax")
], cost="xent", optimizer=RMSprop(eta=0.01))

net.fit_generator(data.batchgen(bsize=20, infinite=True), lessons_per_epoch=60000, epochs=30,
                  validation=data.table("testing"))