Esempio n. 1
0
class TestEmbedding(unittest.TestCase):
    """
    Dear Embedding Wrapper Classes,

    I would like you to:
    + create embeddings from categories
    ++ create OneHot embedding
    ++ create random embedding into n dimensions
    + transform any category label into the appropriate embedding
    - translate an embedding back to readable label or dummycode
    """

    def setUp(self):
        """Parse the etalon input CSV and wrap it in a CData with cross_val=0."""
        # parse_csv yields three items; the third (headers) is not needed here.
        features, labels, _headers = parse_csv(etalonroot + "/input.csv")
        self.X_ = features
        self.y_ = labels

        self.data = CData((features, labels), cross_val=0)

    def test_embedding_then_reverting_to_onehot_doesnt_break_shapes(self):
        """Set <embedding> to 10, then delete it, checking the label shape.

        After assigning 10 the property must read "embedding" and the labels
        returned by table() must have shape (10, 10); after deleting the
        property it must read "onehot" and the labels must have shape (10, 3).
        """
        frame = self.data
        frame.reset_data(shuff=False)
        frame.crossval = 0

        # --- switch to the 10-wide embedding ---
        frame.embedding = 10
        self.assertEqual(
            frame.embedding,
            "embedding",
            msg="<embedding> setter is faulty! (got {})".format(
                frame.embedding))
        _, embedded = frame.table()
        self.assertEqual(
            embedded.shape,
            (10, 10),
            msg="Embedding of independent variables went wrong! "
                "(got shape {})".format(embedded.shape))

        # --- drop the embedding again; one-hot is expected afterwards ---
        del frame.embedding
        self.assertEqual(
            frame.embedding,
            "onehot",
            msg="<embedding> deleter is faulty! (got {})".format(
                frame.embedding))
        _, dummycoded = frame.table()
        self.assertEqual(
            dummycoded.shape,
            (10, 3),
            msg="OneHot of independent variables went wrong! "
                "(got shape {})".format(dummycoded.shape))
Esempio n. 2
0
class TestTransformations(unittest.TestCase):
    """
    Dear Transformation Wrapper Classes,

    I would like you to:
    + reproduce the stored etalon values for "std", "pca", "lda" and "ica"
    + report the active transformation through the <transformation> property
    + autoencode the learning data into the requested number of features
    """

    @staticmethod
    def _etalon(filename):
        """Return the first item parse_csv gives for an etalon file (float64)."""
        return parse_csv(etalonroot + filename, dtype="float64")[0]

    @staticmethod
    def _sorted_magnitudes(values):
        """Flatten, take absolute values, sort ascending, round to 1 decimal.

        NOTE(review): presumably this makes the comparison insensitive to
        component sign and ordering -- confirm against the transformations.
        """
        return np.round(np.sort(np.abs(values.ravel())), 1)

    def setUp(self):
        """Parse the etalon input CSV and wrap it in a CData with cross_val=0."""
        # parse_csv yields three items; the third (headers) is not needed here.
        features, labels, _headers = parse_csv(etalonroot + "/input.csv")
        self.X_ = features
        self.y_ = labels

        self.data = CData((features, labels), cross_val=0)

    def test_standardization_on_etalon(self):
        """Standardized learning data (rounded to 3 decimals) equals std.csv."""
        self.data.reset_data(shuff=False)

        expected = np.sort(self._etalon("std.csv").ravel())

        self.data.transformation = "std"
        observed = np.round(self.data.learning.astype("float64"), 3)
        observed = np.sort(observed.ravel())

        self.assertEqual(self.data.transformation, "std",
                         msg="The transformation property is faulty!")
        # Exact element-wise equality, as both sides are sorted flat arrays.
        self.assertTrue(np.equal(observed, expected).all(),
                        msg="Standardization is faulty!")

    def test_pca_on_etalon(self):
        """Sorted, rounded magnitudes of PCA output are close to pca.csv."""
        self.data.reset_data(shuff=False)

        expected = self._sorted_magnitudes(self._etalon("pca.csv"))

        self.data.transformation = "pca"
        observed = self._sorted_magnitudes(
            self.data.learning.astype("float64"))
        close = np.isclose(observed, expected)

        self.assertEqual(self.data.transformation, "pca",
                         msg="The transformation property is faulty!")
        self.assertTrue(close.all(), msg="PCA is faulty!")

    def test_lda_on_etalon(self):
        """Sorted, rounded magnitudes of LDA output are close to lda.csv."""
        self.data.reset_data(shuff=False)

        expected = self._sorted_magnitudes(self._etalon("lda.csv"))

        self.data.transformation = "lda"
        observed = self._sorted_magnitudes(
            self.data.learning.astype("float64"))
        close = np.isclose(observed, expected)

        self.assertEqual(self.data.transformation, "lda",
                         msg="The transformation property is faulty!")
        self.assertTrue(close.all(), msg="LDA is faulty!")

    def test_ica_on_etalon(self):
        """Sorted, rounded magnitudes of ICA output are close to ica.csv.

        Uses np.allclose with rtol=1e-4 and atol=1e-7, unlike the PCA/LDA
        tests, which use np.isclose with its default tolerances.
        """
        self.data.reset_data(shuff=False)

        expected = self._sorted_magnitudes(self._etalon("ica.csv"))

        self.data.transformation = "ica"
        observed = self._sorted_magnitudes(
            self.data.learning.astype("float64"))

        self.assertEqual(self.data.transformation, "ica",
                         msg="The transformation property is faulty!")
        self.assertTrue(
            np.allclose(observed, expected, rtol=1e-4, atol=1e-7),
            msg="ICA is faulty!")

    def test_autoencoding_on_etalon(self):
        """Setting ("ae", 10) reports "autoencoding" and yields shape (10, 10)."""
        frame = self.data
        frame.reset_data(shuff=False)
        frame.transformation = ("ae", 10)

        self.assertEqual(
            frame.transformation,
            "autoencoding",
            msg="Autoencoding failed on the <transformation> property "
                "assertion!")
        self.assertEqual(
            frame.learning.shape,
            (10, 10),
            msg="Autoencoding failed on the output shape test!")
Esempio n. 3
0
class TestCData(unittest.TestCase):
    """
    Dear Categorical Dataframe,

    I would like you to:
    + hold categorical data for me.
    + partition the data to learning and testing cases
    + be able to generate weights based on the representation ratio of different classes
    + transform (whiten, autoencode, standardize) the independent variables
     and adjust the <inputs_required> accordingly.
     These transformations should be fitted only on the learning data!
    + dummycode/embed the categorical variable:
     create the one-hot vector representations of categories OR
     embed the categorical variable into N-space,
     adjust <outputs_required> accordingly,
     and be able to translate the network output back to human readable class names
    + be able to reset transformations and embeddings if this is desirable
     without the loss of information.
    + create a learning table from the data
    + generate random batches from the data
    - Handle multiple labels and be able to average similarly labelled samples
    """
    def setUp(self):
        """Parse the etalon input CSV and wrap it in a CData with cross_val=0."""
        # parse_csv yields three items; the third (headers) is not needed here.
        self.X_, self.y_, _headers = parse_csv(etalonroot + "/input.csv")

        self.data = CData((self.X_, self.y_), cross_val=0)

    def test_initialization_on_etalon_with_given_parameters(self):
        """A CData built with cross_val=0.5 starts one-hot with a 5/5 split."""
        new_data = CData((self.X_, self.y_), cross_val=0.5)
        self.assertEqual(
            new_data.embedding, "onehot",
            "<embedding> property is faulty after initialization!")
        self.assertEqual(
            len(new_data.categories), 3,
            "Invalid determination of categories! (got {})".format(
                new_data.categories))
        self.assertEqual(new_data.crossval, 0.5,
                         "Wrong <crossval> value in data!")
        self.assertEqual(new_data.N, 5)
        self.assertEqual(new_data.N, new_data.learning.shape[0],
                         "Validation data splitting went wrong @ learning!")
        self.assertEqual(new_data.n_testing, 5)
        self.assertEqual(new_data.n_testing, new_data.testing.shape[0],
                         "Validation data splitting went wrong @ testing!")

    def test_reset(self):
        """reset_data() after embedding + PCA restores shape and data sum.

        The learning shape is expected to be (7, 3) after the reset, and the
        sum over <data> must equal that of an untouched reference CData.
        """
        data2 = CData((self.X_, self.y_), cross_val=0)
        er = "Difference detected in data shapes"

        self.data.crossval = 3
        self.data.embedding = 10
        self.data.transformation = ("pca", 1)

        self.data.reset_data(shuff=False)
        sm1, sm2 = np.sum(self.data.data), np.sum(data2.data)

        self.assertEqual(self.data.learning.shape, (7, 3), msg=er)
        self.assertEqual(
            sm1,
            sm2,
            msg="The sums of learning data differ by {}!\n{}\n{}".format(
                abs(sm1 - sm2), sm1, sm2))

    def test_core_data_is_readonly(self):
        """Item assignment into <data> must raise ValueError."""
        with self.assertRaises(ValueError):
            self.data.data[0][0] = 2.0

    def test_setter_sets_crossval_getter_right(self):
        """Assigning crossval = 5 reads back as 0.5 and gives a 5/5 split."""
        self.data.crossval = 5
        self.assertEqual(self.data.crossval, 0.5,
                         "Wrong <crossval> value in data!")
        self.assertEqual(self.data.N, 5)
        self.assertEqual(self.data.N, self.data.learning.shape[0],
                         "Validation data splitting went wrong @ learning!")
        self.assertEqual(self.data.n_testing, 5)
        self.assertEqual(self.data.n_testing, self.data.testing.shape[0],
                         "Validation data splitting went wrong @ testing!")

    def test_weights_sum_to_N(self):
        """The rounded sum of <sample_weights> equals <N>."""
        w = self.data.sample_weights
        self.assertEqual(round(w.sum()), self.data.N)

    def test_concatenate_produces_right_shape(self):
        """Concatenating two etalon CData objects gives N == 20, shape (20, 3)."""
        newdata = CData((self.X_, self.y_), cross_val=0)
        newdata.concatenate(self.data)
        self.assertEqual(newdata.N, 20,
                         "Split after concatenation went wrong!")
        self.assertEqual(newdata.data.shape, (20, 3),
                         "Shapes went haywire after concatenation!")

    def test_batches_from_generator_are_shaped_and_distributed_right(self):
        """batchgen(2, weigh=True) yields 4 batches of the expected shapes.

        With crossval = 2, an ("ae", 15) transformation and embedding = 3,
        every batch must be X: (2, 15), y: (2, 3), w: (2,).
        """
        i = 0
        self.data.crossval = 2
        self.data.transformation = ("ae", 15)
        self.data.embedding = 3
        for X, y, w in self.data.batchgen(2, weigh=True):
            self.assertEqual(X.shape, (2, 15))
            self.assertEqual(y.shape, (2, 3))
            self.assertEqual(w.shape, (2, ))
            i += 1
        self.assertEqual(
            i,
            4,
            msg="Number of batches differ. Got {} expected {}".format(i, 4))

    def test_neurons_required_property_on_untransformed_data(self):
        """<neurons_required> on fresh data is a pair of tuples, (3,) and (3,)."""
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        # The original re-checked `inshape` here, so the type of `outputs`
        # was never verified even though the message refers to it.
        self.assertIsInstance(outputs, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (3, ))
        self.assertEqual(outputs, (3, ))

    def test_neurons_required_proprety_after_heavy_transformation_then_resetting(
            self):
        """<neurons_required> follows embedding/PCA and reverts after a reset.

        With embedding = 10 and ("pca", 2) the property must be ((2,), (10,));
        after reset_data() it must be ((3,), (3,)) again.
        """
        self.data.embedding = 10
        self.data.transformation = ("pca", 2)
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        self.assertIsInstance(outputs, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (2, ),
                         "Wrong input shape after transformation/embedding!")
        self.assertEqual(outputs, (10, ),
                         "Wrong output shape after transformation/embedding!")
        self.data.reset_data(shuff=False)
        inshape, outputs = self.data.neurons_required
        self.assertIsInstance(inshape, tuple, "Input shape is not in a tuple!")
        self.assertIsInstance(outputs, tuple,
                              "Output shape is not in a tuple!")
        self.assertEqual(inshape, (3, ), "Wrong input shape after resetting!")
        self.assertEqual(outputs, (3, ), "Wrong output shape after resetting!")