Example no. 1
0
    def test_order_and_elements_presence(self):
        """
        Verifies that shuffling permutes the values of every attribute
        while preserving the exact multiset of elements.
        """
        chunk_sizes = [30, 10, 100, 320]
        attrs_counts = [2, 1, 3, 15]

        for chunk_size, attrs_count in product(chunk_sizes, attrs_counts):
            chunk = generate_data_chunk(attrs_count, chunk_size)
            reference_chunk = copy.deepcopy(chunk)

            shuffled_chunk = Shuffler()(chunk)

            # For every attr the element order must differ from the
            # reference, yet sorting both sides must reveal identical
            # collections of values (nothing lost, nothing added).
            for attr in shuffled_chunk.keys():
                same_positions = isclose(reference_chunk[attr],
                                         shuffled_chunk[attr])
                self.assertFalse(same_positions.all())

                same_sorted = isclose(sorted(reference_chunk[attr]),
                                      sorted(shuffled_chunk[attr]))
                self.assertTrue(same_sorted.all())
Example no. 2
0
    def test_order(self):
        """Testing production of chunks in a different order from the stream."""
        data_sizes = [200, 545]
        attrs_counts = [5, 8, 2, 1, 15]
        input_chunk_sizes = [1, 2, 3, 4, 5]
        buffer_sizes = [2, 38, 1000]

        param_grid = itertools.product(data_sizes, attrs_counts,
                                       buffer_sizes, input_chunk_sizes)
        for data_size, attrs_count, buffer_size, inp_size in param_grid:
            data = generate_data_chunk(attrs_count, data_size)
            inp_chunks = create_list_of_data_chunks(data, inp_size)

            accum = ChunkAccumulator(
                collector=ChunkShuffler(buffer_size=buffer_size))

            produced_chunks = list(accum.iter(inp_chunks))
            actual_ds = concat_chunks(*produced_chunks)

            # Shuffling must change the order, but never the overall size.
            self.assertTrue(data != actual_ds)
            self.assertTrue(len(data) == len(actual_ds))
Example no. 3
0
    def test_chunks_with_different_value_array_sizes(self):
        """Truncating one attr's value array must make validation fail."""
        chunk_len = 100

        for attrs_count in (2, 3, 10, 25, 9):
            chunk = generate_data_chunk(attrs_count, chunk_len)
            victim_attr = np.random.choice(list(chunk.keys()), 1)[0]
            # Drop the last element so this attr is shorter than the rest.
            chunk[victim_attr] = chunk[victim_attr][:-1]
            self.assertRaises(DataChunkError, chunk.validate)
Example no. 4
0
    def test_valid_chunks(self):
        """Validation of well-formed chunks must pass without raising."""
        chunk_sizes = [1, 20, 12, 1023, 100]
        attrs_numbers = [1, 3, 10, 25, 9]

        for attrs_number, chunk_size in product(attrs_numbers, chunk_sizes):
            good_chunk = generate_data_chunk(attrs_number, chunk_size)
            try:
                good_chunk.validate()
            except Exception as e:
                # BUG FIX: the original did `raise self.assertTrue(False)`,
                # which itself raises `TypeError: exceptions must derive from
                # BaseException` (assertTrue returns None) and hides the real
                # failure; `self.fail` is the idiomatic way to report an
                # unexpected exception.
                self.fail(f"validate() raised unexpectedly: {e!r}")
Example no. 5
0
    def test_chunks_with_wrong_value_types_in_constr(self):
        """Testing if an error is thrown for invalid chunk value types"""
        chunk_len = 100
        attrs_counts = [2, 3, 10]
        bad_values = ["dummy_val", [1231, 123123, 12], (), object, 1.23]

        for attrs_count, bad_val in product(attrs_counts, bad_values):
            chunk = generate_data_chunk(attrs_count, chunk_len)
            target_attr = np.random.choice(list(chunk.keys()), 1)[0]
            # Either the assignment itself or the subsequent validation
            # is expected to reject the invalid value type.
            with self.assertRaises(DataChunkError):
                chunk[target_attr] = bad_val
                chunk.validate()
Example no. 6
0
    def test_output(self):
        """Testing whether it produces a valid output."""
        new_fname = "DUMMY"
        dc = generate_data_chunk(10, 1000)
        th = 0.1
        key_fname = list(dc.keys())[0]

        # Named function instead of a lambda bound to a variable;
        # behavior is identical.
        def eval_func(x):
            return x[key_fname] > th

        for dtype in ('int64', 'int32', 'float32', 'float64', 'bool'):
            expected_fvals = (dc[key_fname] > th).astype(dtype)
            marker = DataUnitMarker(new_fname=new_fname,
                                    eval_func=eval_func,
                                    dtype=dtype)
            dc = marker(dc)
            self.assertTrue((dc[new_fname] == expected_fvals).all())
    def test_chunk_size_adjustment_with_random_data_and_params(self):
        """Default scenario when only the size of chunks is adjusted."""
        data_sizes = [100, 102, 54, 35]
        attrs_counts = [5, 8, 2, 1, 15]
        inp_chunk_sizes = [10, 15, 63, 1, 2]
        batch_sizes = [1, 2, 38, 1000]

        combos = itertools.product(data_sizes, attrs_counts, batch_sizes,
                                   inp_chunk_sizes)
        for data_size, attrs_count, batch_size, inp_size in combos:
            data = generate_data_chunk(attrs_count, data_size)
            expected_batches = create_list_of_data_chunks(data, batch_size)
            inp_chunks = create_list_of_data_chunks(data, inp_size)

            batcher = ChunkAccumulator(
                collector=UnitCollector(max_size=batch_size))

            # Re-batched output must match the reference batching one-to-one.
            produced = list(batcher.iter(inp_chunks))
            self.assertEqual(len(expected_batches), len(produced))
            for actual_chunk, expected_batch in zip(produced,
                                                    expected_batches):
                self.assertTrue(actual_chunk == expected_batch)
Example no. 8
0
    def test_output(self):
        """Each attr configured on the applier must be transformed by its
        function; untouched attrs must be left as-is."""
        import copy  # local import: this snippet's module may not import copy

        data_size = 1234
        data_attrs_number = 15
        input_chunks_size = 10
        transform_attrs_number = 10

        functions = [lambda x: np.log(abs(x) + 1), lambda x: np.exp(x),
                     lambda x: x**2]
        data = generate_data_chunk(data_attrs_number, data_size)
        transform_attrs = list(data.keys())[:transform_attrs_number]
        input_data_chunks = create_list_of_data_chunks(data, input_chunks_size)

        for func in functions:
            function_applier = FunctionApplier({a: func for a in transform_attrs})
            for input_data_chunk in input_data_chunks:
                # BUG FIX: the original aliased `expected_chunk` to
                # `input_data_chunk`, so the manual transform below mutated
                # the very object the applier received — successive `func`s
                # then operated on already-transformed data, and if the
                # applier mutates in place the comparison was trivially true.
                # A deep copy keeps the reference computation independent.
                # NOTE(review): assumes FunctionApplier does not mutate its
                # input chunk — confirm against its implementation.
                expected_chunk = copy.deepcopy(input_data_chunk)
                actual_chunk = function_applier(input_data_chunk)

                # transforming manually values of the reference copy
                for transform_attr in transform_attrs:
                    expected_chunk[transform_attr] = \
                        func(expected_chunk[transform_attr])

                self.assertTrue(actual_chunk == expected_chunk)