Example #1
0
 def setUp(self) -> None:
     """Seed the RNG and build a 5x2 fixture with one-minute-spaced timestamps."""
     np.random.seed(1)  # deterministic data across test runs
     minute_stamps = np.array(
         [datetime(2019, 7, 2, 12, minute) for minute in range(5)])
     self.sdf = StandardDataFormat(
         labels=['a', 'b'],
         timestamps=minute_stamps,
         data=np.round(np.random.random((5, 2)) * 100))
Example #2
0
    def _process2d(self,
                   processor_input: StandardDataFormat) -> StandardDataFormat:
        """One-hot encode the configured input columns and append the result.

        The encoded columns replace the original inputs: the input columns
        are dropped and one new column per category value is appended,
        labelled "<outputField>$<value>".
        """
        self.state = cast(skpp.OneHotEncoder, self.state)  # just for ide
        input_fields = [g['inputField'] for g in self.generate]
        output_fields = [g['outputField'] for g in self.generate]

        selected = ColumnSelector(columns=input_fields).process(
            processor_input=processor_input).data
        selected = _ensure_2d(selected)
        self._init_encoder()

        encoded = self.state.transform(selected).toarray()
        # one label per category value, e.g. "hourOneHot$3"
        encoded_labels = [
            f"{output_fields[ix]}${value}"
            for ix, category in enumerate(self.state.categories_)
            for value in category.tolist()
        ]

        encoded_block = StandardDataFormat(
            timestamps=processor_input.timestamps,
            data=encoded,
            labels=encoded_labels)

        remainder = ColumnDropper(columns=input_fields).process(
            processor_input=processor_input)
        return remainder.add_cols(encoded_block)
Example #3
0
    def get(self) -> StandardDataFormat:
        """Return the source data, transparently cached on disk as a pickle.

        The cache key is a SHA-256 over the JSON-serialized source
        description, so identical source configurations share a cache file.
        Caching can be switched off via config key 'general.enable_cache'.
        """
        if not AppConfig.get_config_or_default('general.enable_cache',
                                               default=True):
            self.logger.info("caching disabled")
            return self._get()

        import json
        import os
        logger = self.logger
        # sort_keys makes the JSON representation canonical -> stable hash
        cache_id = hashlib.sha256(
            json.dumps(self.source_description,
                       sort_keys=True).encode('utf-8')).hexdigest()
        logger.info(
            "caching source is enabled. Cache-Id is {0}".format(cache_id))
        path_to_cache = os.path.join(AppConfig['general.dir_cache'],
                                     "cache_{0}".format(cache_id))
        logger.info(f"path for cache is: {path_to_cache}")

        if os.path.isfile(path_to_cache):
            # cache hit: restore the pickled dataframe
            logger.info(MSG_CACHED_VERSION_FOUND.format(path_to_cache))
            return StandardDataFormat.from_dataframe(
                pd.read_pickle(path_to_cache))

        # cache miss: fetch the data, then persist it for the next call
        os.makedirs(AppConfig['general.dir_cache'], exist_ok=True)
        logger.info(MSG_CACHED_VERSION_NOT_FOUND)
        data = self._get()
        data.to_dataframe().to_pickle(path_to_cache)
        return data
    def _fetch(self, _fields: List[Field]) -> StandardDataFormat:
        """Download one series per field, deduplicate and merge them.

        Each field is fetched as JSON, indexed by its timestamp (rounded to
        the minute) and renamed to the field name; rows with a duplicated
        index are dropped before the series are concatenated column-wise.
        """
        self.logger.info(
            f"getting data from time period {self.date_from.isoformat()} - {self.date_to.isoformat()}"
        )
        self.logger.info(
            f"using username for authentification {self.session.auth.username}"
        )
        series = []
        for field in _fields:
            self.logger.info(
                f"getting data for field: {field.name} aka. {field.alias}")
            url = self._get_url(sensor_id=field.name,
                                date_from=self.date_from,
                                date_to=self.date_to)
            self.logger.debug(f"download data from: {url}")
            serie = pd.read_json(self._get_data(url, field))
            serie = serie.set_index('timestamp')
            serie.index = serie.index.round("T")
            serie = serie.rename({"value": field.name}, axis=1)
            dups = serie.index.duplicated()
            # bugfix: `dups` is a boolean array of length == row count, so
            # `if len(dups):` was truthy for ANY non-empty series and the log
            # reported the total row count as the number of duplicates.
            # Count the actually duplicated entries instead.
            n_dups = int(dups.sum())
            if n_dups:
                self.logger.info(
                    f"dropping {n_dups} duplicated (index) measurements from {field}"
                )
                serie = serie[~dups]
            series.append(serie)
            self.logger.debug(f"received rows: {len(serie)}")

        df = pd.concat(series, axis=1)
        return StandardDataFormat.from_dataframe(df)
    def test_aggregation_call(self):
        """The aggregator receives sliding windows and appends its outputs."""
        data = np.random.random((10, 4))
        data[:, 0] = np.arange(0, 10)
        data[:, 1] = np.arange(10, 20)
        data.flags.writeable = False  # guard against accidental mutation

        # two windows of length 9 over the two configured input columns
        windows_expected = np.zeros((2, 9, 2))
        windows_expected[0, :, 0] = np.arange(0, 9)
        windows_expected[0, :, 1] = np.arange(10, 19)
        windows_expected[1, :, 0] = np.arange(1, 10)
        windows_expected[1, :, 1] = np.arange(11, 20)

        # original columns pass through; the two new columns are NaN until
        # a full window is available, then carry the dummy value 9
        data_expected = np.full((10, 4 + 2), fill_value=np.nan)
        data_expected[:, :4] = data
        data_expected[-2:, 4:] = 9

        fields = [
            InputOutputField(inputField='abc', outputField='hello'),
            InputOutputField(inputField='xyz', outputField='world'),
        ]
        agg = DummyAggreagor(sequence=9, generate=fields)

        input_format = StandardDataFormat(
            timestamps=helper_data.generate_timestamps(10,
                                                       samples=data.shape[0]),
            labels=['abc', 'xyz', 'aaa', 'bbb'],
            data=data)
        result = agg.process(input_format)

        assert_array_equal(agg.grouped_data, windows_expected)
        self.assertEqual(result.labels,
                         ['abc', 'xyz', 'aaa', 'bbb', 'hello', 'world'])
        assert_array_equal(result.data, data_expected)
Example #6
0
    def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
        """Return a copy of the input with rows (and their timestamps) in random order."""
        # permutation(n) shuffles an arange(n) internally -> same RNG stream
        # as the explicit arange+shuffle it replaces
        row_order = np.random.permutation(processor_input.data.shape[0])
        return processor_input.modify_copy(
            timestamps=processor_input.timestamps[row_order],
            data=processor_input.data[row_order])
Example #7
0
    def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
        """Replace NaNs with the configured value, but only in the selected columns."""
        column_ids = LabelSelector(processor_input.labels).select(self.fields).indexes
        patched = processor_input.data.copy()
        for column_id in column_ids:
            column = patched[:, column_id]  # view into `patched`
            column[np.isnan(column)] = self.replacement
        return processor_input.modify_copy(data=patched)
Example #8
0
 def _process2d(self,
                processor_input: StandardDataFormat) -> StandardDataFormat:
     """Append one column derived from the timestamps to the data matrix."""
     stamps_series = pd.Series(processor_input.timestamps.copy())
     extracted_column = self._extractor(stamps_series).reshape(-1, 1)
     return StandardDataFormat(
         labels=processor_input.labels + [self.output_field],
         timestamps=processor_input.timestamps,
         data=np.hstack((processor_input.data, extracted_column)))
Example #9
0
 def _process2d(self,
                processor_input: StandardDataFormat) -> StandardDataFormat:
     """Interpolate missing values column-wise, filling forward up to `self.threshold` rows."""
     frame = pd.DataFrame(data=processor_input.data)
     interpolated = frame.interpolate(method=self.method,
                                      limit_direction='forward',
                                      limit=self.threshold).values
     return StandardDataFormat(labels=processor_input.labels,
                               timestamps=processor_input.timestamps,
                               data=interpolated)
Example #10
0
    def _process2d(self,
                   processor_input: StandardDataFormat) -> StandardDataFormat:
        """Drop every row that contains at least one NaN."""
        # keep only rows without any NaN entry
        keep = ~np.isnan(processor_input.data).any(axis=1)
        return StandardDataFormat(
            labels=processor_input.labels,
            timestamps=processor_input.timestamps[keep],
            data=processor_input.data[keep])
    def test_no_outliers(self):
        """Values inside the limits must pass through unchanged."""
        raw = np.array([[10, 30], [12, 25]], dtype="float64")
        raw.flags.writeable = False

        # noinspection PyArgumentList
        limit_configs: List[InputOutputLimits] = [
            InputOutputLimits(inputField="a")
        ]
        remover = OutlierRemover(generate=limit_configs)
        result = remover._process2d(
            StandardDataFormat(
                labels=['a', 'b'],
                data=raw,
                timestamps=helper_data.generate_timestamps(2, 2)))
        assert_array_equal(raw, result.data)
    def test_min_outlier(self):
        """Values below the configured minimum are replaced by NaN."""
        raw = np.array([[10, 30], [12, 25]], dtype="float64")
        raw.flags.writeable = False

        sdf = StandardDataFormat(
            labels=['a', 'b'],
            data=raw,
            timestamps=helper_data.generate_timestamps(2, 2))

        # 25 < 28, so cell (1, 1) must become NaN
        expected = raw.copy()
        expected[1, 1] = np.nan

        result = OutlierRemover(
            generate=[{'inputField': 'b', 'min': 28}])._process2d(sdf)
        assert_array_equal(expected, result.data)
    def test_column_dropper(self):
        """Dropping a column removes its label and its data column."""
        timestamps = np.arange(datetime(2019, 7, 2, 12, 0),
                               datetime(2019, 7, 2, 20, 0),
                               timedelta(minutes=15)).astype(datetime)
        timestamps.flags.writeable = False

        values = np.random.random((5, 3))
        values.flags.writeable = False

        source = StandardDataFormat(
            labels=["preis", "temperatur", "feuchtigkeit"],
            timestamps=timestamps[:5],
            data=values)

        result = ColumnDropper(columns=["temperatur"])._process2d(source)

        # column 1 ("temperatur") is gone, columns 0 and 2 remain
        self.assertListEqual(["preis", "feuchtigkeit"], result.labels)
        assert_array_equal(timestamps[:5], result.timestamps)
        assert_array_equal(source.data[:, [0, 2]], result.data)
Example #14
0
    def test_standard_case(self):
        """Rows containing any NaN are removed; the leading clean rows stay."""
        values = np.array([
            [23, 55],
            [21, 52],
            [np.nan, 52],
            [23, np.nan],
        ])
        values.flags.writeable = False

        sdf = StandardDataFormat(
            data=values,
            labels=['a', 'b'],
            timestamps=helper_data.generate_timestamps(2, samples=4))
        result = NanRemover()._process2d(sdf)
        # only the first two rows are NaN-free
        assert_array_equal(values[:2], result.data)
    def test_standard_case(self):
        """Values repeated beyond `max_freezed_values` times become NaN."""
        values = np.array([[11, 20], [12, 21], [13, 22], [14, 20], [15, 20],
                           [16, 20], [17, 20], [18, 20]],
                          dtype='float')

        # column 1 repeats the value 20 from row 3 onwards; entries beyond
        # the allowed three repetitions must be masked
        expected = np.array(
            [[11, 20], [12, 21], [13, 22], [14, 20], [15, 20], [16, 20],
             [17, np.nan], [18, np.nan]],
            dtype='float')

        sdf = StandardDataFormat(
            timestamps=helper_data.generate_timestamps(samples=values.shape[0]),
            labels=helper_data.get_labels(2),
            data=values)
        result = FreezedValueRemover(max_freezed_values=3)._process2d(sdf)
        assert_array_equal(expected, result.data)
    def test_range_encoder(self):
        """A single column is one-hot encoded over the range [0, 3)."""
        encoder = RangeEncoder(
            generate=[{'inputField': 'hour', 'outputField': 'hourOneHot'}],
            value_from=0,
            value_to=3)

        values = transform_to_2d_matrix(np.array([0, 1, 1, 2]))
        # one-hot rows for the input values 0, 1, 1, 2
        expected = np.zeros((4, 3))
        expected[0, 0] = 1
        expected[1, 1] = 1
        expected[2, 1] = 1
        expected[3, 2] = 1

        result = encoder._process2d(
            StandardDataFormat(data=values,
                               labels=['hour'],
                               timestamps=generate_timestamps(
                                   samples=expected.shape[0])))
        assert_array_equal(expected, result.data)
Example #17
0
class TestStandardDataFormat(unittest.TestCase):
    """Round-trip tests for StandardDataFormat <-> pandas DataFrame."""

    def setUp(self) -> None:
        np.random.seed(1)  # deterministic fixture data
        minute_stamps = np.array(
            [datetime(2019, 7, 2, 12, minute) for minute in range(5)])
        self.sdf = StandardDataFormat(
            labels=['a', 'b'],
            timestamps=minute_stamps,
            data=np.round(np.random.random((5, 2)) * 100))

    def test_to_dataframe(self):
        """Labels become columns, timestamps the index, data the values."""
        df = self.sdf.to_dataframe()
        self.assertEqual(self.sdf.labels, df.columns.values.tolist())
        assert_array_equal(self.sdf.timestamps, df.index.values)
        assert_array_equal(self.sdf.data, df.values)
Example #18
0
    def _process2d(self,
                   processor_input: StandardDataFormat) -> StandardDataFormat:
        """Scale the selected columns, fitting a new scaler when no state exists.

        During training a fresh scaler instance is created, fitted and stored
        in `self.state`; later runs restore that state and only transform.
        """
        selected = ColumnSelector(
            self.fields).process(processor_input).data

        if self.state:
            # restored state: apply the previously fitted transformer
            scaled = self.state.transform(selected)
        else:
            scaler = create_instance(qualified_name=self.scaler,
                                     kwargs=self.kwargs)
            scaled = scaler.fit_transform(selected)
            # note: state will be saved only during training process
            # each training has it unique identificator which is associated with state data
            self.state = scaler

        selection = LabelSelector(
            elements=processor_input.labels).select(self.fields)
        patched = processor_input.data.copy()
        patched[:, selection.indexes] = scaled

        return processor_input.modify_copy(data=patched)
    def get(self) -> StandardDataFormat:
        """Fetch the configured fields and return them as aliased columns.

        The raw result is validated, reordered to match the requested field
        order and relabelled with the field aliases.
        """
        required_fields = self._get_fields()
        raw = self._fetch(required_fields)

        AbstractDatasourceAdapter._check_fields_availability(
            raw, required_fields, using_alias=self.source_returns_alias)

        # the source may label its columns with aliases or with raw names
        if self.source_returns_alias:
            wanted = [f.alias for f in required_fields]
        else:
            wanted = [f.name for f in required_fields]
        ix_selection = LabelSelector(elements=raw.labels).select(
            selection=wanted).indexes

        # ensure ordering of columns are correct
        return StandardDataFormat(
            timestamps=raw.timestamps,
            labels=[f.alias for f in required_fields],
            data=raw.data[:, ix_selection])
Example #20
0
    def test_multi_aggregation(self):
        """Two aggregators run on the same window and each appends a column."""
        step = timedelta(minutes=5)
        start_date = datetime(2019, 7, 1, 12, 1)

        data = StandardDataFormat(
            labels=["temp1", "temp2"],
            timestamps=np.arange(start_date, start_date + 10 * step,
                                 step).astype(datetime),
            data=np.array([np.arange(0, 10),
                           np.arange(10, 20)]).T)
        data.timestamps.flags.writeable = False
        data.data.flags.writeable = False

        multi = MultiAggregation(
            sequence=5,  # todo: use sequence instead minutes
            instances=[
                Max(sequence=5,
                    generate=[{
                        "inputField": "temp1",
                        "outputField": "temp1Max"
                    }]),
                Min(sequence=5,
                    generate=[{
                        "inputField": "temp1",
                        "outputField": "temp1Min"
                    }])
            ])
        multi.instances[0].id = "0"
        multi.instances[1].id = "1"

        result = PipelineExecutor(pipeline=[multi]).execute(data=data)

        self.assertEqual(['temp1', 'temp2', 'temp1Max', 'temp1Min'],
                         result.labels)
        assert_array_equal(data.timestamps, result.timestamps)
        # rows before the first full window carry NaN in both new columns
        assert_array_equal(np.full((4, 2), fill_value=np.nan),
                           result.data[:4, [2, 3]])
        assert_array_equal(np.arange(4, 10), result.data[4:, 2])
        assert_array_equal(np.arange(6), result.data[4:, 3])
    def test_range_encoder_two_cols(self):
        """Only the configured column is encoded; other columns are kept."""
        encoder = RangeEncoder(
            generate=[{'inputField': 'hour', 'outputField': 'hourOneHot'}],
            value_from=0,
            value_to=3)

        values = np.array([[0, 1, 1, 2], [1, 2, 3, 4]]).T

        # column 0: untouched 'abc' column, columns 1-3: one-hot of 'hour'
        expected = np.zeros((4, 4))
        expected[:, 0] = np.array([1, 2, 3, 4])
        expected[0, 1] = 1
        expected[1, 2] = 1
        expected[2, 2] = 1
        expected[3, 3] = 1

        result = encoder._process2d(
            StandardDataFormat(data=values,
                               labels=['hour', 'abc'],
                               timestamps=generate_timestamps(
                                   samples=expected.shape[0])))
        assert_array_equal(expected, result.data)
        self.assertEqual("abc", result.labels[0])
        self.assertEqual("hourOneHot$0", result.labels[1])
        self.assertEqual("hourOneHot$1", result.labels[2])
        self.assertEqual("hourOneHot$2", result.labels[3])
Example #22
0
    def test_resampling(self):
        """Resampling to 1min inserts NaN rows for the missing timestamps."""
        timestamps = np.array([
            datetime(2019, 7, 2, 12, 0),
            datetime(2019, 7, 2, 12, 3),
        ])
        timestamps.flags.writeable = False

        values = np.random.random((2, 3)) * 100
        values.flags.writeable = False

        source = StandardDataFormat(
            labels=["temperatur", "feuchtigkeit", "preis"],
            data=values,
            timestamps=timestamps)

        result = Resampler(freq="1min")._process2d(source)

        # expected rows: 0, 1, 2, 3 => 4 rows
        expected_timestamps = np.arange(datetime(2019, 7, 2, 12, 0),
                                        datetime(2019, 7, 2, 12, 4),
                                        timedelta(minutes=1))

        self.assertListEqual(result.labels,
                             ["temperatur", "feuchtigkeit", "preis"])
        self.assertTupleEqual(result.data.shape,
                              (expected_timestamps.shape[0], 3))

        for expected, actual in zip(expected_timestamps, result.timestamps):
            self.assertEqual(expected, actual)

        # original rows survive at both ends, the gap is filled with NaN
        assert_array_equal(values[0], result.data[0])
        assert_array_equal(values[-1], result.data[-1])
        assert_array_equal(result.data[1:-1], np.full((2, 3),
                                                      fill_value=np.nan))
Example #23
0
    def _process2d(self,
                   processor_input: StandardDataFormat) -> StandardDataFormat:
        """Set outlier cells in the configured columns to NaN.

        Delegates outlier detection to `Outlier.affected_index`, which
        operates on 3D grouped data; the 2D input is therefore wrapped in a
        single-group 3D masked array first, and the group axis is removed
        from the returned index again.
        """
        fields_in = [
            ConfigReader.from_dict(g).get_or_error(
                key="inputField", context="OutlierRemover Config")
            for g in self.generate
        ]
        cols_selected = LabelSelector(elements=processor_input.labels).select(
            selection=fields_in).indexes

        # we expand the feature-array to a 3D-array with only one entry (the 2D-array)
        grouped_data = np.ma.array(
            np.expand_dims(processor_input.data[:, cols_selected], axis=0))

        # sequence is irrelevant here (no windowing) -> np.nan placeholder;
        # presumably Outlier ignores it for affected_index — TODO confirm
        affected_index = Outlier(sequence=np.nan, generate=self.generate)\
            .affected_index(grouped_data=grouped_data)
        # drop the artificial group axis again -> 2D boolean index
        affected_index = np.squeeze(affected_index, axis=0)

        # fancy indexing `data[:, cols_selected]` yields a copy, so write the
        # NaNs into that copy and assign the whole column block back
        data = processor_input.data.copy()
        t = data[:, cols_selected]
        t[affected_index] = np.nan
        data[:, cols_selected] = t

        return processor_input.modify_copy(data=data)
 def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
     """Mask values that stayed frozen beyond the allowed repetition count with NaN."""
     frozen_mask = get_freezed_value_mask(processor_input.data,
                                          self.max_freezed_values)
     patched = processor_input.data.copy()
     patched[frozen_mask] = np.nan
     return processor_input.modify_copy(data=patched)
Example #25
0
import datetime
import unittest

import numpy as np
from numpy.testing import assert_array_equal

from mlpipe.processors.standard_data_format import StandardDataFormat
from mlpipe.processors.time_extractor import TimeExtractor

# shared fixture: two timestamps with one data column
test_data = StandardDataFormat(
    timestamps=np.array([
        datetime.datetime(2019, 7, 2, 12, 25),  # tuesday (weekday 1)
        # saturday (weekday 5) — the original comment said "wednesday",
        # but 2019-08-03 is a Saturday (the weekday test expects 5)
        datetime.datetime(2019, 8, 3, 20, 45),
    ]),
    data=np.arange(2).reshape(-1, 1),
    labels=["example_data"])


class TestTimeExtractor(unittest.TestCase):
    def test_extract_hours(self):
        # expected hours are 12 and 21 for inputs 12:25 and 20:45 —
        # the extractor appears to round to the nearest hour (20:45 -> 21);
        # TODO(review): confirm TimeExtractor's rounding behaviour
        result_expected = np.hstack(
            (test_data.data, np.array([12, 21]).reshape(-1, 1)))
        result = TimeExtractor(
            extract="hour", outputField="example_hour")._process2d(test_data)
        assert_array_equal(result_expected, result.data)

    def test_extract_weekday(self):
        # note weekday start from monday. 0 = monday
        result_expected = np.hstack(
            (test_data.data, np.array([1, 5]).reshape(-1, 1)))
        result = TimeExtractor(