def setUp(self) -> None:
    """Build a deterministic 5x2 StandardDataFormat fixture (seeded RNG)."""
    np.random.seed(1)
    # five consecutive minute timestamps starting 2019-07-02 12:00
    stamps = np.array([datetime(2019, 7, 2, 12, m) for m in range(5)])
    self.sdf = StandardDataFormat(
        labels=['a', 'b'],
        timestamps=stamps,
        data=np.round(np.random.random((5, 2)) * 100),
    )
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """One-hot encode the configured input columns and append the encoded
    columns (labelled ``<outputField>$<category>``) after dropping the
    original input columns."""
    self.state = cast(skpp.OneHotEncoder, self.state)  # just for ide

    input_fields = [cfg['inputField'] for cfg in self.generate]
    output_fields = [cfg['outputField'] for cfg in self.generate]

    selected = ColumnSelector(columns=input_fields).process(
        processor_input=processor_input).data
    selected = _ensure_2d(selected)

    self._init_encoder()
    encoded = self.state.transform(selected).toarray()

    # one label per (output field, category value), e.g. "hour$3"
    encoded_labels = [
        f"{output_fields[col]}${value}"
        for col, categories in enumerate(self.state.categories_)
        for value in categories.tolist()
    ]

    encoded_part = StandardDataFormat(
        timestamps=processor_input.timestamps,
        data=encoded,
        labels=encoded_labels)
    remainder = ColumnDropper(columns=input_fields).process(
        processor_input=processor_input)
    return remainder.add_cols(encoded_part)
def get(self) -> StandardDataFormat:
    """Return the source data, transparently reading/writing a pickle cache.

    The cache key is a SHA-256 over the JSON-serialized (key-sorted) source
    description, so identical configurations share one cache file.
    """
    if not AppConfig.get_config_or_default('general.enable_cache', default=True):
        self.logger.info("caching disabled")
        return self._get()

    import json
    import os

    logger = self.logger
    serialized = json.dumps(self.source_description, sort_keys=True)
    cache_id = hashlib.sha256(serialized.encode('utf-8')).hexdigest()
    logger.info(
        "caching source is enabled. Cache-Id is {0}".format(cache_id))

    cache_dir = AppConfig['general.dir_cache']
    path_to_cache = os.path.join(cache_dir, "cache_{0}".format(cache_id))
    logger.info(f"path for cache is: {path_to_cache}")

    if os.path.isfile(path_to_cache):
        # cache hit: rehydrate from the pickled dataframe
        logger.info(MSG_CACHED_VERSION_FOUND.format(path_to_cache))
        return StandardDataFormat.from_dataframe(pd.read_pickle(path_to_cache))

    # cache miss: fetch, then persist for next time
    os.makedirs(cache_dir, exist_ok=True)
    logger.info(MSG_CACHED_VERSION_NOT_FOUND)
    data = self._get()
    data.to_dataframe().to_pickle(path_to_cache)
    return data
def _fetch(self, _fields: List[Field]) -> StandardDataFormat:
    """Download one series per field, align each on a minute-rounded
    timestamp index, drop duplicated index entries, and merge everything
    into a single StandardDataFormat.

    Fix: the previous duplicate check used ``len(dups)``, which is the row
    count of the boolean mask (truthy for any non-empty series, so the
    drop branch and its log message fired on every field) and logged the
    row count as the number of duplicates. We now count actual duplicated
    index entries via ``dups.sum()``. Also fixes the "authentification"
    typo in the log message.
    """
    self.logger.info(
        f"getting data from time period {self.date_from.isoformat()} - {self.date_to.isoformat()}"
    )
    self.logger.info(
        f"using username for authentication {self.session.auth.username}"
    )
    series = []
    for field in _fields:
        self.logger.info(
            f"getting data for field: {field.name} aka. {field.alias}")
        url = self._get_url(sensor_id=field.name,
                            date_from=self.date_from,
                            date_to=self.date_to)
        self.logger.debug(f"download data from: {url}")

        serie = pd.read_json(self._get_data(url, field))
        serie = serie.set_index('timestamp')
        # align all series on whole minutes so concat can match rows
        serie.index = serie.index.round("T")
        serie = serie.rename({"value": field.name}, axis=1)

        dups = serie.index.duplicated()
        n_dups = int(dups.sum())  # actual number of duplicated index entries
        if n_dups:
            self.logger.info(
                f"dropping {n_dups} duplicated (index) measurements from {field}"
            )
            serie = serie[~dups]
        series.append(serie)
        self.logger.debug(f"received rows: {len(serie)}")

    df = pd.concat(series, axis=1)
    return StandardDataFormat.from_dataframe(df)
def test_aggregation_call(self):
    """Aggregator sees two sliding windows of length 9 over the first two
    columns; output appends two columns filled only for the last rows."""
    raw = np.random.random((10, 4))
    raw[:, 0] = np.arange(0, 10)
    raw[:, 1] = np.arange(10, 20)
    raw.flags.writeable = False

    expected_groups = np.zeros((2, 9, 2))
    expected_groups[0, :, 0] = np.arange(0, 9)
    expected_groups[0, :, 1] = np.arange(10, 19)
    expected_groups[1, :, 0] = np.arange(1, 10)
    expected_groups[1, :, 1] = np.arange(11, 20)

    expected_output = np.full((10, 6), fill_value=np.nan)
    expected_output[:, :4] = raw
    expected_output[-2:, 4:] = 9

    aggregator = DummyAggreagor(
        sequence=9,
        generate=[
            InputOutputField(inputField='abc', outputField='hello'),
            InputOutputField(inputField='xyz', outputField='world'),
        ])
    source = StandardDataFormat(
        timestamps=helper_data.generate_timestamps(10, samples=raw.shape[0]),
        labels=['abc', 'xyz', 'aaa', 'bbb'],
        data=raw)

    result = aggregator.process(source)
    assert_array_equal(aggregator.grouped_data, expected_groups)
    self.assertEqual(result.labels,
                     ['abc', 'xyz', 'aaa', 'bbb', 'hello', 'world'])
    assert_array_equal(result.data, expected_output)
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Return a copy with rows (timestamps and data together) in random order."""
    # permutation(n) shuffles arange(n) with the same RNG draw as shuffle()
    order = np.random.permutation(processor_input.data.shape[0])
    return processor_input.modify_copy(
        timestamps=processor_input.timestamps[order],
        data=processor_input.data[order])
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Replace NaN cells in the configured columns with ``self.replacement``."""
    indexes = LabelSelector(processor_input.labels).select(self.fields).indexes
    patched = processor_input.data.copy()
    for column in indexes:
        nan_rows = np.isnan(patched[:, column])
        patched[nan_rows, column] = self.replacement
    return processor_input.modify_copy(data=patched)
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Append one column derived from the timestamps via ``self._extractor``."""
    extracted = self._extractor(
        pd.Series(processor_input.timestamps.copy())).reshape(-1, 1)
    return StandardDataFormat(
        labels=processor_input.labels + [self.output_field],
        timestamps=processor_input.timestamps,
        data=np.hstack((processor_input.data, extracted)))
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Interpolate gaps column-wise, forward only, up to ``self.threshold``
    consecutive rows, using pandas' ``interpolate``."""
    frame = pd.DataFrame(data=processor_input.data)
    interpolated = frame.interpolate(
        method=self.method, limit_direction='forward', limit=self.threshold)
    return StandardDataFormat(
        labels=processor_input.labels,
        timestamps=processor_input.timestamps,
        data=interpolated.values)
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Drop every row that contains at least one NaN value."""
    keep = ~np.isnan(processor_input.data).any(axis=1)
    return StandardDataFormat(
        labels=processor_input.labels,
        timestamps=processor_input.timestamps[keep],
        data=processor_input.data[keep])
def test_no_outliers(self):
    """Without min/max limits the data must pass through unchanged."""
    values = np.array([[10, 30], [12, 25]], dtype="float64")
    values.flags.writeable = False
    source = StandardDataFormat(
        labels=['a', 'b'],
        data=values,
        timestamps=helper_data.generate_timestamps(2, 2))
    # noinspection PyArgumentList
    configs: List[InputOutputLimits] = [InputOutputLimits(inputField="a")]
    result = OutlierRemover(generate=configs)._process2d(source)
    assert_array_equal(values, result.data)
def test_min_outlier(self):
    """A value below the configured minimum becomes NaN."""
    values = np.array([[10, 30], [12, 25]], dtype="float64")
    values.flags.writeable = False
    source = StandardDataFormat(
        labels=['a', 'b'],
        data=values,
        timestamps=helper_data.generate_timestamps(2, 2))
    limits = [{'inputField': 'b', 'min': 28}]

    # 25 in column 'b' is below the minimum of 28 -> replaced by NaN
    expected = values.copy()
    expected[1, 1] = np.nan

    result = OutlierRemover(generate=limits)._process2d(source)
    assert_array_equal(expected, result.data)
def test_column_dropper(self):
    """Dropping one column removes its label and data column, keeps the rest."""
    all_timestamps = np.arange(datetime(2019, 7, 2, 12, 0),
                               datetime(2019, 7, 2, 20, 0),
                               timedelta(minutes=15)).astype(datetime)
    all_timestamps.flags.writeable = False
    values = np.random.random((5, 3))
    values.flags.writeable = False

    source = StandardDataFormat(
        labels=["preis", "temperatur", "feuchtigkeit"],
        timestamps=all_timestamps[:5],
        data=values)
    expected = StandardDataFormat(
        labels=["preis", "feuchtigkeit"],
        timestamps=all_timestamps[:5],
        data=source.data[:, [0, 2]])

    result = ColumnDropper(columns=["temperatur"])._process2d(source)
    self.assertListEqual(expected.labels, result.labels)
    assert_array_equal(expected.timestamps, result.timestamps)
    assert_array_equal(expected.data, result.data)
def test_standard_case(self):
    """Rows containing any NaN are dropped; NaN-free rows survive."""
    values = np.array([
        [23, 55],
        [21, 52],
        [np.nan, 52],
        [23, np.nan],
    ])
    values.flags.writeable = False
    # only the first two rows are free of NaNs
    result_expected = values[:2]
    source = StandardDataFormat(
        data=values,
        labels=['a', 'b'],
        timestamps=helper_data.generate_timestamps(2, samples=4))
    result = NanRemover()._process2d(source)
    assert_array_equal(result_expected, result.data)
def test_standard_case(self):
    """Values frozen for more than ``max_freezed_values`` rows become NaN."""
    values = np.array([[11, 20], [12, 21], [13, 22], [14, 20], [15, 20],
                       [16, 20], [17, 20], [18, 20]], dtype='float')
    # column 1 is stuck at 20 from row 3 on; rows beyond the 3-row
    # tolerance are blanked out
    expected = np.array([[11, 20], [12, 21], [13, 22], [14, 20], [15, 20],
                         [16, 20], [17, np.nan], [18, np.nan]], dtype='float')
    source = StandardDataFormat(
        timestamps=helper_data.generate_timestamps(samples=values.shape[0]),
        labels=helper_data.get_labels(2),
        data=values)
    result = FreezedValueRemover(max_freezed_values=3)._process2d(source)
    assert_array_equal(expected, result.data)
def test_range_encoder(self):
    """Values 0..2 are one-hot encoded into 3 columns."""
    configs = [{'inputField': 'hour', 'outputField': 'hourOneHot'}]
    encoder = RangeEncoder(generate=configs, value_from=0, value_to=3)
    values = transform_to_2d_matrix(np.array([0, 1, 1, 2]))

    expected = np.zeros((4, 3))
    for row, col in enumerate([0, 1, 1, 2]):
        expected[row, col] = 1

    result = encoder._process2d(
        StandardDataFormat(
            data=values,
            labels=['hour'],
            timestamps=generate_timestamps(samples=expected.shape[0])))
    assert_array_equal(expected, result.data)
class TestStandardDataFormat(unittest.TestCase):
    """Round-trip checks between StandardDataFormat and pandas DataFrame."""

    def setUp(self) -> None:
        np.random.seed(1)
        # five consecutive minute timestamps with random 0..100 values
        stamps = np.array([datetime(2019, 7, 2, 12, m) for m in range(5)])
        self.sdf = StandardDataFormat(
            labels=['a', 'b'],
            timestamps=stamps,
            data=np.round(np.random.random((5, 2)) * 100))

    def test_to_dataframe(self):
        """Labels become columns, timestamps the index, data the values."""
        frame = self.sdf.to_dataframe()
        self.assertEqual(self.sdf.labels, frame.columns.values.tolist())
        assert_array_equal(self.sdf.timestamps, frame.index.values)
        assert_array_equal(self.sdf.data, frame.values)
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Scale the configured columns in place, fitting and storing a new
    scaler when no previously saved state exists."""
    subset = ColumnSelector(self.fields).process(processor_input).data

    restored = self.state
    if restored:
        # inference path: reuse the transformer fitted during training
        subset = restored.transform(subset)
    else:
        scaler = create_instance(qualified_name=self.scaler,
                                 kwargs=self.kwargs)
        subset = scaler.fit_transform(subset)
        # note: state will be saved only during training process
        # each training has it unique identificator which is associated with state data
        self.state = scaler

    col_indexes = LabelSelector(
        elements=processor_input.labels).select(self.fields).indexes
    merged = processor_input.data.copy()
    merged[:, col_indexes] = subset
    return processor_input.modify_copy(data=merged)
def get(self) -> StandardDataFormat:
    """Fetch the required fields and return them relabelled to their
    aliases, with columns reordered to match the field configuration."""
    required_fields = self._get_fields()
    raw = self._fetch(required_fields)
    AbstractDatasourceAdapter._check_fields_availability(
        raw, required_fields, using_alias=self.source_returns_alias)

    # the source may label its columns by alias or by raw sensor name
    if self.source_returns_alias:
        wanted = [f.alias for f in required_fields]
    else:
        wanted = [f.name for f in required_fields]
    ix_selection = LabelSelector(elements=raw.labels).select(
        selection=wanted).indexes

    # ensure ordering of columns are correct
    return StandardDataFormat(
        timestamps=raw.timestamps,
        labels=[f.alias for f in required_fields],
        data=raw.data[:, ix_selection])
def test_multi_aggregation(self):
    """Max and Min aggregations over the same input column are appended as
    two new columns; the first sequence-1 rows stay NaN."""
    step = timedelta(minutes=5)
    start = datetime(2019, 7, 1, 12, 1)
    source = StandardDataFormat(
        labels=["temp1", "temp2"],
        timestamps=np.arange(start, start + 10 * step, step).astype(datetime),
        data=np.array([np.arange(0, 10), np.arange(10, 20)]).T)
    source.timestamps.flags.writeable = False
    source.data.flags.writeable = False

    multi = MultiAggregation(
        sequence=5,  # todo: use sequence instead minutes
        instances=[
            Max(sequence=5, generate=[{
                "inputField": "temp1",
                "outputField": "temp1Max"
            }]),
            Min(sequence=5, generate=[{
                "inputField": "temp1",
                "outputField": "temp1Min"
            }]),
        ])
    multi.instances[0].id = "0"
    multi.instances[1].id = "1"

    result = PipelineExecutor(pipeline=[multi]).execute(data=source)
    self.assertEqual(['temp1', 'temp2', 'temp1Max', 'temp1Min'],
                     result.labels)
    assert_array_equal(source.timestamps, result.timestamps)
    # first 4 rows have no full 5-row window yet
    assert_array_equal(np.full((4, 2), fill_value=np.nan),
                       result.data[:4, [2, 3]])
    assert_array_equal(np.arange(4, 10), result.data[4:, 2])
    assert_array_equal(np.arange(6), result.data[4:, 3])
def test_range_encoder_two_cols(self):
    """Only the configured column is encoded; the other column moves to
    the front and keeps its values."""
    configs = [{'inputField': 'hour', 'outputField': 'hourOneHot'}]
    encoder = RangeEncoder(generate=configs, value_from=0, value_to=3)
    values = np.array([[0, 1, 1, 2], [1, 2, 3, 4]]).T

    # column 0 keeps the untouched 'abc' data, columns 1..3 are one-hot
    expected = np.zeros((4, 4))
    expected[:, 0] = np.array([1, 2, 3, 4])
    for row, hour in enumerate([0, 1, 1, 2]):
        expected[row, 1 + hour] = 1

    result = encoder._process2d(
        StandardDataFormat(
            data=values,
            labels=['hour', 'abc'],
            timestamps=generate_timestamps(samples=expected.shape[0])))
    assert_array_equal(expected, result.data)
    self.assertEqual("abc", result.labels[0])
    self.assertEqual("hourOneHot$0", result.labels[1])
    self.assertEqual("hourOneHot$1", result.labels[2])
    self.assertEqual("hourOneHot$2", result.labels[3])
def test_resampling(self):
    """Resampling to 1-minute frequency inserts NaN rows for the gap
    between 12:00 and 12:03 while keeping the original rows intact."""
    row_count = 2
    stamps = np.array([
        datetime(2019, 7, 2, 12, 0),
        datetime(2019, 7, 2, 12, 3),
    ])
    stamps.flags.writeable = False
    values = np.random.random((row_count, 3)) * 100
    values.flags.writeable = False

    source = StandardDataFormat(
        labels=["temperatur", "feuchtigkeit", "preis"],
        data=values,
        timestamps=stamps)
    result = Resampler(freq="1min")._process2d(source)

    # expected rows: 0, 1, 2, 3 => 4 rows
    expected_stamps = np.arange(datetime(2019, 7, 2, 12, 0),
                                datetime(2019, 7, 2, 12, 4),
                                timedelta(minutes=1))
    self.assertListEqual(result.labels,
                         ["temperatur", "feuchtigkeit", "preis"])
    self.assertTupleEqual(result.data.shape,
                          (expected_stamps.shape[0], 3))
    for i, expected_stamp in enumerate(expected_stamps):
        self.assertEqual(expected_stamp, result.timestamps[i])
    assert_array_equal(values[0], result.data[0])
    assert_array_equal(values[-1], result.data[-1])
    assert_array_equal(result.data[1:-1], np.full((2, 3), fill_value=np.nan))
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Set values outside the configured per-field limits to NaN, reusing
    the Outlier aggregator's index computation on a single-window view."""
    input_fields = [
        ConfigReader.from_dict(g).get_or_error(
            key="inputField", context="OutlierRemover Config")
        for g in self.generate
    ]
    col_indexes = LabelSelector(elements=processor_input.labels).select(
        selection=input_fields).indexes

    # we expand feature-array to a 3D-array holding a single 2D entry so
    # the windowed Outlier implementation can be reused unchanged
    grouped = np.ma.array(
        np.expand_dims(processor_input.data[:, col_indexes], axis=0))
    mask3d = Outlier(sequence=np.nan, generate=self.generate) \
        .affected_index(grouped_data=grouped)
    mask = np.squeeze(mask3d, axis=0)

    result = processor_input.data.copy()
    selection = result[:, col_indexes]
    selection[mask] = np.nan
    result[:, col_indexes] = selection
    return processor_input.modify_copy(data=result)
def _process2d(self, processor_input: StandardDataFormat) -> StandardDataFormat:
    """Blank out values that stayed frozen longer than the allowed run length."""
    frozen = get_freezed_value_mask(processor_input.data,
                                    self.max_freezed_values)
    cleaned = processor_input.data.copy()
    cleaned[frozen] = np.nan
    return processor_input.modify_copy(data=cleaned)
import datetime import unittest import numpy as np from numpy.testing import assert_array_equal from mlpipe.processors.standard_data_format import StandardDataFormat from mlpipe.processors.time_extractor import TimeExtractor test_data = StandardDataFormat( timestamps=np.array([ datetime.datetime(2019, 7, 2, 12, 25), # tuesday datetime.datetime(2019, 8, 3, 20, 45), # wednesday ]), data=np.arange(2).reshape(-1, 1), labels=["example_data"]) class TestTimeExtractor(unittest.TestCase): def test_extract_hours(self): result_expected = np.hstack( (test_data.data, np.array([12, 21]).reshape(-1, 1))) result = TimeExtractor( extract="hour", outputField="example_hour")._process2d(test_data) assert_array_equal(result_expected, result.data) def test_extract_weekday(self): # note weekday start from monday. 0 = monday result_expected = np.hstack( (test_data.data, np.array([1, 5]).reshape(-1, 1))) result = TimeExtractor(