Example #1
0
    def __init__(self,
                 input_steps: Optional[Dict[str, "BaseStep"]] = None,
                 targets: Optional[Dict[str, "BaseStep"]] = None,
                 condition=None,
                 computation_mode=ComputationMode.Default,
                 name="BaseStep"):
        """
        Initialise the bookkeeping state of a step.

        :param input_steps: The input steps keyed by name. A new empty dict is used if None is passed.
        :param targets: The target steps keyed by name. A new empty dict is used if None is passed.
        :param condition: Stored unchanged as ``self.condition``.
        :param computation_mode: The computation mode of the default run setting of this step.
        :param name: The name of the step. It also prefixes the names of the two timing summaries.
        """
        # Replace missing mappings by fresh dicts, so that no dict is shared between steps.
        if input_steps is None:
            input_steps = dict()
        if targets is None:
            targets = dict()

        # Identity and wiring of the step.
        self.name = name
        self.id = -1
        self.last = True
        self.condition = condition
        self.input_steps: Dict[str, "BaseStep"] = input_steps
        self.targets: Dict[str, "BaseStep"] = targets

        # The current run setting starts as a clone of the default one.
        self.default_run_setting = RunSetting(
            computation_mode=computation_mode)
        self.current_run_setting = self.default_run_setting.clone()

        # State of the current run.
        self.finished = False
        self._current_end = None
        self.buffer: Dict[str, xr.DataArray] = {}
        self.cached_result = {"cached": None, "start": None, "end": None}

        # Timing summaries of the step.
        self.training_time = SummaryObjectList(
            self.name + " Training Time", category=SummaryCategory.FitTime)
        self.transform_time = SummaryObjectList(
            self.name + " Transform Time",
            category=SummaryCategory.TransformTime)
Example #2
0
    def test_set_run_setting(self):
        """The computation mode of the passed run setting becomes the current computation mode."""
        step = Step(MagicMock(), MagicMock(), MagicMock())

        # Set two different modes one after the other on the same step and check each of them.
        for mode in (ComputationMode.FitTransform, ComputationMode.Transform):
            step.set_run_setting(RunSetting(mode))

            assert step.current_run_setting.computation_mode == mode
Example #3
0
    def test_update_computation_mode_not_updatable(self):
        """Updating a Train or Transform run setting keeps the computation mode of that run setting."""
        # Pairs of (mode of the updated setting, mode of the setting passed to update).
        scenarios = (
            (ComputationMode.Train, ComputationMode.Transform),
            (ComputationMode.Transform, ComputationMode.Train),
        )
        for own_mode, passed_mode in scenarios:
            run_setting = RunSetting(computation_mode=own_mode)
            # NOTE(review): only the receiver is inspected here, the return value of update is
            # ignored -- confirm that this is the intended check.
            run_setting.update(RunSetting(computation_mode=passed_mode))
            self.assertEqual(run_setting.computation_mode, own_mode)
Example #4
0
    def _run(self, data: Union[pd.DataFrame,
                               xr.Dataset], mode: ComputationMode,
             summary: bool, summary_formatter: SummaryFormatter):
        """
        Reset all steps, apply the run setting and execute the pipeline on the data.

        :param data: The input data. A pandas DataFrame is converted to an xarray Dataset first.
                     Besides xarray Datasets, a dict of xr.DataArray objects is accepted, too.
        :param mode: The computation mode set on every step for this run.
        :param summary: If True, a tuple of result and summary is returned, otherwise only the result.
        :param summary_formatter: The formatter passed to the run setting and used to create the summary.
        :return: The result of transform, or (result, summary) if summary is True.
        :raises WrongParameterException: If data has an unsupported type or the dict contains values that
                                         are no xr.DataArray objects.
        """
        # Every step gets reset and receives its own RunSetting instance for this run.
        for step in self.id_to_step.values():
            step.reset()
            setting = RunSetting(computation_mode=mode,
                                 summary_formatter=summary_formatter)
            step.set_run_setting(setting)

        if isinstance(data, pd.DataFrame):
            data = data.to_xarray()

        # Bring the supported input types to the keyword arguments of transform.
        if isinstance(data, xr.Dataset):
            inputs = {key: data[key] for key in data.data_vars}
        elif isinstance(data, dict):
            if any(not isinstance(data[key], xr.DataArray) for key in data):
                raise WrongParameterException(
                    "Input Dict does not contain xr.DataArray objects.",
                    "Make sure to pass Dict[str, xr.DataArray].",
                    self.name)
            inputs = data
        else:
            raise WrongParameterException(
                "Unkown data type to pass to pipeline steps.",
                "Make sure to use pandas DataFrames, xarray Datasets, or Dict[str, xr.DataArray].",
                self.name)

        result = self.transform(**inputs)
        run_summary = self._create_summary(summary_formatter)
        if summary:
            return result, run_summary
        return result
Example #5
0
    def test_reset(self):
        """reset empties the buffer, clears the finished flag and restores the default run setting."""
        step = Step(MagicMock(), MagicMock(), MagicMock())
        # Put the step into a used state which reset has to undo.
        step.buffer = MagicMock()
        step.current_run_setting = RunSetting(computation_mode=ComputationMode.Transform)
        step.finished = True

        step.reset()

        # The mocked buffer has to be replaced by an empty one. A MagicMock never equals {},
        # so this assertion fails if reset leaves the buffer untouched.
        self.assertEqual(step.buffer, {})
        self.assertEqual(step.current_run_setting.computation_mode, ComputationMode.Default)
        self.assertFalse(step._should_stop(None, None))
        self.assertFalse(step.finished)
Example #6
0
class TestRunSetting(unittest.TestCase):
    """Tests for updating, cloning, saving and loading a RunSetting."""

    def setUp(self) -> None:
        # Every test starts with a run setting in the Default computation mode.
        default_mode = ComputationMode.Default
        self.run_setting = RunSetting(computation_mode=default_mode)

    def tearDown(self) -> None:
        # Drop the reference to the run setting of the finished test.
        self.run_setting = None

    def test_update(self):
        """Updating a Default run setting returns a run setting with the passed mode."""
        requested = RunSetting(computation_mode=ComputationMode.Train)

        updated = self.run_setting.update(requested)

        self.assertEqual(updated.computation_mode, ComputationMode.Train)

    def test_update_computation_mode_not_updatable(self):
        """Updating a Train or Transform run setting keeps the computation mode of that run setting."""
        # Pairs of (mode of the updated setting, mode of the setting passed to update).
        scenarios = (
            (ComputationMode.Train, ComputationMode.Transform),
            (ComputationMode.Transform, ComputationMode.Train),
        )
        for own_mode, passed_mode in scenarios:
            run_setting = RunSetting(computation_mode=own_mode)
            run_setting.update(RunSetting(computation_mode=passed_mode))
            self.assertEqual(run_setting.computation_mode, own_mode)

    def test_clone(self):
        """A clone has the same computation mode as the run setting it was cloned from."""
        cloned = self.run_setting.clone()

        self.assertEqual(cloned.computation_mode, self.run_setting.computation_mode)

    def test_save(self):
        """The Default computation mode is saved as the value 4."""
        stored = self.run_setting.save()

        self.assertEqual(stored, {"computation_mode": 4})

    def test_load(self):
        """The stored value 4 is loaded as the Default computation mode."""
        stored = {"computation_mode": 4}

        restored = RunSetting.load(stored)

        self.assertEqual(restored.computation_mode, ComputationMode.Default)
Example #7
0
    def test_batched_pipeline(self, concat_mock, fm_mock):
        """
        Run the pipeline in test mode on four days of data and check the requests sent to the step.

        The step is expected to be set to ComputationMode.Transform exactly once and to be asked for
        its results once per 24 hour window. The results are expected to be concatenated three times.

        :param concat_mock: Mock whose call count is checked; presumably patches the concat function
                            used by the pipeline -- the patch decorator is not part of this method.
        :param fm_mock: Mock injected by a patch decorator; not used in the test body.
        """
        time = pd.date_range('2000-01-01', freq='1H', periods=7)
        da = xr.DataArray([2, 3, 4, 3, 3, 1, 2],
                          dims=["time"],
                          coords={'time': time})

        # A mocked step which reports further elements four times and then stops.
        first_step = MagicMock()
        first_step.run_setting = RunSetting(ComputationMode.Default)
        first_step.finished = False
        first_step.further_elements.side_effect = [
            True, True, True, True, False
        ]
        first_step.get_result.return_value = {"one": da}

        # presumably the Timedelta is the batch size of the pipeline -- TODO confirm
        batch = pd.Timedelta("24h")
        self.pipeline.set_params(batch)
        self.pipeline.add(module=first_step)

        data = pd.DataFrame(
            {
                "test": [1, 2, 2, 3],
                "test2": [2, 2, 2, 2]
            },
            index=pd.DatetimeIndex(
                pd.date_range('2000-01-01', freq='24H', periods=4)))
        self.pipeline.test(data)

        # Assert that the computation mode is set to Transform when the pipeline is tested
        first_step.set_run_setting.assert_called_once()
        self.assertEqual(
            first_step.set_run_setting.call_args[0][0].computation_mode,
            ComputationMode.Transform)

        # One request per 24 hour window. The timestamps are created without the freq argument:
        # it does not take part in the comparison of timestamps and newer pandas versions reject it.
        first_day = pd.Timestamp('2000-01-01 00:00:00')
        calls = [
            call(first_day + number * batch,
                 first_day + (number + 1) * batch,
                 return_all=True) for number in range(4)
        ]
        first_step.get_result.assert_has_calls(calls, any_order=True)
        self.assertEqual(concat_mock.call_count, 3)
Example #8
0
    def load(cls, stored_step: Dict, inputs, targets, module, file_manager):
        """
        Load a stored step.

        :param stored_step: Information about the stored step. The keys "condition", "train_if",
                            "callbacks", "batch_size", "default_run_setting", "id", "name" and "last"
                            are read.
        :param inputs: The input step of the stored step
        :param targets: The target step of the stored step
        :param module: The module wrapped by this step
        :param file_manager: The filemanager passed to the step and to each restored callback
        :return: Step
        """
        def _unpickle(path):
            # NOTE(review): unpickling can execute arbitrary code -- make sure that the stored
            # files come from a trusted source.
            with open(path, 'rb') as pickle_file:
                return cloudpickle.load(pickle_file)

        # Condition and train_if are optional, a falsy entry means that nothing was stored.
        condition = None
        if stored_step["condition"]:
            condition = _unpickle(stored_step["condition"])
        train_if = None
        if stored_step["train_if"]:
            train_if = _unpickle(stored_step["train_if"])

        # Each restored callback gets the filemanager before it is collected.
        callbacks = []
        for callback_path in stored_step["callbacks"]:
            restored_callback = _unpickle(callback_path)
            restored_callback.set_filemanager(file_manager)
            callbacks.append(restored_callback)

        step = cls(module,
                   inputs,
                   targets=targets,
                   file_manager=file_manager,
                   condition=condition,
                   train_if=train_if,
                   callbacks=callbacks,
                   batch_size=stored_step["batch_size"])

        # Restore the state which is not passed via the constructor.
        restored_setting = RunSetting.load(stored_step["default_run_setting"])
        step.default_run_setting = restored_setting
        step.current_run_setting = restored_setting.clone()
        step.id = stored_step["id"]
        step.name = stored_step["name"]
        step.last = stored_step["last"]

        return step
Example #9
0
class BaseStep(ABC):
    """
    The base class of all steps.

    :param input_steps: The input steps
    :type input_steps: Optional[Dict[str, BaseStep]]
    :param targets: The target steps
    :type targets: Optional[Dict[str, BaseStep]]
    :param condition: A function which evaluates to False or True for detecting if the module should be executed.
                      It is called with the input and the target data of the step.
    :type condition: Callable
    :param computation_mode: The computation mode for this module
    :type computation_mode: ComputationMode
    :param name: The name of the step. It is used as key of single results and as prefix of the timing summaries.
    :type name: str
    """
    def __init__(self,
                 input_steps: Optional[Dict[str, "BaseStep"]] = None,
                 targets: Optional[Dict[str, "BaseStep"]] = None,
                 condition=None,
                 computation_mode=ComputationMode.Default,
                 name="BaseStep"):
        # The current run setting starts as a clone of the default one and is restored by reset.
        self.default_run_setting = RunSetting(
            computation_mode=computation_mode)
        self.current_run_setting = self.default_run_setting.clone()
        self.input_steps: Dict[
            str, "BaseStep"] = dict() if input_steps is None else input_steps
        self.targets: Dict[str,
                           "BaseStep"] = dict() if targets is None else targets
        self.condition = condition
        # The result of the last computation together with the requested time window.
        self.cached_result = {"cached": None, "start": None, "end": None}

        self.name = name

        self.id = -1
        self.finished = False
        self.last = True
        # The end of the last time window for which _compute was called.
        self._current_end = None
        self.buffer: Dict[str, xr.DataArray] = {}
        self.training_time = SummaryObjectList(
            self.name + " Training Time", category=SummaryCategory.FitTime)
        self.transform_time = SummaryObjectList(
            self.name + " Transform Time",
            category=SummaryCategory.TransformTime)

    def get_result(self,
                   start: pd.Timestamp,
                   end: Optional[pd.Timestamp],
                   buffer_element: Optional[str] = None,
                   return_all=False):
        """
        This method is responsible for providing the result of this step.
        Therefore,
        this method triggers the get_input and get_target data methods.
        Additionally, it triggers the computations and checks if all data are processed.

        :param start: The start date of the requested results of the step
        :type start: pd.Timestamp
        :param end: The end date of the requested results of the step (exclusive)
        :type end: Optional[pd.Timestamp]
        :param buffer_element: if the buffer of the step contains multiple results, this determines the result which is
                               returned.
        :type buffer_element: Optional[str]
        :param return_all: Flag that indicates if all results in the buffer should be returned.
        :type return_all: bool
        :return: The resulting data or None if no data are calculated
        """
        # Check if step should be executed.
        if self._should_stop(start, end):
            return None

        # Only execute the module if the step is not finished and the results are not yet calculated
        if not self.finished and not (end is not None
                                      and self._current_end is not None
                                      and end <= self._current_end):
            if not self.buffer or not self._current_end or end > self._current_end:
                self.cached_result["cached"] = self._compute(start, end)
                self.cached_result["start"] = start
                self.cached_result["end"] = end
                self._current_end = end
            # Without an end everything is requested at once, so nothing is left afterwards.
            if not end:
                self.finished = True
            else:
                self.finished = not self.further_elements(end)

            # Only call callbacks if the step is finished
            if self.finished:
                self._callbacks()

        # Check if the cached results fits to the request, if yes return it.
        cached = self.cached_result["cached"]
        if (cached is not None and self.cached_result["start"] == start
                and self.cached_result["end"] == end):
            # Copies are returned, so that callers cannot change the cached result.
            if return_all:
                return copy.deepcopy(cached)
            if buffer_element is not None:
                return copy.deepcopy(cached[buffer_element])
            return copy.deepcopy(list(cached.values())[0])
        return self._pack_data(start,
                               end,
                               buffer_element,
                               return_all=return_all)

    def _compute(self, start, end) -> Dict[str, xr.DataArray]:
        # Does nothing in the base class; get_result caches the return value of this method.
        pass

    def further_elements(self, counter: pd.Timestamp) -> bool:
        """
        Checks if there exist at least one data for the time after counter.

        :param counter: The timestamp for which it should be tested if there exist further data after it.
        :type counter: pd.Timestamp
        :return: True if there exist further data
        :rtype: bool
        """
        if not self.buffer:
            return True
        # The name of the time index is the same for every check, so it is determined only once.
        time_dim = _get_time_indexes(self.buffer)[0]
        if all(counter < buffered.indexes[time_dim][-1]
               for buffered in self.buffer.values()):
            return True
        # The own buffer is exhausted, further data can only come from the inputs and targets.
        for input_step in self.input_steps.values():
            if not input_step.further_elements(counter):
                return False
        for target_step in self.targets.values():
            if not target_step.further_elements(counter):
                return False
        return True

    def _pack_data(self, start, end, buffer_element=None, return_all=False):
        """
        Provide the requested part of the buffer.

        If start and end describe a non-empty time window, the buffered data within [start, end) are
        selected. Otherwise, the step is marked as finished and copies of the buffered data are returned.

        :param start: The start of the requested time window
        :param end: The end of the requested time window (exclusive)
        :param buffer_element: The key of the buffered result which should be returned
        :param return_all: If True and no buffer_element is given, a dict with all buffered results is returned
        :return: The selected data array, or a dict of data arrays if all results are returned
        """
        # Provide requested data
        time_index = _get_time_indexes(self.buffer)
        if end and start and end > start:
            index = list(self.buffer.values())[0].indexes[time_index[0]]
            start = max(index[0], start.to_numpy())
            # The same selection is valid for every buffered result, so it is built only once.
            selection = {
                time_index[0]:
                index[(index >= start) & (index < end.to_numpy())]
            }
            # After sel copy is not needed, since it returns a new array.
            if buffer_element is not None:
                return self.buffer[buffer_element].sel(**selection)
            elif return_all:
                return {
                    key: buffered.sel(**selection)
                    for key, buffered in self.buffer.items()
                }
            else:
                return list(self.buffer.values())[0].sel(**selection)
        else:
            self.finished = True
            if buffer_element is not None:
                return self.buffer[buffer_element].copy()
            elif return_all:
                return copy.deepcopy(self.buffer)
            else:
                return list(self.buffer.values())[0].copy()

    def _transform(self, input_step):
        # Does nothing in the base class.
        pass

    def _fit(self, input_step, target_step):
        # Does nothing in the base class.
        pass

    def _callbacks(self):
        # Does nothing in the base class; get_result calls it once the step is finished.
        pass

    def _post_transform(self, result):
        """
        Bring the result into dict form and add it to the buffer.

        :param result: A single result or a dict of results
        :return: The result as dict. A single result is stored under the name of the step.
        """
        if isinstance(result, dict) and len(result) <= 1:
            result = {self.name: list(result.values())[0]}
        elif not isinstance(result, dict):
            result = {self.name: result}

        if not self.buffer:
            self.buffer = result
        else:
            # Time dimension is mandatory, consequently there dim has to exist
            dim = _get_time_indexes(result)[0]
            for key in self.buffer.keys():
                self.buffer[key] = xr.concat([self.buffer[key], result[key]],
                                             dim=dim)
        return result

    def get_json(self, fm: FileManager) -> Dict:
        """
        Returns a dictionary containing all information needed for restoring the step.

        :param fm: The filemanager which can be used by the step for storing the state of the step.
        :type fm: FileManager
        :return: A dictionary containing all information needed for restoring the step.
        :rtype: Dict
        """
        return {
            "target_ids": {step.id: key
                           for key, step in self.targets.items()},
            "input_ids":
            {step.id: key
             for key, step in self.input_steps.items()},
            "id": self.id,
            "module": self.__module__,
            "class": self.__class__.__name__,
            "name": self.name,
            "last": self.last,
            "default_run_setting": self.default_run_setting.save()
        }

    @classmethod
    @abstractmethod
    def load(cls, stored_step: dict, inputs, targets, module, file_manager):
        """
        Restores the step.

        :param stored_step: Information about the stored step
        :param inputs: The input steps of the step which should be restored
        :param targets: The target steps of the step which should be restored
        :param module: The module which is contained by this step
        :param file_manager: The filemanager of the step
        :return: The restored step.
        """

    def _get_input(self, start, batch):
        # The base class provides no input data. _should_stop passes the requested end as batch.
        return None

    def _get_target(self, start, batch):
        # The base class provides no target data. _should_stop passes the requested end as batch.
        return None

    def _should_stop(self, start, end) -> bool:
        # Fetch input and target data
        input_result = self._get_input(start, end)
        target_result = self._get_target(start, end)

        # Check if either the condition is True or some of the previous steps stopped (return_value is None)
        return (self.condition is not None and not self.condition(input_result, target_result)) or \
               self._input_stopped(input_result) or self._input_stopped(target_result)

    @staticmethod
    def _input_stopped(input_data):
        # A non-empty mapping that contains at least one None value signals a stopped predecessor.
        return (input_data is not None and len(input_data) > 0
                and any(map(lambda x: x is None, input_data.values())))

    def reset(self):
        """
        Resets all information of the step concerning a specific run.
        """
        self.buffer = {}
        self.finished = False
        self.current_run_setting = self.default_run_setting.clone()

    def set_run_setting(self, run_setting: RunSetting):
        """
        Sets the run setting of the step for the current run. Note that reset restores the default run setting.
        The current run setting is the result of updating the default run setting with the passed one, so
        RunSetting.update decides which values of the passed run setting are taken over.

        :param run_setting: The run setting which should be applied for the current run.
        :type run_setting: RunSetting
        """
        self.current_run_setting = self.default_run_setting.update(run_setting)
Example #10
0
 def setUp(self) -> None:
     """Create the run setting in Default computation mode which is used by the tests."""
     default_mode = ComputationMode.Default
     self.run_setting = RunSetting(computation_mode=default_mode)
Example #11
0
 def test_load(self):
     """The stored value 4 is loaded as the Default computation mode."""
     stored = {"computation_mode": 4}

     restored = RunSetting.load(stored)

     self.assertEqual(restored.computation_mode, ComputationMode.Default)
Example #12
0
 def test_update(self):
     """Updating the run setting of the fixture returns a run setting with the passed mode."""
     requested = RunSetting(computation_mode=ComputationMode.Train)

     updated = self.run_setting.update(requested)

     self.assertEqual(updated.computation_mode, ComputationMode.Train)