Example #1
    def test_row_filter(self):
        """Tests that row_filter filters away rows"""

        tag_list = [
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ]
        start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
        end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")
        X, _ = TimeSeriesDataset(
            MockDataSource(), start, end, tag_list=tag_list
        ).get_data()

        self.assertEqual(577, len(X))

        X, _ = TimeSeriesDataset(
            MockDataSource(), start, end, tag_list=tag_list, row_filter="'Tag 1' < 5000"
        ).get_data()

        self.assertEqual(8, len(X))

        X, _ = TimeSeriesDataset(
            MockDataSource(),
            start,
            end,
            tag_list=tag_list,
            row_filter="'Tag 1' / 'Tag 3' < 0.999",
        ).get_data()

        self.assertEqual(3, len(X))
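
A note on the filter expressions above: the quoted tag names suggest the filter is evaluated against the resampled dataframe, much like a pandas query. A minimal sketch of the equivalent plain-pandas operations, assuming that semantics (pandas uses backticks rather than single quotes for column names containing spaces):

import pandas as pd

# Hypothetical stand-in data; the real values come from MockDataSource
df = pd.DataFrame({"Tag 1": [100.0, 6000.0, 3.0], "Tag 3": [200.0, 6000.0, 4.0]})

# Roughly equivalent to row_filter="'Tag 1' < 5000"
filtered = df.query("`Tag 1` < 5000")
assert len(filtered) == 2

# Roughly equivalent to row_filter="'Tag 1' / 'Tag 3' < 0.999"
ratio_filtered = df[df["Tag 1"] / df["Tag 3"] < 0.999]
assert len(ratio_filtered) == 2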
Example #2
    def _raw_data(
        self, endpoint: EndpointMetadata, start: datetime, end: datetime
    ) -> pd.DataFrame:
        """
        Fetch the required raw data in this time range which would
        satisfy this endpoint's /prediction POST

        Parameters
        ----------
        endpoint: EndpointMetadata
            Named tuple representing the endpoint info from Watchman
        start: datetime
        end: datetime

        Returns
        -------
        pandas.core.DataFrame
            Dataframe of required tags and index reflecting the datetime point
        """

        # We want to adjust for any model offset. If the model outputs fewer rows
        # than it takes in, it needs more data than the span we're being asked to
        # get predictions for; the extra 5 intervals just give us some buffer.
        start = self._adjust_for_offset(
            dt=start, resolution=endpoint.resolution, n_intervals=endpoint.model_offset + 5
        )
        dataset = TimeSeriesDataset(
            data_provider=self.data_provider,  # type: ignore
            from_ts=start,
            to_ts=end,
            resolution=endpoint.resolution,
            tag_list=endpoint.tag_list,
            target_tag_list=endpoint.target_tag_list,
        )
        return dataset.get_data()
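
The `_adjust_for_offset` helper is not shown in this example. A hypothetical sketch of what the call above implies, assuming it simply widens the start of the window by `n_intervals` resolution steps (this only works for fixed-frequency resolutions such as "10T"; the real helper may differ):

import pandas as pd
from datetime import datetime

def adjust_for_offset(dt: datetime, resolution: str, n_intervals: int) -> datetime:
    # Move the window start back by n_intervals steps of the given resolution,
    # e.g. 10 steps of "10T" widens the window by 100 minutes. (Assumption:
    # reconstructed for illustration, not the project's actual implementation.)
    return dt - n_intervals * pd.tseries.frequencies.to_offset(resolution).delta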
Example #3
    def get(self):
        """
        Process a GET request by fetching data ourselves
        """
        context = dict()  # type: typing.Dict[str, typing.Any]
        context["status-code"] = 200

        params = request.get_json() or request.args

        if not all(k in params for k in ("start", "end")):
            message = dict(
                message="must provide iso8601 formatted dates with timezone-information for parameters 'start' and 'end'"
            )
            return make_response((jsonify(message), 400))

        try:
            start = self._parse_iso_datetime(params["start"])
            end = self._parse_iso_datetime(params["end"])
        except ValueError:
            logger.error(
                f"Failed to parse start and/or end date to ISO: start: "
                f"{params['start']} - end: {params['end']}"
            )
            message = dict(
                message="Could not parse start/end date(s) into ISO datetime. "
                "Must provide iso8601 formatted dates for both."
            )
            return make_response((jsonify(message), 400))

        # Ensure the requested time span is less than one day
        if (end - start).days:
            message = dict(message="Need to request a time span less than 24 hours.")
            return make_response((jsonify(message), 400))
        logger.debug("Fetching data from data provider")
        before_data_fetch = timeit.default_timer()
        dataset = TimeSeriesDataset(
            data_provider=g.data_provider,
            from_ts=start - self.frequency.delta,
            to_ts=end,
            resolution=current_app.metadata["dataset"]["resolution"],
            tag_list=self.tags,
        )
        X, _y = dataset.get_data()
        logger.debug(
            f"Fetching data from data provider took "
            f"{timeit.default_timer()-before_data_fetch} seconds"
        )
        # Want resampled buckets equal to or greater than start, but less than
        # end, b/c if end == 00:00:00 and the resolution is 10 mins, a resampled
        # bucket starting at 00:00:00 would imply it has data until 00:10:00,
        # which is past the requested end datetime
        X = X[
            (X.index > start - self.frequency.delta)
            & (X.index + self.frequency.delta < end)
        ]
        return self._process_request(context=context, X=X)
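
A worked example of that boundary filter, assuming a 10-minute resolution: the extra row fetched via `start - self.frequency.delta` is kept only if it starts strictly after that point, and any bucket whose span would cross `end` is excluded.

import pandas as pd

freq = pd.tseries.frequencies.to_offset("10T")
start = pd.Timestamp("2020-01-01 23:40:00Z")
end = pd.Timestamp("2020-01-02 00:00:00Z")
index = pd.date_range("2020-01-01 23:30:00Z", periods=4, freq="10T")
X = pd.DataFrame({"Tag 1": range(4)}, index=index)

X = X[(X.index > start - freq.delta) & (X.index + freq.delta < end)]

# Only the 23:40 bucket survives: 23:30 is not strictly after start - 10min,
# and the 23:50 bucket would run until exactly 00:00, i.e. not before end.
assert list(X.index) == [pd.Timestamp("2020-01-01 23:40:00Z")]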
Example #4
def test_time_series_no_resolution():
    kwargs = dict(
        data_provider=MockDataSource(),
        tag_list=[
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ],
        from_ts=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
        to_ts=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
    )
    no_resolution, _ = TimeSeriesDataset(resolution=None, **kwargs).get_data()
    wi_resolution, _ = TimeSeriesDataset(resolution="10T", **kwargs).get_data()
    assert len(no_resolution) > len(wi_resolution)
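
What the two resolutions mean, as a minimal pandas sketch: with resolution=None the raw points come back as-is, while a resolution such as "10T" buckets them into 10-minute bins and aggregates each bin, hence fewer rows. This assumes the dataset resamples roughly the way pandas does:

import pandas as pd

raw = pd.Series(
    range(12), index=pd.date_range("2017-12-25 06:00:00Z", periods=12, freq="2T")
)
resampled = raw.resample("10T").mean()  # one row per 10-minute bucket
assert len(resampled) < len(raw)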
Example #5
    def test_faked_DataLakeBackedDataset(self, _mocked_method):

        config = dict(
            from_ts=dateutil.parser.isoparse("2014-07-01T00:10:00+00:00"),
            to_ts=dateutil.parser.isoparse("2015-01-01T00:00:00+00:00"),
            tag_list=[
                "asgb.19ZT3950%2FY%2FPRIM",
                "asgb.19PST3925%2FDispMeasOut%2FPRIM",
            ],
        )

        provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
        dataset = TimeSeriesDataset(data_provider=provider, **config)

        # Should be able to call get_data without being asked to authenticate in tests
        X, y = dataset.get_data()
Example #6
    async def _raw_data(
        self, endpoint: EndpointMetadata, start: datetime, end: datetime
    ) -> pd.DataFrame:
        """
        Fetch the required raw data in this time range which would
        satisfy this endpoint's /prediction POST

        Parameters
        ----------
        endpoint: EndpointMetadata
            Named tuple representing the endpoint info from Watchman
        start: datetime
        end: datetime

        Returns
        -------
        pandas.core.DataFrame
            Dataframe of required tags and index reflecting the datetime point
        """
        freq = pd.tseries.frequencies.to_offset(endpoint.resolution)

        dataset = TimeSeriesDataset(  # type: ignore
            data_provider=self.data_provider,
            from_ts=start - freq.delta,
            to_ts=end,
            resolution=endpoint.resolution,
            tag_list=endpoint.tag_list,
        )

        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(dataset.get_data)
            return await asyncio.wrap_future(future)
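
The pattern above offloads the blocking get_data call to a worker thread so the event loop stays responsive. A minimal self-contained sketch of the same pattern:

import asyncio
from concurrent.futures import ThreadPoolExecutor

def blocking_fetch() -> str:
    # Stand-in for dataset.get_data(): a synchronous, potentially slow call
    return "data"

async def fetch() -> str:
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(blocking_fetch)
        # Bridge the concurrent.futures.Future into the asyncio world
        return await asyncio.wrap_future(future)

assert asyncio.run(fetch()) == "data"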
Example #7
    def test_aggregation_methods(self):
        """Tests that it works to set aggregation method(s)"""

        tag_list = [
            SensorTag("Tag 1", None),
            SensorTag("Tag 2", None),
            SensorTag("Tag 3", None),
        ]
        start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
        end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")

        # Default aggregation gives no extra columns
        X, _ = TimeSeriesDataset(
            MockDataSource(), start, end, tag_list=tag_list
        ).get_data()

        self.assertEqual((577, 3), X.shape)
        # The default single aggregation method gives the tag-names as columns
        self.assertEqual(list(X.columns), ["Tag 1", "Tag 2", "Tag 3"])

        # Using two aggregation methods gives multi-level columns with tag names
        # on top and aggregation_method as the second level
        X, _ = TimeSeriesDataset(
            MockDataSource(),
            start,
            end,
            tag_list=tag_list,
            aggregation_methods=["mean", "max"],
        ).get_data()

        self.assertEqual((577, 6), X.shape)
        self.assertEqual(
            list(X.columns),
            [
                ("Tag 1", "mean"),
                ("Tag 1", "max"),
                ("Tag 2", "mean"),
                ("Tag 2", "max"),
                ("Tag 3", "mean"),
                ("Tag 3", "max"),
            ],
        )
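
The multi-level columns mirror what pandas itself produces when several aggregations are applied per column during resampling. A sketch of that behaviour, assuming the dataset aggregates roughly this way:

import pandas as pd

raw = pd.DataFrame(
    {"Tag 1": range(6)},
    index=pd.date_range("2017-12-25 06:00:00Z", periods=6, freq="5T"),
)
agg = raw.resample("10T").agg(["mean", "max"])
# Tag name on top, aggregation method as the second level
assert list(agg.columns) == [("Tag 1", "mean"), ("Tag 1", "max")]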
Example #8
def test_timeseries_target_tags(tag_list, target_tag_list):
    start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
    end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")
    tsd = TimeSeriesDataset(MockDataSource(), start, end, tag_list, target_tag_list)
    X, y = tsd.get_data()

    # If we have targets, X and y should be equally long, and y's second
    # axis should match the number of target tags
    if target_tag_list:
        assert len(X) == len(y)
        assert y.shape[1] == len(target_tag_list)

        # Ensure the order is maintained
        assert [tag.name for tag in target_tag_list] == y.columns.tolist()
    else:
        assert y is None

    # Features should match the tag_list
    assert X.shape[1] == len(tag_list)

    # Ensure the order is maintained
    assert [tag.name for tag in tag_list] == X.columns.tolist()
Example #9
    def test_row_filter(self):
        """Tests that row_filter filters away rows"""
        kwargs = dict(
            data_provider=MockDataSource(),
            tag_list=[
                SensorTag("Tag 1", None),
                SensorTag("Tag 2", None),
                SensorTag("Tag 3", None),
            ],
            from_ts=dateutil.parser.isoparse("2017-12-25 06:00:00Z"),
            to_ts=dateutil.parser.isoparse("2017-12-29 06:00:00Z"),
        )
        X, _ = TimeSeriesDataset(**kwargs).get_data()
        self.assertEqual(577, len(X))

        X, _ = TimeSeriesDataset(row_filter="'Tag 1' < 5000", **kwargs).get_data()
        self.assertEqual(8, len(X))

        X, _ = TimeSeriesDataset(
            row_filter="'Tag 1' / 'Tag 3' < 0.999", **kwargs
        ).get_data()
        self.assertEqual(3, len(X))
Example #10
    def wrapper_method(self, *args, **kwargs):

        # Data provided by the client
        if request.method == "POST":
            X = request.json.get("X")
            y = request.json.get("y")

            if X is None:
                message = dict(message='Cannot predict without "X"')
                return make_response((jsonify(message), 400))

            # Convert X and (maybe) y into dataframes.
            X = dataframe_from_dict(
                X, tags=[tag.name for tag in self.tags], name="X"
            )

            # y is allowed to be None for BaseView; views like Anomaly might
            # require it.
            if y is not None and self.target_tags:
                y = dataframe_from_dict(
                    y, tags=[tag.name for tag in self.target_tags], name="y"
                )

            # If either X or y came back as a Response type, there was an error
            for data_or_resp in [X, y]:
                if isinstance(data_or_resp, Response):
                    return data_or_resp

        # Data must be queried from Influx given dates passed in request.
        elif request.method == "GET":

            params = request.get_json() or request.args

            if not all(k in params for k in ("start", "end")):
                message = dict(
                    message="must provide iso8601 formatted dates with "
                    "timezone-information for parameters 'start' and 'end'"
                )
                return make_response((jsonify(message), 400))

            # Extract the dates from parameters
            try:
                start = parse_iso_datetime(params["start"])
                end = parse_iso_datetime(params["end"])
            except ValueError:
                logger.error(
                    f"Failed to parse start and/or end date to ISO: start: "
                    f"{params['start']} - end: {params['end']}"
                )
                message = dict(
                    message="Could not parse start/end date(s) into ISO datetime. "
                    "Must provide iso8601 formatted dates for both."
                )
                return make_response((jsonify(message), 400))

            # Ensure the requested time span is less than one day
            if (end - start).days:
                message = dict(
                    message="Need to request a time span less than 24 hours."
                )
                return make_response((jsonify(message), 400))

            logger.debug("Fetching data from data provider")
            before_data_fetch = timeit.default_timer()
            dataset = TimeSeriesDataset(
                data_provider=g.data_provider,
                from_ts=start - self.frequency.delta,
                to_ts=end,
                resolution=current_app.metadata["dataset"]["resolution"],
                tag_list=self.tags,
                target_tag_list=self.target_tags or None,
            )
            X, y = dataset.get_data()
            logger.debug(f"Fetching data from data provider took "
                         f"{timeit.default_timer()-before_data_fetch} seconds")
            # Want resampled buckets equal or greater than start, but less than end
            # b/c if end == 00:00:00 and req = 10 mins, a resampled bucket starting
            # at 00:00:00 would imply it has data until 00:10:00; which is passed
            # the requested end datetime
            X = X[(X.index > start - self.frequency.delta)
                  & (X.index + self.frequency.delta < end)]

            # TODO: Remove/rework this once we add target_tags assignments in workflow generator for autoencoders.
            if y is None:
                y = X.copy()
            else:
                y = y.loc[X.index]

        else:
            raise NotImplementedError(
                f"Cannot extract X and y from '{request.method}' request."
            )

        # Assign X and y to the request's global context
        g.X, g.y = X, y

        # And run the original method.
        return method(self, *args, **kwargs)
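
The dataframe_from_dict helper is not shown here; judging by the loop above it, it returns either a dataframe or a flask Response describing the error. A hypothetical sketch of the happy path, assuming the POSTed "X" is a column-oriented mapping:

import pandas as pd

def dataframe_from_dict(data, tags, name="X"):
    # Hypothetical reconstruction for illustration: build the frame and enforce
    # the expected tag columns and their order; the real helper is assumed to
    # return a flask Response instead of raising on error.
    df = pd.DataFrame(data)
    missing = set(tags) - set(df.columns)
    if missing:
        raise ValueError(f"{name} is missing columns: {missing}")
    return df[list(tags)]

X = dataframe_from_dict({"Tag 1": [1.0], "Tag 2": [2.0]}, tags=["Tag 1", "Tag 2"])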
Example #11
    def get(self):
        """
        Process a GET request by fetching data ourselves and returning
        per-tag anomaly errors for each resampled bucket
        """
        context = dict()  # type: typing.Dict[str, typing.Any]
        context["status-code"] = 200
        start_time = timeit.default_timer()

        params = request.get_json() or request.args

        if not all(k in params for k in ("start", "end")):
            return (
                {
                    "error": "must provide iso8601 formatted dates with "
                    "timezone-information for parameters 'start' and 'end'"
                },
                400,
            )

        try:
            start = self._parse_iso_datetime(params["start"])
            end = self._parse_iso_datetime(params["end"])
        except ValueError:
            logger.error(
                f"Failed to parse start and/or end date to ISO: start: "
                f"{params['start']} - end: {params['end']}"
            )
            return (
                {
                    "error": "Could not parse start/end date(s) into ISO datetime. "
                    "Must provide iso8601 formatted dates for both."
                },
                400,
            )

        # Ensure the requested time span is less than one day
        if (end - start).days:
            return (
                {"error": "Need to request a time span less than 24 hours."},
                400,
            )

        freq = pd.tseries.frequencies.to_offset(
            current_app.metadata["dataset"]["resolution"]
        )

        dataset = TimeSeriesDataset(
            data_provider=g.data_provider,
            from_ts=start - freq.delta,
            to_ts=end,
            resolution=current_app.metadata["dataset"]["resolution"],
            tag_list=sensor_tag.normalize_sensor_tags(
                current_app.metadata["dataset"]["tag_list"]
            ),
        )
        X, _y = dataset.get_data()

        # Want resampled buckets equal to or greater than start, but less than
        # end, b/c if end == 00:00:00 and the resolution is 10 mins, a resampled
        # bucket starting at 00:00:00 would imply it has data until 00:10:00,
        # which is past the requested end datetime
        X = X[(X.index > start - freq.delta) & (X.index + freq.delta < end)]

        try:
            xhat = self.get_predictions(X).tolist()

        # The model may only be a transformer, which would probably raise an
        # AttributeError, but catch everything to avoid logging other
        # exceptions twice if it happens.
        except Exception as exc:
            logger.critical(f"Failed to predict or transform; error: {exc}")
            return (
                {"error": "Something unexpected happened; check your input data"},
                400,
            )

        # In GET requests we need to pair the resulting predictions with their
        # specific timestamp and additionally match the predictions to the corresponding tags.
        data = []

        # This tags list is just for display/informative purposes, skipping the asset
        tags = [tag["name"] for tag in current_app.metadata["dataset"]["tag_list"]]

        for prediction, time_stamp in zip(xhat, X.index[-len(xhat):]):

            # Auto encoders return double their input.
            # First half is input to model, second half is output of model
            tag_inputs = np.array(prediction[:len(tags)])
            tag_outputs = np.array(prediction[len(tags):])
            tag_errors = np.abs(tag_inputs - tag_outputs)
            data.append(
                {
                    "start": f"{time_stamp}",
                    "end": f"{time_stamp + freq}",
                    "tags": {tag: error for tag, error in zip(tags, tag_errors)},
                    "total_anomaly": np.linalg.norm(tag_inputs - tag_outputs),
                }
            )
        context["output"] = data
        context["time-seconds"] = f"{timeit.default_timer() - start_time:.4f}"
        return context, context["status-code"]
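
A worked example of the input/output split in the loop above: with three tags, each prediction row has six values, the first three being the model's input and the last three its reconstruction.

import numpy as np

tags = ["Tag 1", "Tag 2", "Tag 3"]
prediction = [1.0, 2.0, 3.0, 1.1, 1.8, 3.0]

tag_inputs = np.array(prediction[: len(tags)])
tag_outputs = np.array(prediction[len(tags):])
tag_errors = np.abs(tag_inputs - tag_outputs)  # per-tag reconstruction error
total_anomaly = np.linalg.norm(tag_inputs - tag_outputs)  # overall score

assert np.allclose(tag_errors, [0.1, 0.2, 0.0])
assert np.isclose(total_anomaly, (0.1 ** 2 + 0.2 ** 2) ** 0.5)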