Example #1
def test_ml_server_dataframe_to_dict_and_back(sensors_str, use_test_project_tags):
    """
    Tests the flow of the server creating a dataframe from the model's data, putting into
    a dict of string to df. lists of values, and the client being able to reconstruct it back
    to the original dataframe (less the second level names)
    """
    # Run test with test project tag names
    if use_test_project_tags:
        tags = sensors_str
    # Run project with random names
    else:
        tags = [string.ascii_uppercase[i] for i in range(len(sensors_str))]

    # Some synthetic data
    original_input = np.random.random((10, len(tags)))
    model_output = np.random.random((10, len(tags)))

    # Convert this data into a dataframe with multi index columns
    df = model_utils.make_base_dataframe(tags, original_input, model_output)

    # Server then converts this into a dict which maps top level names to lists
    serialized = server_utils.dataframe_to_dict(df)

    # Client reproduces this dataframe
    df_clone = server_utils.dataframe_from_dict(serialized)

    # Each subset of columns under the top-level names should be equal
    top_lvl_names = df.columns.get_level_values(0)
    for top_lvl_name in filter(lambda n: n not in ("start", "end"), top_lvl_names):
        assert np.allclose(df[top_lvl_name].values, df_clone[top_lvl_name].values)
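
For reference, here is a minimal self-contained sketch of the round trip this test exercises. The helper bodies are assumptions for illustration only: they mirror the observable behaviour of server_utils.dataframe_to_dict / dataframe_from_dict, not the actual implementations, and they keep the second-level names the real client drops.

import numpy as np
import pandas as pd

def to_dict(df: pd.DataFrame) -> dict:
    # Map each top-level column name to {second-level name: [values]}
    return {
        top: df[top].to_dict(orient="list")
        for top in df.columns.get_level_values(0).unique()
    }

def from_dict(data: dict) -> pd.DataFrame:
    # Rebuild a multi-index frame from the nested dict
    return pd.concat({top: pd.DataFrame(sub) for top, sub in data.items()}, axis=1)

columns = pd.MultiIndex.from_product([["model-output"], ["tag-0", "tag-1"]])
frame = pd.DataFrame(np.random.random((3, 2)), columns=columns)
assert np.allclose(frame.values, from_dict(to_dict(frame)).values)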
Example #2
    def _create_anomaly_response(self, start_time: typing.Optional[float] = None):
        """
        Use the current ``X`` and ``y`` to create an anomaly specific response
        using the trained ML model's ``.anomaly()`` method.

        Parameters
        ----------
        start_time: Optional[float]
            Start time to use when timing the processing of the request; a new
            one is constructed if not provided.

        Returns
        -------
        flask.Response
            The formatted anomaly representation response object.
        """
        if start_time is None:
            start_time = timeit.default_timer()

        # To use this endpoint, we need a 'y' to calculate the errors.
        if g.y is None:
            message = {
                "message": "Cannot perform anomaly without 'y' to compare against."
            }
            return make_response((jsonify(message), 400))

        # Now create an anomaly dataframe from the base response dataframe
        try:
            anomaly_df = g.model.anomaly(g.X, g.y, frequency=self.frequency)
        except AttributeError:
            msg = {
                "message": f"Model is not an AnomalyDetector, it is of type: {type(g.model)}"
            }
            return make_response(jsonify(msg), 422)  # 422 Unprocessable Entity

        if request.args.get("all_columns") is None:
            columns_for_delete = []
            for column in anomaly_df:
                if column[0] in DELETED_FROM_RESPONSE_COLUMNS:
                    columns_for_delete.append(column)
            anomaly_df = anomaly_df.drop(columns=columns_for_delete)

        if request.args.get("format") == "parquet":
            return send_file(
                io.BytesIO(utils.dataframe_into_parquet_bytes(anomaly_df)),
                mimetype="application/octet-stream",
            )
        else:
            context: typing.Dict[typing.Any, typing.Any] = dict()
            context["data"] = utils.dataframe_to_dict(anomaly_df)
            context["time-seconds"] = f"{timeit.default_timer() - start_time:.4f}"
            return make_response(jsonify(context), context.pop("status-code", 200))
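
A hypothetical client-side call against this handler; the base URL, project, and machine names are placeholders, and the import path is an assumption:

import numpy as np
import pandas as pd
import requests

from gordo.server import utils as server_utils  # assumed import path

# Placeholder input; in practice X/y come from the data provider
X = pd.DataFrame(np.random.random((10, 4)), columns=[f"tag-{i}" for i in range(4)])
y = X.copy()

resp = requests.post(
    "http://localhost:5555/gordo/v0/my-project/my-machine/anomaly/prediction",  # placeholder URL
    params={"format": "json"},  # pass all_columns=true to keep every column
    json={
        "X": server_utils.dataframe_to_dict(X),
        "y": server_utils.dataframe_to_dict(y),  # required: the handler returns 400 without 'y'
    },
)
resp.raise_for_status()
anomaly_df = server_utils.dataframe_from_dict(resp.json()["data"])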
Example #3
def test_dataframe_from_to_dict(df):
    """
    Test (de)serializations back and forth between dataframe -> dict -> dataframe
    """
    index_was_datetimes: bool = isinstance(df.index, pd.DatetimeIndex)

    cloned = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    if index_was_datetimes:
        # Ensure the function hasn't mutated the index.
        assert isinstance(df.index, pd.DatetimeIndex)

    assert np.allclose(df.values, cloned.values)
    assert df.columns.tolist() == cloned.columns.tolist()
    assert df.index.tolist() == cloned.index.tolist()
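
The df argument is a pytest fixture defined in the surrounding test module; a hypothetical parametrization covering both index types could look like:

import numpy as np
import pandas as pd
import pytest

@pytest.fixture(params=["datetime", "int"])
def df(request):
    # Hypothetical stand-in for the real fixture
    frame = pd.DataFrame(np.random.random((10, 5)))
    if request.param == "datetime":
        frame.index = pd.date_range("2016-01-01", periods=10, freq="min")
    return frame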
Example #4
def test_dataframe_from_dict_ordering(index):
    """
    We expect that from_dict should order based on the index, and will parse the index
    either as datetime or integers and sort in ascending order from there.
    """
    df = pd.DataFrame(np.random.random((10, 5)))
    df.index = index
    original = df.copy()

    # What we want
    if isinstance(original.index[0], str):
        # Parse as datetime or integers if index is string
        try:
            original.index = original.index.map(dateutil.parser.isoparse)
        except ValueError:
            original.index = original.index.map(int)
    original.sort_index(inplace=True)

    # What we get
    df_out = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    assert np.all(df_out.index == original.index)
    assert np.all(df_out.values == original.values)
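
The index argument is likewise a fixture; a hypothetical parametrization giving ten out-of-order string labels per case, so both parse branches and the sort are exercised:

import pandas as pd
import pytest

@pytest.fixture(
    params=[
        # ISO timestamps, descending, so sorting is observable
        [f"2016-01-01T00:{m:02d}:00" for m in reversed(range(10))],
        # Not ISO-parseable, so the test falls back to parsing as ints
        [str(n) for n in reversed(range(10))],
    ]
)
def index(request):
    return pd.Index(request.param)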
Example #5
    def _send_prediction_request(
        self,
        X: pd.DataFrame,
        y: typing.Optional[pd.DataFrame],
        chunk: slice,
        machine: Machine,
        start: datetime,
        end: datetime,
        revision: str,
    ):
        """
        Post a slice of data to the machine

        Parameters
        ----------
        X: pandas.core.DataFrame
            The data for the model, in pandas representation
        chunk: slice
            The slice to take from DataFrame.iloc for the batch size
        machine: Machine
        start: datetime
        end: datetime

        Notes
        -----
        PredictionResult.predictions may be None if the prediction process fails

        Returns
        -------
        PredictionResult

        Raises
        -----
        ResourceGone
            If the sever returns a 410, most likely because the revision is too old
        """

        kwargs: Dict[str, Any] = dict(
            url=f"{self.base_url}/gordo/v0/{self.project_name}/{machine.name}{self.prediction_path}",
            params={"format": self.format, "revision": revision},
        )

        # We're going to serialize the data as either JSON or Parquet
        if self.use_parquet:
            kwargs["files"] = {
                "X": server_utils.dataframe_into_parquet_bytes(X.iloc[chunk]),
                "y": server_utils.dataframe_into_parquet_bytes(y.iloc[chunk])
                if y is not None
                else None,
            }
        else:
            kwargs["json"] = {
                "X": server_utils.dataframe_to_dict(X.iloc[chunk]),
                "y": server_utils.dataframe_to_dict(y.iloc[chunk])
                if y is not None
                else None,
            }

        # Start attempting to get predictions for this batch
        for current_attempt in itertools.count(start=1):
            try:
                try:
                    resp = _handle_response(self.session.post(**kwargs))
                except HttpUnprocessableEntity:
                    self.prediction_path = "/prediction"
                    kwargs["url"] = (
                        f"{self.base_url}/gordo/v0/"
                        f"{self.project_name}/{machine.name}{self.prediction_path}"
                    )
                    resp = _handle_response(self.session.post(**kwargs))
            # If it was an IO or TimeoutError, we can retry
            except (
                    IOError,
                    TimeoutError,
                    requests.ConnectionError,
                    requests.HTTPError,
            ) as exc:
                if current_attempt <= self.n_retries:
                    time_to_sleep = min(2**(current_attempt + 2), 300)
                    logger.warning(
                        f"Failed to get response on attempt {current_attempt} out of {self.n_retries} attempts."
                    )
                    sleep(time_to_sleep)
                    continue
                else:
                    msg = (
                        f"Failed to get predictions for dates {start} -> {end} "
                        f"for target: '{machine.name}'. Error: {exc}")
                    logger.error(msg)

                    return PredictionResult(name=machine.name,
                                            predictions=None,
                                            error_messages=[msg])

            # No point in retrying a BadGordoRequest
            except (BadGordoRequest, NotFound) as exc:
                msg = (
                    f"Failed with bad request or not found for dates {start} -> {end} "
                    f"for target: '{machine.name}'. Error: {exc}")
                logger.error(msg)
                return PredictionResult(name=machine.name,
                                        predictions=None,
                                        error_messages=[msg])
            except ResourceGone:
                raise

            # Process response and return if no exception
            else:

                predictions = self.dataframe_from_response(resp)

                # Forward predictions to any other consumer if registered.
                if self.prediction_forwarder is not None:
                    self.prediction_forwarder(  # type: ignore
                        predictions=predictions,
                        machine=machine,
                        metadata=self.metadata)
                return PredictionResult(name=machine.name,
                                        predictions=predictions,
                                        error_messages=[])
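
For intuition, the retry backoff above grows exponentially with the attempt number and is capped at 300 seconds:

# The schedule produced by min(2 ** (attempt + 2), 300):
for attempt in range(1, 8):
    print(attempt, min(2 ** (attempt + 2), 300))
# 1 -> 8, 2 -> 16, 3 -> 32, 4 -> 64, 5 -> 128, 6 -> 256, 7 -> 300 (capped)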
Example #6
    def post(self):
        """
        Process a POST request by using provided user data

        A typical response might look like this:

        .. code-block:: python

            {
                'data': [
                    {
                        'end': ['2016-01-01T00:10:00+00:00'],
                        'model-output': [0.0005317790200933814,
                                         -0.0001525811239844188,
                                         0.0008310950361192226,
                                         0.0015755111817270517],
                        'original-input': [0.9135588550070414,
                                           0.3472517774179448,
                                           0.8994921857179736,
                                           0.11982773108991263],
                        'start': ['2016-01-01T00:00:00+00:00'],
                    },
                    ...
                ],

                'tags': [
                    {'asset': None, 'name': 'tag-0'},
                    {'asset': None, 'name': 'tag-1'},
                    {'asset': None, 'name': 'tag-2'},
                    {'asset': None, 'name': 'tag-3'}
                ],
                'time-seconds': '0.1937'
            }
        """
        context: typing.Dict[typing.Any, typing.Any] = dict()
        X = g.X
        process_request_start_time_s = timeit.default_timer()

        try:
            output = model_io.get_model_output(model=g.model, X=X)
        except ValueError as err:
            tb = traceback.format_exc()
            logger.error(
                f"Failed to predict or transform; error: {err} - \nTraceback: {tb}"
            )
            context["error"] = f"ValueError: {str(err)}"
            return make_response((jsonify(context), 400))

        # The model may only be a transformer (likely raising an AttributeError),
        # but catch everything to avoid logging other exceptions twice.
        except Exception as exc:
            tb = traceback.format_exc()
            logger.error(
                f"Failed to predict or transform; error: {exc} - \nTraceback: {tb}"
            )
            context["error"] = "Something unexpected happened; check your input data"
            return make_response((jsonify(context), 400))

        else:
            get_model_output_time_s = timeit.default_timer()
            logger.debug(
                f"Calculating model output took "
                f"{get_model_output_time_s - process_request_start_time_s} s")
            data = model_utils.make_base_dataframe(
                tags=self.tags,
                model_input=X.values if isinstance(X, pd.DataFrame) else X,
                model_output=output,
                target_tag_list=self.target_tags,
                index=X.index,
            )
            if request.args.get("format") == "parquet":
                return send_file(
                    io.BytesIO(
                        server_utils.dataframe_into_parquet_bytes(data)),
                    mimetype="application/octet-stream",
                )
            else:
                context["data"] = server_utils.dataframe_to_dict(data)
                return make_response(
                    (jsonify(context), context.pop("status-code", 200)))
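
When format=parquet is requested, the response body is raw parquet bytes rather than JSON. A hedged client-side sketch, with a placeholder URL and dataframe_from_parquet_bytes assumed to be the counterpart of the dataframe_into_parquet_bytes helper used above:

import numpy as np
import pandas as pd
import requests

from gordo.server import utils as server_utils  # assumed import path

X = pd.DataFrame(np.random.random((10, 4)), columns=[f"tag-{i}" for i in range(4)])

resp = requests.post(
    "http://localhost:5555/gordo/v0/my-project/my-machine/prediction",  # placeholder URL
    params={"format": "parquet"},
    files={"X": server_utils.dataframe_into_parquet_bytes(X)},
)
resp.raise_for_status()
predictions = server_utils.dataframe_from_parquet_bytes(resp.content)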