Example #1
def test_dataframe_from_dict(data: dict, tags: List[str], expect_success: bool):
    """
    Data can get into the server in a number of ways, basically any format
    that pandas.DataFrame.from_dict() supports. It opens for good flexibility
    we just need to test the function it is used in will 'fail' correctly or
    will reassign column names to tag names if needed.
    """
    app = flask.Flask(__name__)

    with app.app_context():
        result: Union[flask.Response, pd.DataFrame] = utils.dataframe_from_dict(
            data, tags, name="TEST"
        )

    if expect_success:
        assert isinstance(result, pd.DataFrame)

        # Columns should either equal the tags exactly or be a subset of them
        if len(result.columns) == len(tags):
            assert result.columns.tolist() == tags
        else:
            assert all(col in tags for col in result.columns)
    else:
        # If it's not a dataframe, it should be a response
        assert isinstance(result, flask.Response)

        # Representing a failed client request (4xx)
        assert 400 <= result.status_code <= 499

        # Should have the name of the data being parsed in the error message
        assert "TEST" in result.data.decode()
Example #2
def test_prediction_endpoint_post_ok(
    base_route,
    sensors,
    gordo_ml_server_client,
    data_to_post,
    resp_format,
    send_as_parquet,
):
    """
    Test the expected successful data posts, by sending a variety of valid
    JSON formats of a dataframe, as well as parquet serializations.
    """
    endpoint = f"{base_route}/prediction"
    if resp_format is not None:
        endpoint += f"?format={resp_format}"

    if send_as_parquet:
        X = pd.DataFrame.from_dict(data_to_post)
        X_bytes = server_utils.dataframe_into_parquet_bytes(X)
        kwargs = dict(data={"X": (io.BytesIO(X_bytes), "X")})
    else:
        kwargs = dict(json={"X": data_to_post})

    resp = gordo_ml_server_client.post(endpoint, **kwargs)
    assert resp.status_code == 200

    if resp_format in (None, "json"):
        data = server_utils.dataframe_from_dict(resp.json["data"])
    else:
        data = server_utils.dataframe_from_parquet_bytes(resp.data)

    # Expected column names
    assert all(key in data for key in ("model-output", "model-input"))
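
server_utils.dataframe_into_parquet_bytes / dataframe_from_parquet_bytes are
used throughout but not shown. Assuming they are thin pyarrow wrappers, the
round trip might be sketched as:

import io

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq


def dataframe_into_parquet_bytes(df: pd.DataFrame) -> bytes:
    # Serialize the frame into an in-memory parquet buffer.
    buf = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buf)
    return buf.getvalue()


def dataframe_from_parquet_bytes(data: bytes) -> pd.DataFrame:
    # Read the parquet payload straight back out of memory.
    return pq.read_table(io.BytesIO(data)).to_pandas()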
Example #3
def test_anomaly_prediction_endpoint(
    base_route, influxdb, gordo_ml_server_client, data_to_post, sensors, resp_format
):
    """
    Anomaly GET and POST responses are the same
    """
    endpoint = f"{base_route}/anomaly/prediction"
    if resp_format is not None:
        endpoint += f"?format={resp_format}"

    resp = gordo_ml_server_client.post(endpoint, json=data_to_post)

    # From here, the response should be (pretty much) the same format from GET or POST
    assert resp.status_code == 200
    if resp_format in (None, "json"):
        assert "data" in resp.json
        data = server_utils.dataframe_from_dict(resp.json["data"])
    else:
        data = server_utils.dataframe_from_parquet_bytes(resp.data)

    # The only difference between POST and GET is that POST returns None for
    # the start and end dates, since the server can't know what those are
    assert "start" in data
    assert "end" in data
    if data_to_post is not None:
        assert np.all(data["start"].isna())
        assert np.all(data["end"].isna())
    else:
        assert not np.any(data["start"].isna())
        assert not np.any(data["end"].isna())

    assert all(key in data for key in ("total-anomaly", "tag-anomaly",
                                       "model-input", "model-output"))
Example #4
def test_dataframe_to_from_dict(expect_multi_lvl: bool, data: dict):
    """
    Creating dataframes from various raw data structures should have determined behavior
    such as not creating MultiIndex columns with a dict of simple key to array mappings.
    """
    df = server_utils.dataframe_from_dict(data)
    if expect_multi_lvl:
        assert isinstance(df.columns, pd.MultiIndex)
    else:
        assert not isinstance(df.columns, pd.MultiIndex)
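
For reference, plain pandas illustrates the distinction this test is after: a
dict of simple key-to-array mappings yields flat columns, while a nested
two-level dict can be expanded into MultiIndex columns, which is presumably
what server_utils.dataframe_from_dict does internally:

import pandas as pd

# Flat mapping: simple key -> array gives ordinary columns.
flat = pd.DataFrame.from_dict({"tag-1": [1, 2], "tag-2": [3, 4]})
assert not isinstance(flat.columns, pd.MultiIndex)

# Nested mapping: two key levels expanded into MultiIndex columns.
nested = {"model-input": {"tag-1": [1, 2]}, "model-output": {"tag-1": [3, 4]}}
multi = pd.concat(
    {top: pd.DataFrame.from_dict(sub) for top, sub in nested.items()}, axis=1
)
assert isinstance(multi.columns, pd.MultiIndex)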
Example #5
def test_dataframe_from_to_dict(df):
    """
    Test (de)serializations back and forth between dataframe -> dict -> dataframe
    """
    index_was_datetimes: bool = isinstance(df.index, pd.DatetimeIndex)

    cloned = server_utils.dataframe_from_dict(
        server_utils.dataframe_to_dict(df))

    if index_was_datetimes:
        # Ensure the function hasn't mutated the index.
        assert isinstance(df.index, pd.DatetimeIndex)

    assert np.allclose(df.values, cloned.values)
    assert df.columns.tolist() == cloned.columns.tolist()
    assert df.index.tolist() == cloned.index.tolist()
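
dataframe_to_dict / dataframe_from_dict aren't reproduced here either. A
sketch consistent with these assertions, where the string-index conversion is
an assumption about how datetimes survive JSON, could be:

import pandas as pd


def dataframe_to_dict(df: pd.DataFrame) -> dict:
    # Copy first so the caller's DatetimeIndex is never mutated.
    data = df.copy()
    if isinstance(data.index, pd.DatetimeIndex):
        data.index = data.index.astype(str)  # ISO strings survive JSON
    return data.to_dict()


def dataframe_from_dict(data: dict) -> pd.DataFrame:
    df = pd.DataFrame.from_dict(data)
    if df.index.dtype == object:
        try:
            df.index = pd.to_datetime(df.index)  # restore datetimes if they parse
        except (ValueError, TypeError):
            pass
    return df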
Example #6
def dataframe_from_response(response: typing.Union[dict, bytes]) -> pd.DataFrame:
    """
    Convert the response from the server, parsed as either JSON / dict or raw
    bytes, the latter of which is expected to be loadable by
    :func:`server.utils.dataframe_from_parquet_bytes`.

    Parameters
    ----------
    response: Union[dict, bytes]
        The parsed response from the ML server.

    Returns
    -------
    pandas.DataFrame
    """
    if isinstance(response, dict):
        predictions = server_utils.dataframe_from_dict(response["data"])
    else:
        predictions = server_utils.dataframe_from_parquet_bytes(response)
    return predictions
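
A usage sketch for the helper above, assuming a Flask test-client response
(resp.mimetype, resp.json and resp.data are Flask/Werkzeug response accessors):

def predictions_from(resp) -> pd.DataFrame:
    # JSON responses arrive as a dict; parquet responses as raw bytes.
    if resp.mimetype == "application/json":
        return dataframe_from_response(resp.json)
    return dataframe_from_response(resp.data)
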
def test_ml_server_dataframe_to_dict_and_back(tags: typing.List[str]):
    """
    Tests the flow of the server creating a dataframe from the model's data, putting into
    a dict of string to df. lists of values, and the client being able to reconstruct it back
    to the original dataframe (less the second level names)
    """
    # Some synthetic data
    original_input = np.random.random((10, len(tags)))
    model_output = np.random.random((10, len(tags)))

    # Convert this data into a dataframe with multi index columns
    df = model_utils.make_base_dataframe(tags, original_input, model_output)

    # Server then converts this into a dict which maps top level names to lists
    serialized = server_utils.dataframe_to_dict(df)

    # Client reproduces this dataframe
    df_clone = server_utils.dataframe_from_dict(serialized)

    # Each subset of columns under the top-level names should be equal
    top_lvl_names = df.columns.get_level_values(0)
    for top_lvl_name in filter(lambda n: n not in ("start", "end"), top_lvl_names):
        assert np.allclose(df[top_lvl_name].values, df_clone[top_lvl_name].values)
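
model_utils.make_base_dataframe isn't shown; a sketch of what it plausibly
builds (two-level columns keyed by "model-input" / "model-output" over the tag
names; the real helper evidently also adds "start" / "end" columns, omitted
here) could be:

import numpy as np
import pandas as pd
from typing import List


def make_base_dataframe(
    tags: List[str], model_input: np.ndarray, model_output: np.ndarray
) -> pd.DataFrame:
    # Top level names the data's role, second level names the tag.
    columns = pd.MultiIndex.from_product([["model-input", "model-output"], tags])
    values = np.hstack((model_input, model_output))
    return pd.DataFrame(values, columns=columns)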