def test_second_anomaly_prediction_endpoint_all_columns(
    second_base_route,
    sensors_str,
    influxdb,
    gordo_ml_server_client,
    sensors,
    resp_format,
):
    """
    Posting to the anomaly endpoint with ``all_columns=yes`` should include
    the smoothed per-tag and total anomaly columns in the response, for both
    JSON and parquet response formats.
    """
    n_tags = len(sensors_str)
    payload = {
        "X": np.random.random(size=(10, n_tags)).tolist(),
        "y": np.random.random(size=(10, n_tags)).tolist(),
    }
    url = (
        f"{second_base_route}/anomaly/prediction"
        f"?all_columns=yes&format={resp_format}"
    )

    response = gordo_ml_server_client.post(url, json=payload)
    assert response.status_code == 200

    if resp_format not in (None, "json"):
        frame = server_utils.dataframe_from_parquet_bytes(response.data)
    else:
        assert "data" in response.json
        frame = server_utils.dataframe_from_dict(response.json["data"])

    # The smoothed anomaly columns are only present with all_columns=yes
    expected_columns = (
        "smooth-tag-anomaly-scaled",
        "smooth-tag-anomaly-unscaled",
        "smooth-total-anomaly-scaled",
        "smooth-total-anomaly-unscaled",
    )
    for column in expected_columns:
        assert column in frame
def test_ml_server_dataframe_to_dict_and_back(sensors_str, use_test_project_tags):
    """
    Round-trip a model-output dataframe through the server's dict
    serialization (top-level column name -> lists of values) and verify the
    client can reconstruct equivalent values.  The second level of column
    names is not expected to survive the trip.
    """
    # Exercise either the real project tag names or synthetic letter names
    if use_test_project_tags:
        tags = sensors_str
    else:
        tags = [string.ascii_uppercase[i] for i in range(len(sensors_str))]

    n_rows = 10
    model_in = np.random.random((n_rows, len(tags)))
    model_out = np.random.random((n_rows, len(tags)))

    # Server side: build the multi-index dataframe and serialize it
    frame = model_utils.make_base_dataframe(tags, model_in, model_out)
    serialized = server_utils.dataframe_to_dict(frame)

    # Client side: rebuild the dataframe from the serialized dict
    rebuilt = server_utils.dataframe_from_dict(serialized)

    # Values under each top-level column name should match after the trip
    for name in frame.columns.get_level_values(0):
        if name in ("start", "end"):
            continue
        assert np.allclose(frame[name].values, rebuilt[name].values)
def test_dataframe_to_from_dict(expect_multi_lvl: bool, data: dict):
    """
    Creating dataframes from raw data structures must be deterministic about
    column levels: a dict of simple key -> array mappings should not produce
    MultiIndex columns, while nested structures should.
    """
    frame = server_utils.dataframe_from_dict(data)
    has_multi_lvl = isinstance(frame.columns, pd.MultiIndex)
    assert has_multi_lvl == expect_multi_lvl
def test_anomaly_prediction_endpoint(
    base_route,
    sensors_str,
    influxdb,
    gordo_ml_server_client,
    data_size,
    sensors,
    resp_format,
):
    """
    POSTing X/y arrays to the anomaly prediction endpoint returns the
    expected anomaly columns in either JSON or parquet format.

    Because raw arrays (rather than a time range) are posted, the server
    cannot know start/end timestamps, so those columns must be entirely NaN.
    """
    data_to_post = {
        "X": np.random.random(size=(data_size, len(sensors_str))).tolist(),
        "y": np.random.random(size=(data_size, len(sensors_str))).tolist(),
    }

    endpoint = f"{base_route}/anomaly/prediction"
    if resp_format is not None:
        endpoint += f"?format={resp_format}"

    resp = gordo_ml_server_client.post(endpoint, json=data_to_post)
    assert resp.status_code == 200

    if resp_format in (None, "json"):
        assert "data" in resp.json
        data = server_utils.dataframe_from_dict(resp.json["data"])
    else:
        data = server_utils.dataframe_from_parquet_bytes(resp.data)

    # A POST gives the server no time range, so start/end must be all-NaN.
    # (The original guarded this on `data_to_post is not None`, which was
    # trivially true here -- the else branch was unreachable dead code.)
    assert "start" in data
    assert "end" in data
    assert np.all(data["start"].isna())
    assert np.all(data["end"].isna())

    assert all(
        key in data
        for key in (
            "total-anomaly-scaled",
            "total-anomaly-unscaled",
            "tag-anomaly-scaled",
            "tag-anomaly-unscaled",
            "model-input",
            "model-output",
        )
    )
def test_dataframe_from_to_dict(df):
    """
    A dataframe serialized to a dict and deserialized again should be
    equivalent to the original, and the serialization step must not mutate
    the caller's index.
    """
    had_datetime_index = isinstance(df.index, pd.DatetimeIndex)

    round_tripped = server_utils.dataframe_from_dict(
        server_utils.dataframe_to_dict(df)
    )

    if had_datetime_index:
        # dataframe_to_dict must leave the original index untouched
        assert isinstance(df.index, pd.DatetimeIndex)

    assert np.allclose(df.values, round_tripped.values)
    assert df.columns.tolist() == round_tripped.columns.tolist()
    assert df.index.tolist() == round_tripped.index.tolist()
def test_prediction_endpoint_post_ok(
    base_route,
    sensors,
    sensors_str,
    gordo_ml_server_client,
    data_size,
    to_dict_arg,
    resp_format,
    send_as_parquet,
):
    """
    Valid data posts succeed: several JSON dataframe orientations, as well as
    a parquet-serialized payload, should all yield model input/output columns.
    """
    data_to_post = np.random.random(size=(data_size, len(sensors))).tolist()
    if to_dict_arg is not None:
        data_to_post = pd.DataFrame(data_to_post, columns=sensors_str).to_dict(
            to_dict_arg
        )

    endpoint = f"{base_route}/prediction"
    if resp_format is not None:
        endpoint += f"?format={resp_format}"

    if send_as_parquet:
        X = pd.DataFrame.from_dict(data_to_post)
        raw = server_utils.dataframe_into_parquet_bytes(X)
        kwargs = dict(data={"X": (io.BytesIO(raw), "X")})
    else:
        kwargs = dict(json={"X": data_to_post})

    resp = gordo_ml_server_client.post(endpoint, **kwargs)
    assert resp.status_code == 200

    if resp_format in (None, "json"):
        data = server_utils.dataframe_from_dict(resp.json["data"])
    else:
        data = server_utils.dataframe_from_parquet_bytes(resp.data)

    # The response must carry both model input and output columns
    assert all(key in data for key in ("model-output", "model-input"))
def dataframe_from_response(response: typing.Union[dict, bytes]) -> pd.DataFrame:
    """
    Parse an ML server response into a dataframe.

    Parameters
    ----------
    response: Union[dict, bytes]
        Either the JSON-parsed response (a dict holding the frame under its
        ``"data"`` key) or raw bytes loadable by
        :func:`server.utils.dataframe_from_parquet_bytes`.

    Returns
    -------
    pandas.DataFrame
    """
    if isinstance(response, dict):
        return server_utils.dataframe_from_dict(response["data"])
    return server_utils.dataframe_from_parquet_bytes(response)
def test_dataframe_from_dict_ordering(index):
    """
    ``dataframe_from_dict`` should order rows by the index, parsing string
    indexes as datetimes (or, failing that, as integers) and sorting in
    ascending order.
    """
    df = pd.DataFrame(np.random.random((10, 5)))
    df.index = index

    original = df.copy()

    # Build the expected result: parse a string index as datetimes or ints,
    # then sort ascending -- mirroring what dataframe_from_dict should do.
    if isinstance(original.index[0], str):
        try:
            original.index = original.index.map(dateutil.parser.isoparse)
        except ValueError:
            original.index = original.index.map(int)
    original.sort_index(inplace=True)

    # What we get
    df_out = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    # np.all replaces np.alltrue, which was deprecated and removed in NumPy 2.0
    assert np.all(df_out.index == original.index)
    assert np.all(df_out.values == original.values)