def _raw_data(
    self, endpoint: EndpointMetadata, start: datetime, end: datetime
) -> pd.DataFrame:
    """
    Fetch the raw data for this time range required to satisfy this endpoint's
    /prediction POST

    Parameters
    ----------
    endpoint: EndpointMetadata
        Named tuple representing the endpoint info from Watchman
    start: datetime
    end: datetime

    Returns
    -------
    pandas.core.DataFrame
        Dataframe of the required tags, indexed by datetime
    """
    # Adjust for any model offset: if the model outputs less than it takes in,
    # it needs more data than the range we're asked to predict for. The extra
    # intervals give us some buffer zone.
    start = self._adjust_for_offset(
        dt=start,
        resolution=endpoint.resolution,
        n_intervals=endpoint.model_offset + 5,
    )
    dataset = TimeSeriesDataset(
        data_provider=self.data_provider,  # type: ignore
        from_ts=start,
        to_ts=end,
        resolution=endpoint.resolution,
        tag_list=endpoint.tag_list,
        target_tag_list=endpoint.target_tag_list,
    )
    return dataset.get_data()
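# Illustrative sketch only, not the project's implementation: assuming
# `resolution` is a pandas frequency string such as "10T", the offset
# adjustment above amounts to shifting the query start back by `n_intervals`
# resolution buckets so the model still has enough history after applying its
# output offset. The function name below is hypothetical.
def _adjust_for_offset_sketch(dt: datetime, resolution: str, n_intervals: int) -> pd.Timestamp:
    return pd.Timestamp(dt) - pd.tseries.frequencies.to_offset(resolution) * n_intervals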
def get(self):
    """
    Process a GET request by fetching data ourselves
    """
    context = dict()  # type: typing.Dict[str, typing.Any]
    context["status-code"] = 200

    params = request.get_json() or request.args
    if not all(k in params for k in ("start", "end")):
        message = dict(
            message="must provide iso8601 formatted dates with timezone-information for parameters 'start' and 'end'"
        )
        return make_response((jsonify(message), 400))

    try:
        start = self._parse_iso_datetime(params["start"])
        end = self._parse_iso_datetime(params["end"])
    except ValueError:
        logger.error(
            f"Failed to parse start and/or end date to ISO: start: "
            f"{params['start']} - end: {params['end']}"
        )
        message = dict(
            message="Could not parse start/end date(s) into ISO datetime. must provide iso8601 formatted dates for both."
        )
        return make_response((jsonify(message), 400))

    # Requested time span must be less than one day
    if (end - start).days:
        message = dict(message="Need to request a time span less than 24 hours.")
        return make_response((jsonify(message), 400))

    logger.debug("Fetching data from data provider")
    before_data_fetch = timeit.default_timer()
    dataset = TimeSeriesDataset(
        data_provider=g.data_provider,
        from_ts=start - self.frequency.delta,
        to_ts=end,
        resolution=current_app.metadata["dataset"]["resolution"],
        tag_list=self.tags,
    )
    X, _y = dataset.get_data()
    logger.debug(
        f"Fetching data from data provider took "
        f"{timeit.default_timer() - before_data_fetch} seconds"
    )

    # Want resampled buckets equal to or greater than start, but less than end,
    # because if end == 00:00:00 and the resolution is 10 minutes, a resampled
    # bucket starting at 00:00:00 would imply it has data until 00:10:00, which
    # is past the requested end datetime.
    X = X[
        (X.index > start - self.frequency.delta)
        & (X.index + self.frequency.delta < end)
    ]
    return self._process_request(context=context, X=X)
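# Hypothetical client call against the GET handler above; the host, port and
# path are assumptions, not taken from the source. Dates must be ISO 8601 with
# timezone information and span less than 24 hours.
import requests

resp = requests.get(
    "http://localhost:5555/prediction",
    params={
        "start": "2019-01-01T00:00:00+00:00",
        "end": "2019-01-01T12:00:00+00:00",
    },
)
print(resp.status_code, resp.json())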
def test_faked_DataLakeBackedDataset(self, _mocked_method):
    config = dict(
        from_ts=dateutil.parser.isoparse("2014-07-01T00:10:00+00:00"),
        to_ts=dateutil.parser.isoparse("2015-01-01T00:00:00+00:00"),
        tag_list=[
            "asgb.19ZT3950%2FY%2FPRIM",
            "asgb.19PST3925%2FDispMeasOut%2FPRIM",
        ],
    )
    provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
    dataset = TimeSeriesDataset(data_provider=provider, **config)

    # Should be able to call get_data without being asked to authenticate in tests
    X, y = dataset.get_data()
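# The test above relies on the Data Lake authentication step being patched out
# (hence the `_mocked_method` argument). A hypothetical wiring -- the actual
# patch target in the project may differ:
#
#   from unittest import mock
#
#   @mock.patch.object(DataLakeProvider, "get_client", return_value=mock.MagicMock())
#   def test_faked_DataLakeBackedDataset(self, _mocked_method):
#       ...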
def test_timeseries_target_tags(tag_list, target_tag_list):
    start = dateutil.parser.isoparse("2017-12-25 06:00:00Z")
    end = dateutil.parser.isoparse("2017-12-29 06:00:00Z")
    tsd = TimeSeriesDataset(MockDataSource(), start, end, tag_list, target_tag_list)
    X, y = tsd.get_data()

    # If we have targets, X and y should be of equal length and y should have
    # one column per target tag
    if target_tag_list:
        assert len(X) == len(y)
        assert y.shape[1] == len(target_tag_list)

        # Ensure the order is maintained
        assert [tag.name for tag in target_tag_list] == y.columns.tolist()
    else:
        assert y is None

    # Features should match the tag_list
    assert X.shape[1] == len(tag_list)

    # Ensure the order is maintained
    assert [tag.name for tag in tag_list] == X.columns.tolist()
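# Minimal sketch of a data source the test above could run against. The
# `load_series`/`can_handle_tag` interface is an assumption about the data
# provider contract, not the project's actual MockDataSource.
class MockDataSourceSketch:
    def can_handle_tag(self, tag):
        return True

    def load_series(self, from_ts, to_ts, tag_list, dry_run=False):
        # One random-valued series per requested tag, on a 10-minute index
        index = pd.date_range(from_ts, to_ts, freq="10T")
        for tag in tag_list:
            yield pd.Series(np.random.random(len(index)), index=index, name=tag.name)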
def wrapper_method(self, *args, **kwargs):

    # Data provided by the client
    if request.method == "POST":
        X = request.json.get("X")
        y = request.json.get("y")

        if X is None:
            message = dict(message='Cannot predict without "X"')
            return make_response((jsonify(message), 400))

        # Convert X and (maybe) y into dataframes.
        X = dataframe_from_dict(X, tags=list(tag.name for tag in self.tags), name="X")

        # y may be None for BaseView; view(s) like Anomaly might require it.
        if y is not None and self.target_tags:
            y = dataframe_from_dict(
                y, list(tag.name for tag in self.target_tags), name="y"
            )

        # If either X or y came back as a Response type, there was an error
        for data_or_resp in [X, y]:
            if isinstance(data_or_resp, Response):
                return data_or_resp

    # Data must be queried from the data provider (e.g. Influx) given the dates
    # passed in the request.
    elif request.method == "GET":

        params = request.get_json() or request.args
        if not all(k in params for k in ("start", "end")):
            message = dict(
                message="must provide iso8601 formatted dates with timezone-information for parameters 'start' and 'end'"
            )
            return make_response((jsonify(message), 400))

        # Extract the dates from parameters
        try:
            start = parse_iso_datetime(params["start"])
            end = parse_iso_datetime(params["end"])
        except ValueError:
            logger.error(
                f"Failed to parse start and/or end date to ISO: start: "
                f"{params['start']} - end: {params['end']}"
            )
            message = dict(
                message="Could not parse start/end date(s) into ISO datetime. must provide iso8601 formatted dates for both."
            )
            return make_response((jsonify(message), 400))

        # Requested time span must be less than one day
        if (end - start).days:
            message = dict(message="Need to request a time span less than 24 hours.")
            return make_response((jsonify(message), 400))

        logger.debug("Fetching data from data provider")
        before_data_fetch = timeit.default_timer()
        dataset = TimeSeriesDataset(
            data_provider=g.data_provider,
            from_ts=start - self.frequency.delta,
            to_ts=end,
            resolution=current_app.metadata["dataset"]["resolution"],
            tag_list=self.tags,
            target_tag_list=self.target_tags or None,
        )
        X, y = dataset.get_data()
        logger.debug(
            f"Fetching data from data provider took "
            f"{timeit.default_timer() - before_data_fetch} seconds"
        )

        # Want resampled buckets equal to or greater than start, but less than end,
        # because if end == 00:00:00 and the resolution is 10 minutes, a resampled
        # bucket starting at 00:00:00 would imply it has data until 00:10:00, which
        # is past the requested end datetime.
        X = X[
            (X.index > start - self.frequency.delta)
            & (X.index + self.frequency.delta < end)
        ]

        # TODO: Remove/rework this once we add target_tags assignments in the
        # workflow generator for autoencoders.
        if y is None:
            y = X.copy()
        else:
            y = y.loc[X.index]

    else:
        raise NotImplementedError(
            f"Cannot extract X and y from '{request.method}' request."
        )

    # Assign X and y to the request's global context
    g.X, g.y = X, y

    # And run the original method.
    return method(self, *args, **kwargs)
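# Sketch of how a view method would typically be wrapped by the decorator that
# encloses wrapper_method above. The decorator name `extract_X_y` and the view
# class are assumptions for illustration; the wrapper fills g.X / g.y before
# the handler runs.
class AnomalyView(BaseModelView):  # hypothetical view class
    @extract_X_y  # hypothetical name for the decorator built around wrapper_method
    def post(self):
        # wrapper_method has already parsed/fetched the data by this point
        X, y = g.X, g.y
        return make_response((jsonify({"n-rows": len(X)}), 200))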
def get(self):
    context = dict()  # type: typing.Dict[str, typing.Any]
    context["status-code"] = 200
    start_time = timeit.default_timer()

    params = request.get_json() or request.args
    if not all(k in params for k in ("start", "end")):
        return (
            {
                "error": "must provide iso8601 formatted dates with "
                "timezone-information for parameters 'start' and 'end'"
            },
            400,
        )

    try:
        start = self._parse_iso_datetime(params["start"])
        end = self._parse_iso_datetime(params["end"])
    except ValueError:
        logger.error(
            f"Failed to parse start and/or end date to ISO: start: "
            f"{params['start']} - end: {params['end']}"
        )
        return (
            {
                "error": "Could not parse start/end date(s) into ISO datetime. "
                "must provide iso8601 formatted dates for both."
            },
            400,
        )

    # Requested time span must be less than one day
    if (end - start).days:
        return {"error": "Need to request a time span less than 24 hours."}, 400

    freq = pd.tseries.frequencies.to_offset(
        current_app.metadata["dataset"]["resolution"]
    )

    dataset = TimeSeriesDataset(
        data_provider=g.data_provider,
        from_ts=start - freq.delta,
        to_ts=end,
        resolution=current_app.metadata["dataset"]["resolution"],
        tag_list=sensor_tag.normalize_sensor_tags(
            current_app.metadata["dataset"]["tag_list"]
        ),
    )
    X, _y = dataset.get_data()

    # Want resampled buckets equal to or greater than start, but less than end,
    # because if end == 00:00:00 and the resolution is 10 minutes, a resampled
    # bucket starting at 00:00:00 would imply it has data until 00:10:00, which
    # is past the requested end datetime.
    X = X[(X.index > start - freq.delta) & (X.index + freq.delta < end)]

    try:
        xhat = self.get_predictions(X).tolist()

    # The model may only be a transformer; that's probably an AttributeError,
    # but catch everything to avoid logging other exceptions twice if it happens.
    except Exception as exc:
        logger.critical(f"Failed to predict or transform; error: {exc}")
        return (
            {"error": "Something unexpected happened; check your input data"},
            400,
        )

    # In GET requests we need to pair the resulting predictions with their
    # specific timestamps and match the predictions to the corresponding tags.
    data = []

    # This tags list is just for display/informative purposes, skipping the asset
    tags = [tag["name"] for tag in current_app.metadata["dataset"]["tag_list"]]

    for prediction, time_stamp in zip(xhat, X.index[-len(xhat):]):

        # Auto encoders return double their input:
        # the first half is the input to the model, the second half its output
        tag_inputs = np.array(prediction[: len(tags)])
        tag_outputs = np.array(prediction[len(tags):])
        tag_errors = np.abs(tag_inputs - tag_outputs)

        data.append(
            {
                "start": f"{time_stamp}",
                "end": f"{time_stamp + freq}",
                "tags": {tag: error for tag, error in zip(tags, tag_errors)},
                "total_anomaly": np.linalg.norm(tag_inputs - tag_outputs),
            }
        )

    context["output"] = data
    context["time-seconds"] = f"{timeit.default_timer() - start_time:.4f}"
    return context, context["status-code"]
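# Worked example of the per-bucket anomaly computation above, with made-up
# numbers for a two-tag autoencoder prediction: the first half of `prediction`
# is the model input, the second half its reconstruction.
prediction = [1.0, 2.0, 1.1, 1.7]  # [in_tag1, in_tag2, out_tag1, out_tag2]
tag_inputs = np.array(prediction[:2])                      # [1.0, 2.0]
tag_outputs = np.array(prediction[2:])                     # [1.1, 1.7]
tag_errors = np.abs(tag_inputs - tag_outputs)              # [0.1, 0.3]
total_anomaly = np.linalg.norm(tag_inputs - tag_outputs)   # sqrt(0.1**2 + 0.3**2) ≈ 0.316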