def data_fetcher(self):
    """Fixture: a DataFetcher over five time-series specs with mixed settings.

    Covers id- and external_id-based specs, three aggregates, two
    granularities, and one raw (non-aggregated) series.
    """
    specs = {
        "ts1": TimeSeriesSpec(id=1234, start=3000, end=5000, aggregate="average", granularity="1s"),
        "ts2": TimeSeriesSpec(id=2345, start=3000, end=5000, aggregate="max", granularity="1s"),
        "ts3": TimeSeriesSpec(id=3456, start=4000, end=9000, aggregate="min", granularity="1s"),
        "ts4": TimeSeriesSpec(external_id="abc", start=3000, end=5000, aggregate="average", granularity="1m"),
        "ts5": TimeSeriesSpec(id=5678, start=6000, end=8000),
    }
    return DataFetcher(DataSpec(time_series=specs))
def train(open_artifact, data_spec):
    """Fit a linear model with an intercept by least squares and persist it.

    open_artifact: The train method must accept an open_artifact argument.
        This is a function that works the same way as the builtin open(),
        except it reads from and writes to the root of a special storage
        location in the cloud that belongs to the current model version.
    data_spec: An argument we pass in ourself when we initiate the training.
    api_key, project: Optional arguments that are passed in automatically
        from Model Hosting if you specify them.
    """
    data_fetcher = DataFetcher(data_spec)
    data_fetcher.files.fetch("data")
    data_fetcher.files.fetch("target")

    X = pd.read_csv("data")
    y = pd.read_csv("target")

    # Add a feature of constant value 1 so the model learns an intercept
    X.insert(0, "f0", 1)

    # Least squares via np.linalg.lstsq, which solves min ||X b - y||_2
    # directly. This is numerically more stable than explicitly forming and
    # inverting X^T X (the normal equations square the condition number) and
    # it also copes with a rank-deficient design matrix. For full-rank X the
    # solution is identical to inv(X^T X) X^T y.
    beta_hat, *_ = np.linalg.lstsq(X.values, y.values, rcond=None)
    coefficients = pd.DataFrame(beta_hat, columns=["beta_hat"])

    # Persist our result
    with open_artifact("coefficients.csv", "w") as f:
        coefficients.to_csv(f, index=False)
def data_fetcher(self, file_ids) -> DataFetcher:
    """Fixture: a DataFetcher over four file specs, one file id aliased twice."""
    alias_to_filename = [
        ("a", "a.txt"),
        ("a_duplicate", "a.txt"),  # same underlying file as "a"
        ("b", "b.txt"),
        ("big", "big.txt"),
    ]
    file_specs = {alias: FileSpec(id=file_ids[name]) for alias, name in alias_to_filename}
    return DataFetcher(DataSpec(files=file_specs))
def data_fetcher(self, ts_ids, now) -> DataFetcher:
    """Fixture: raw and aggregated time-series specs over the last hour.

    Includes duplicated aliases pointing at identical specs, plus
    average/max/min aggregates at 1s and 1m granularities.
    """
    start = now - 3600 * 1000

    def spec(name, **agg_kwargs):
        # Every spec shares the same one-hour [start, now] window; aggregate
        # and granularity are only passed through when explicitly given.
        return TimeSeriesSpec(id=ts_ids[name], start=start, end=now, **agg_kwargs)

    time_series = {
        "constant3": spec("constant_3"),
        "constant3_duplicate": spec("constant_3"),
        "constant3_avg_1s": spec("constant_3", aggregate="average", granularity="1s"),
        "constant3_avg_1s_duplicate": spec("constant_3", aggregate="average", granularity="1s"),
        "constant3_avg_1m": spec("constant_3", aggregate="average", granularity="1m"),
        "constant3_max_1m": spec("constant_3", aggregate="max", granularity="1m"),
        "constant4": spec("constant_4"),
        "constant4_avg_1s": spec("constant_4", aggregate="average", granularity="1s"),
        "constant5_min_1s": spec("constant_5", aggregate="min", granularity="1s"),
        "constant6_max_1s": spec("constant_6", aggregate="max", granularity="1s"),
    }
    return DataFetcher(DataSpec(time_series=time_series))
def test_fetch_datapoints_single_many_datapoints(self, ts_ids, now):
    """Fetching one series spanning 48 hours yields a constant-valued frame."""
    two_days_ago = now - 48 * 3600 * 1000
    fetcher = DataFetcher(
        DataSpec(
            time_series={
                "constant3": TimeSeriesSpec(id=ts_ids["constant_3"], start=two_days_ago, end=now)
            }
        )
    )
    frame = fetcher.time_series.fetch_datapoints("constant3")
    self.assert_data_frame(frame, ["value"], {"value": 3})
def test_fetch_datapoints_many_time_series(self, ts_ids, now):
    """Fetching 100 series at once returns one constant-valued frame per alias."""
    start = now - 3600 * 1000
    aliases = ["constant{}".format(i) for i in range(100)]
    specs = {
        "constant{}".format(i): TimeSeriesSpec(
            id=ts_ids["constant_{}".format(i)], start=start, end=now
        )
        for i in range(100)
    }
    dfs = DataFetcher(DataSpec(time_series=specs)).time_series.fetch_datapoints(aliases)
    # Series "constant_i" is expected to hold the constant value i.
    for i, alias in enumerate(aliases):
        self.assert_data_frame(dfs[alias], ["value"], {"value": i})
def test_fetch_datapoints_multiple_many_datapoints(self, ts_ids, now):
    """Two long (48h) series fetched together each come back as their constant."""
    two_days_ago = now - 48 * 3600 * 1000
    specs = {
        "constant3": TimeSeriesSpec(id=ts_ids["constant_3"], start=two_days_ago, end=now),
        "constant4": TimeSeriesSpec(id=ts_ids["constant_4"], start=two_days_ago, end=now),
    }
    dfs = DataFetcher(DataSpec(time_series=specs)).time_series.fetch_datapoints(
        ["constant3", "constant4"]
    )
    self.assert_data_frame(dfs["constant3"], ["value"], {"value": 3})
    self.assert_data_frame(dfs["constant4"], ["value"], {"value": 4})
def predict(self, instance):
    """Run scheduled prediction for the production rate.

    instance: Since we're doing scheduled prediction, this will be a data
    spec describing the data we should do prediction on. Note that it's
    also possible to take api_key and project in as optional arguments here.
    """
    fetcher = DataFetcher(instance)
    features = ["temp", "pressure", "rpm"]
    df = fetcher.time_series.fetch_dataframe(features).dropna()
    df["production_rate"] = self.regressor.predict(df[features].values)
    # Scheduled prediction must return output shaped like:
    #   {"timeSeries": {"production_rate": [(t0, p0), (t1, p1), ...]}}
    # The model hosting utility to_output() converts a dataframe into that
    # format for us.
    return to_output(df[["timestamp", "production_rate"]])
start=start, end=end, aggregate=aggregate, granularity=granularity), "gas_integ_time": TimeSeriesSpec(4988486819178408, start=start, end=end, aggregate=aggregate, granularity=granularity), "gas_gain": TimeSeriesSpec(3658191334677419, start=start, end=end), }) # Now lets fetch the data for our "gas_auto" and "gas_external" time series data_fetcher = DataFetcher(data_spec, api_key=os.getenv("COGNITE_OID_API_KEY"), project="publicdata") df = data_fetcher.time_series.fetch_dataframe(["gas_auto", "gas_external"]) print(df.head()) # When using fetch_dataframe all specified time series must have the same start, end, and granularity # To fetch data from times series with different specs, we can use the following method dfs = data_fetcher.time_series.fetch_datapoints(["gas_integ_time", "gas_gain"]) print(dfs["gas_gain"].head()) print(dfs["gas_integ_time"].head())
def predict(self, instance):
    """Compute y = x2 / x1 over the fetched frame and return its total size."""
    frame = DataFetcher(instance).time_series.fetch_dataframe(["x1", "x2"])
    frame["y"] = frame["x2"] / frame["x1"]
    # Note: .size is taken after the "y" column is added.
    return to_output(frame.size)
def predict(self, instance):
    """Return y = (x1 + x2) / pi together with the timestamp column."""
    fetcher = DataFetcher(instance)
    frame = fetcher.time_series.fetch_dataframe(["x1", "x2"])
    frame["y"] = (frame["x1"] + frame["x2"]) / math.pi
    return to_output(frame[["y", "timestamp"]])
def data_fetcher(self, file_specs, rsps):
    """Fixture: a DataFetcher wrapping the given file specs.

    rsps: presumably a request-mocking fixture activated by injection; it is
    not referenced directly here — TODO confirm against the test setup.
    """
    return DataFetcher(DataSpec(files=file_specs))
def test_get_data_spec():
    """get_data_spec must return a copy: mutating it leaves the original intact."""
    original = DataSpec(files={"f1": FileSpec(id=123)})
    returned = DataFetcher(original).get_data_spec()
    returned.files["f1"].id = 234
    assert original.files["f1"].id == 123
def test_invalid_spec_type():
    """DataFetcher must reject anything that is not a DataSpec instance."""
    with pytest.raises(SpecValidationError, match="has to be of type"):
        DataFetcher(123)
def test_empty_data_spec(rsps, data_spec):
    """An empty spec round-trips: the fetcher reports a default DataSpec."""
    fetcher = DataFetcher(data_spec)
    assert DataSpec() == fetcher.get_data_spec()
end=end, aggregate=aggregate, granularity=granularity), "gas_integ_time": TimeSeriesSpec(id=4988486819178408, start=start, end=end, aggregate=aggregate, granularity=granularity), "gas_gain": TimeSeriesSpec(id=3658191334677419, start=start, end=end), }) # Now lets fetch the data for our "gas_auto" and "gas_external" time series data_fetcher = DataFetcher(data_spec, api_key=os.getenv("COGNITE_OID_API_KEY"), project="publicdata", client_name="test-client") df = data_fetcher.time_series.fetch_dataframe(["gas_auto", "gas_external"]) print(df.head()) # When using fetch_dataframe all specified time series must have the same start, end, and granularity # To fetch data from times series with different specs, we can use the following method dfs = data_fetcher.time_series.fetch_datapoints(["gas_integ_time", "gas_gain"]) print(dfs["gas_gain"].head()) print(dfs["gas_integ_time"].head())
def predict(self, instance):
    """Return y = (x1 + x2) / pi for the fetched datapoints."""
    fetcher = DataFetcher(instance, client_name="simple-transform-client")
    frame = fetcher.time_series.fetch_dataframe(["x1", "x2"])
    frame["y"] = (frame["x1"] + frame["x2"]) / math.pi
    return to_output(frame[["y"]])
def predict(self, instance):
    """Return y = x2 / x1 for the fetched datapoints."""
    frame = DataFetcher(instance, client_name="cprfix-client").time_series.fetch_dataframe(
        ["x1", "x2"]
    )
    frame["y"] = frame["x2"] / frame["x1"]
    return to_output(frame[["y"]])