def test_multiple_entities(self):
    """Ingest a feature set keyed by two entities and verify one of its
    aggregations through the online feature service."""
    name = f"measurements_{uuid.uuid4()}"
    now = pd.Timestamp.now()
    # Six minute-spaced events; (yosi, levi) occurs four times with bids
    # 10 + 11 + 12 + 14 = 47, the value the 1h sum window should serve.
    data = pd.DataFrame(
        {
            "time": [now - pd.Timedelta(minutes=offset) for offset in range(6)],
            "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
            "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
            "bid": [2000, 10, 11, 12, 2500, 14],
        }
    )

    # Feature set with a composite (first_name, last_name) entity key,
    # written to the kv store on ingestion.
    data_set = fs.FeatureSet(
        name, entities=[Entity("first_name"), Entity("last_name")]
    )
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
        emit_policy=EmitAfterMaxEvent(1),
    )
    fs.infer_metadata(
        data_set,
        data,  # source
        entity_columns=["first_name", "last_name"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )

    data_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )
    fs.ingest(data_set, data, return_df=True)

    # Query the aggregated feature back through the online service.
    vector = fs.FeatureVector("my-vec", [f"{name}.bids_sum_1h"])
    svc = fs.get_online_feature_service(vector)
    resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
    assert resp[0]["bids_sum_1h"] == 47.0
    svc.close()
def test_split_graph(self):
    """Exercise a split graph: a default branch ending in a validator and a
    side branch feeding its own CSV target, then compare all outputs."""
    quotes_set = fs.FeatureSet("stock-quotes", entities=[fs.Entity("ticker")])

    # Default branch: map -> extend -> filter -> validator.
    quotes_set.graph.to("MyMap", "somemap1", field="multi1", multiplier=3).to(
        "storey.Extend", _fn="({'extra': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 70)").to(
        FeaturesetValidator()
    )

    # Side branch hanging directly off the graph root.
    side_step_name = "side-step"
    quotes_set.graph.to(
        "storey.Extend", name=side_step_name, _fn="({'extra2': event['bid'] * 17})"
    )

    # Two leaf states and no declared final state -> inference must refuse.
    with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
        fs.infer_metadata(quotes_set, quotes)

    non_default_target_name = "side-target"
    quotes_set.set_targets(
        targets=[
            CSVTarget(name=non_default_target_name, after_state=side_step_name)
        ],
        default_final_state="FeaturesetValidator",
    )
    quotes_set.plot(with_targets=True)

    inf_out = fs.infer_metadata(quotes_set, quotes)
    ing_out = fs.ingest(quotes_set, quotes, return_df=True)

    side_file_out = pd.read_csv(
        quotes_set.get_target_path(non_default_target_name)
    )
    default_file_out = pd.read_parquet(
        quotes_set.get_target_path(TargetTypes.parquet)
    )

    self._split_graph_expected_default.set_index("ticker", inplace=True)

    # Default-branch output must match on every path: file, ingest, infer.
    assert all(self._split_graph_expected_default == default_file_out.round(2))
    assert all(self._split_graph_expected_default == ing_out.round(2))
    assert all(self._split_graph_expected_default == inf_out.round(2))
    assert all(
        self._split_graph_expected_side.sort_index(axis=1)
        == side_file_out.sort_index(axis=1).round(2)
    )
def _ingest_quotes_featureset(self):
    """Build the stock-quotes feature set (map/extend/filter/validator graph
    plus rolling aggregations), infer its schema, and ingest the quotes df."""
    quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])

    graph = quotes_set.graph
    graph.to("MyMap", multiplier=3).to(
        "storey.Extend", _fn="({'z': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
        FeaturesetValidator()
    )

    quotes_set.add_aggregation("asks", "ask", ["sum", "max"], ["1h", "5h"], "10m")
    quotes_set.add_aggregation("bids", "bid", ["min", "max"], ["1h"], "10m")

    df = fs.infer_metadata(
        quotes_set,
        quotes,
        entity_columns=["ticker"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )
    self._logger.info(f"quotes spec: {quotes_set.spec.to_yaml()}")
    assert df["zz"].mean() == 9, "map didnt set the zz column properly"

    # Attach a value validator to the bid feature before the real ingest.
    quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

    quotes_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )
    df = fs.ingest(quotes_set, quotes, return_df=True)
    self._logger.info(f"output df:\n{df}")
    assert quotes_set.status.stats.get("asks_sum_1h"), "stats not created"
def test_serverless_ingest():
    """Run a local ingestion task from a CSV source into a CSV target and
    confirm the target file gets written."""
    init_store()

    feature_set = fs.FeatureSet("measurements", entities=[Entity("patient_id")])
    source_df = pd.read_csv(local_dir + "testdata.csv")
    inferred = fs.infer_metadata(
        feature_set,
        source_df,
        timestamp_key="timestamp",
        options=fs.InferOptions.default(),
    )
    print(inferred.head(5))

    source = CSVSource("mycsv", path=os.path.relpath(local_dir + "testdata.csv"))
    target_path = os.path.relpath(results_dir + "mycsv.csv")
    targets = [CSVTarget("mycsv", path=target_path)]
    # Remove any stale output so the existence assertion below is meaningful.
    if os.path.exists(target_path):
        os.remove(target_path)

    run_ingestion_task(
        feature_set,
        source,
        targets,
        name="test_ingest",
        infer_options=fs.InferOptions.Null,
        parameters={},
        function=None,
        local=True,
    )
    assert os.path.exists(target_path), "result file was not generated"
def test_feature_set_db(self):
    """Save a feature set, read it back via list/get, then delete it and
    verify it no longer shows up in listings."""
    name = "stocks_test"
    stocks_set = fs.FeatureSet(name, entities=[Entity("ticker", ValueType.STRING)])
    fs.infer_metadata(
        stocks_set,
        stocks,
    )
    stocks_set.save()

    db = mlrun.get_run_db()
    listed = db.list_feature_sets(self.project_name, name)
    assert len(listed) == 1, "bad number of results"

    fetched = fs.get_feature_set(name, self.project_name)
    assert fetched.metadata.name == name, "bad feature set response"

    fs.delete_feature_set(name, self.project_name)
    listed = db.list_feature_sets(self.project_name, name)
    assert not listed, "Feature set should be deleted"
def test_feature_set_db():
    """Persist a feature set to the run DB and read it back by name."""
    init_store()

    name = "stocks_test"
    stocks_set = fs.FeatureSet(name, entities=[Entity("ticker", ValueType.STRING)])
    fs.infer_metadata(
        stocks_set,
        stocks,
    )
    stocks_set.save()

    db = mlrun.get_run_db()
    listed = db.list_feature_sets("", name)
    assert len(listed) == 1, "bad number of results"

    fetched = db.get_feature_set(name)
    assert fetched.metadata.name == name, "bad feature set response"
def verify_ingest(
    base_data, keys, infer=False, targets=None, infer_options=None
):
    """Ingest *base_data* into a fresh feature set and verify the result.

    :param base_data:     source DataFrame (not mutated when infer=False;
                          copied when infer=True)
    :param keys:          entity column name, or list of names
    :param infer:         when True, let infer_metadata detect entities from
                          the data instead of pre-setting the index
    :param targets:       optional explicit target list (disables defaults)
    :param infer_options: ingestion infer options; when None, resolved to
                          fs.InferOptions.default() at call time. (The
                          original default called fs.InferOptions.default()
                          in the signature, freezing its value at import
                          time — the evaluated-once default-argument
                          pitfall.)
    """
    if infer_options is None:
        infer_options = fs.InferOptions.default()
    if isinstance(keys, str):
        keys = [keys]

    feature_set = fs.FeatureSet("my-feature-set")
    if infer:
        data = base_data.copy()
        fs.infer_metadata(feature_set, data, entity_columns=keys)
    else:
        data = base_data.set_index(keys=keys)
    if targets:
        feature_set.set_targets(targets=targets, with_defaults=False)

    df = fs.ingest(feature_set, data, infer_options=infer_options)

    # The ingested frame must match the (key-indexed) source row-for-row.
    assert len(df) == len(data)
    if infer:
        data.set_index(keys=keys, inplace=True)
    for out_row, src_row in zip(df.values, data.values):
        assert all(out_row == src_row)