def test_feature_set_db(self):
    """Round-trip a feature set through the run DB.

    Saves a set, lists and fetches it back, ingests data, confirms that
    deletion is refused while targets still hold data, then purges the
    targets and verifies the delete succeeds.
    """
    name = "stocks_test"
    stocks_set = fs.FeatureSet(name, entities=[Entity("ticker", ValueType.STRING)])
    fs.preview(stocks_set, stocks)
    stocks_set.save()

    db = mlrun.get_run_db()
    assert len(db.list_feature_sets(self.project_name, name)) == 1, (
        "bad number of results"
    )

    feature_set = fs.get_feature_set(name, self.project_name)
    assert feature_set.metadata.name == name, "bad feature set response"

    fs.ingest(stocks_set, stocks)
    # Deleting a feature set whose targets still contain data must fail.
    with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
        fs.delete_feature_set(name, self.project_name)

    stocks_set.purge_targets()
    fs.delete_feature_set(name, self.project_name)
    assert not db.list_feature_sets(self.project_name, name), (
        "Feature set should be deleted"
    )
def test_check_permissions():
    """Verify that every feature-store API surface propagates authorization
    failures when the DB's permission check is mocked to deny access.

    The checked entry points: preview, ingest, get_offline_features,
    get_online_feature_service, deploy_ingestion_service, purge_targets.
    """
    data = pd.DataFrame(
        {
            "time_stamp": [
                pd.Timestamp("2021-06-09 09:30:06.008"),
                pd.Timestamp("2021-06-09 10:29:07.009"),
                pd.Timestamp("2021-06-09 09:29:08.010"),
            ],
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        }
    )
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])
    # Force every authorization check to raise access-denied.
    mlrun.db.FileRunDB.verify_authorization = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError("")
    )

    # pytest.raises replaces the try/`assert False`/except pattern: it fails
    # the test when the error is NOT raised, and keeps working under `-O`
    # (where bare asserts are stripped). Matches the style used elsewhere
    # in this file.
    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.preview(
            data_set1,
            data,
            entity_columns=[Entity("string")],
            timestamp_key="time_stamp",
        )

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())

    features = ["fs1.*"]
    feature_vector = fs.FeatureVector("test", features)
    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.get_offline_features(feature_vector, entity_timestamp_column="time_stamp")

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.get_online_feature_service(feature_vector)

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        fs.deploy_ingestion_service(featureset=data_set1)

    with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
        data_set1.purge_targets()
def test_multiple_entities(self):
    """Aggregate bids keyed by a composite (first_name, last_name) entity and
    read the windowed sum back through the online feature service.

    Fix over the previous version: the online service is now closed in a
    ``finally`` block, so a failing assertion no longer leaks the service.
    """
    name = f"measurements_{uuid.uuid4()}"
    current_time = pd.Timestamp.now()
    data = pd.DataFrame(
        {
            # Six samples, one minute apart, newest first (offset 0..5).
            "time": [
                current_time - pd.Timedelta(minutes=offset) for offset in range(6)
            ],
            "first_name": ["moshe", None, "yosi", "yosi", "moshe", "yosi"],
            "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
            "bid": [2000, 10, 11, 12, 2500, 14],
        }
    )

    # write to kv
    data_set = fs.FeatureSet(
        name, entities=[Entity("first_name"), Entity("last_name")]
    )
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
        # Emit after every event so the online value is up to date.
        emit_policy=EmitAfterMaxEvent(1),
    )
    fs.preview(
        data_set,
        data,  # source
        entity_columns=["first_name", "last_name"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )
    data_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )
    fs.ingest(data_set, data, return_df=True)

    features = [
        f"{name}.bids_sum_1h",
    ]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)
    try:
        resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
        # yosi levi's bids within the hour: 11 + 12 + 14 == 37.
        assert resp[0]["bids_sum_1h"] == 37.0
    finally:
        # Close even on assertion failure so the service is not leaked.
        svc.close()
def test_split_graph(self):
    """Build a branching ingestion graph with a non-default CSV side target
    and verify that both branches write the expected frames."""
    quotes_set = fs.FeatureSet("stock-quotes", entities=[fs.Entity("ticker")])

    # Main branch: map -> extend -> filter -> validator.
    quotes_set.graph.to("MyMap", "somemap1", field="multi1", multiplier=3).to(
        "storey.Extend", _fn="({'extra': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 70)").to(
        FeaturesetValidator()
    )

    # Side branch off the graph root.
    side_step_name = "side-step"
    quotes_set.graph.to(
        "storey.Extend", name=side_step_name, _fn="({'extra2': event['bid'] * 17})"
    )
    # Two leaves with no declared default final step is ambiguous -> rejected.
    with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
        fs.preview(quotes_set, quotes)

    non_default_target_name = "side-target"
    quotes_set.set_targets(
        targets=[CSVTarget(name=non_default_target_name, after_state=side_step_name)],
        default_final_step="FeaturesetValidator",
    )
    quotes_set.plot(with_targets=True)

    inf_out = fs.preview(quotes_set, quotes)
    ing_out = fs.ingest(quotes_set, quotes, return_df=True)

    default_file_out = pd.read_parquet(quotes_set.get_target_path(TargetTypes.parquet))
    side_file_out = pd.read_csv(quotes_set.get_target_path(non_default_target_name))

    self._split_graph_expected_default.set_index("ticker", inplace=True)
    # Preview, ingest, and the default (parquet) target must all agree.
    for frame in (default_file_out, ing_out, inf_out):
        assert all(self._split_graph_expected_default == frame.round(2))
    assert all(
        self._split_graph_expected_side.sort_index(axis=1)
        == side_file_out.sort_index(axis=1).round(2)
    )
def _ingest_quotes_featureset(self):
    """Ingest the quotes fixture through a map/extend/filter graph with
    sliding-window aggregations, checking the preview output and that
    ingestion populates the feature-set stats."""
    quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])

    graph = quotes_set.graph
    graph.to("MyMap", multiplier=3).to(
        "storey.Extend", _fn="({'z': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)").to(
        FeaturesetValidator()
    )

    # (name, source column, operations, window) -- all with a 10m period.
    for agg_name, column, operations, window in (
        ("asks1", "ask", ["sum", "max"], "1h"),
        ("asks2", "ask", ["sum", "max"], "5h"),
        ("bids", "bid", ["min", "max"], "1h"),
    ):
        quotes_set.add_aggregation(agg_name, column, operations, window, "10m")

    preview_df = fs.preview(
        quotes_set,
        quotes,
        entity_columns=["ticker"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )
    self._logger.info(f"quotes spec: {quotes_set.spec.to_yaml()}")
    assert preview_df["zz"].mean() == 9, "map didnt set the zz column properly"

    quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")
    quotes_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )

    ingest_df = fs.ingest(quotes_set, quotes, return_df=True)
    self._logger.info(f"output df:\n{ingest_df}")
    assert quotes_set.status.stats.get("asks1_sum_1h"), "stats not created"
def verify_ingest(
    base_data, keys, infer=False, targets=None, infer_options=None
):
    """Ingest *base_data* into a fresh feature set and verify the output.

    :param base_data:     source DataFrame to ingest
    :param keys:          entity column name or list of names
    :param infer:         when True, run schema inference via ``fs.preview``
                          instead of setting the index up front
    :param targets:       optional explicit target list (defaults disabled)
    :param infer_options: ingest inference options; ``None`` resolves to
                          ``fs.InferOptions.default()`` at call time

    Fix: the previous signature used ``infer_options=fs.InferOptions.default()``,
    which is evaluated once at import time and shared by every call; the
    ``None`` sentinel defers resolution to each invocation.
    """
    if infer_options is None:
        infer_options = fs.InferOptions.default()
    if isinstance(keys, str):
        keys = [keys]
    feature_set = fs.FeatureSet("my-feature-set")
    if infer:
        # Preview mutates the frame during inference, so work on a copy.
        data = base_data.copy()
        fs.preview(feature_set, data, entity_columns=keys)
    else:
        data = base_data.set_index(keys=keys)
    if targets:
        feature_set.set_targets(targets=targets, with_defaults=False)
    df = fs.ingest(feature_set, data, infer_options=infer_options)

    assert len(df) == len(data)
    if infer:
        data.set_index(keys=keys, inplace=True)
    # Row-by-row value comparison between the ingested output and the source.
    for actual, expected in zip(df.values, data.values):
        assert all(actual == expected)