def test_override_false(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    df3 = pd.concat([df1, df2])

    fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["override-false.*"]
    fvec = fs.FeatureVector("override-false-vec", features=features)

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    # ingesting with overwrite=False appends, so both dataframes should be present
    fs.ingest(fset, df2, overwrite=False)

    off2 = fs.get_offline_features(fvec).to_dataframe()
    assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

    # overwrite only the parquet target with df1; the nosql target still holds df2
    fs.ingest(fset, df1, targets=[ParquetTarget()])

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()

    # CSV targets do not support appending, so overwrite=False must be rejected,
    # whether the target is passed to ingest or set on the feature set
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

    fset.set_targets(targets=[CSVTarget()])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, overwrite=False)

def test_purge(self):
    key = "patient_id"
    fset = fs.FeatureSet("purge", entities=[Entity(key)], timestamp_key="timestamp")
    path = os.path.relpath(str(self.assets_path / "testdata.csv"))
    source = CSVSource(
        "mycsv",
        path=path,
        time_field="timestamp",
    )
    targets = [
        CSVTarget(),
        CSVTarget(name="specified-path", path="v3io:///bigdata/csv-purge-test.csv"),
        ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
        NoSqlTarget(),
    ]
    fset.set_targets(
        targets=targets,
        with_defaults=False,
    )

    # purge all targets, then re-ingest and purge all but the last (NoSql) target;
    # verify_purge is sketched below
    fs.ingest(fset, source)
    verify_purge(fset, targets)

    fs.ingest(fset, source)
    targets_to_purge = targets[:-1]
    verify_purge(fset, targets_to_purge)

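
# `verify_purge` is used by test_purge above but is not defined in this section.
# The following is a minimal sketch, assuming it lives at module scope alongside
# these tests and relies on FeatureSet.purge_targets(); the real helper may also
# verify v3io/NoSql targets through their store drivers rather than the local
# filesystem checks used here.
def verify_purge(fset, targets):
    target_names = [t.name for t in targets]

    # the named targets should exist after ingestion (local file targets only)
    for name in target_names:
        path = fset.get_target_path(name=name)
        if path and not path.startswith("v3io://"):
            assert os.path.exists(path), f"target {name} was not written"

    fset.purge_targets(target_names=target_names)

    # after purging, the targets should be removed from the feature set status
    fset.reload(update_spec=False)
    remaining = {t.name for t in fset.status.targets}
    assert not remaining & set(target_names), "purged targets still in status"
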
def test_read_csv(self):
    from storey import ReadCSV, ReduceToDataFrame, build_flow

    # build the target file name directly; joining ".csv" as a path component
    # would have produced a "<name>/.csv" directory layout instead
    csv_path = str(self.results_path / (_generate_random_name() + ".csv"))
    targets = [CSVTarget("mycsv", path=csv_path)]
    stocks_set = fs.FeatureSet("tests", entities=[Entity("ticker", ValueType.STRING)])
    fs.ingest(
        stocks_set,
        stocks,
        infer_options=fs.InferOptions.default(),
        targets=targets,
    )

    # read the CSV back through storey and collapse the stream into a dataframe
    controller = build_flow([ReadCSV(csv_path), ReduceToDataFrame()]).run()
    termination_result = controller.await_termination()

    expected = pd.DataFrame(
        {
            0: ["ticker", "MSFT", "GOOG", "AAPL"],
            1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
            2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
        }
    )
    assert termination_result.equals(expected), f"{termination_result}\n!=\n{expected}"
    os.remove(csv_path)

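
# `_generate_random_name` is referenced above but not shown in this section.
# A minimal sketch, assuming all the tests need is a unique, file-name-safe
# string to avoid collisions between runs:
def _generate_random_name():
    import random
    import string

    return "".join(random.choice(string.ascii_lowercase) for _ in range(10))
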
def test_serverless_ingest(self):
    key = "patient_id"
    measurements = fs.FeatureSet(
        "measurements", entities=[Entity(key)], timestamp_key="timestamp"
    )
    target_path = os.path.relpath(str(self.results_path / "mycsv.csv"))
    source = CSVSource(
        "mycsv", path=os.path.relpath(str(self.assets_path / "testdata.csv"))
    )
    targets = [CSVTarget("mycsv", path=target_path)]
    if os.path.exists(target_path):
        os.remove(target_path)

    fs.ingest(
        measurements,
        source,
        targets,
        infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
        run_config=fs.RunConfig(local=True),
    )
    assert os.path.exists(target_path), "result file was not generated"

    # every inferred feature except the timestamp key should have stats
    features = sorted(measurements.spec.features.keys())
    stats = sorted(measurements.status.stats.keys())
    print(features)
    print(stats)
    stats.remove("timestamp")
    assert features == stats, "didn't infer stats for all features"

def test_serverless_ingest():
    init_store()

    key = "patient_id"
    measurements = fs.FeatureSet(
        "measurements", entities=[Entity(key)], timestamp_key="timestamp"
    )
    target_path = os.path.relpath(results_dir + "mycsv.csv")
    source = CSVSource("mycsv", path=os.path.relpath(local_dir + "testdata.csv"))
    targets = [CSVTarget("mycsv", path=target_path)]
    if os.path.exists(target_path):
        os.remove(target_path)

    run_ingestion_job(
        measurements,
        source,
        targets,
        name="test_ingest",
        infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
        parameters={},
        function=None,
        local=True,
    )
    assert os.path.exists(target_path), "result file was not generated"

    # every inferred feature except the timestamp key should have stats
    features = sorted(measurements.spec.features.keys())
    stats = sorted(measurements.status.stats.keys())
    print(features)
    print(stats)
    stats.remove("timestamp")
    assert features == stats, "didn't infer stats for all features"
    print(measurements.to_yaml())

def test_csv_parquet_index_alignment(self):
    # ingest the same quotes through a CSV target and a parquet target;
    # prepare_feature_set is sketched below
    targets = [CSVTarget()]
    csv_align_set, _ = prepare_feature_set(
        "csv-align", "ticker", quotes, timestamp_key="time", targets=targets
    )
    csv_df = csv_align_set.to_dataframe()

    features = ["csv-align.*"]
    csv_vec = fs.FeatureVector("csv-align-vector", features)
    resp = fs.get_offline_features(csv_vec)
    csv_vec_df = resp.to_dataframe()

    targets = [ParquetTarget()]
    parquet_align_set, _ = prepare_feature_set(
        "parquet-align", "ticker", quotes, timestamp_key="time", targets=targets
    )
    parquet_df = parquet_align_set.to_dataframe()

    features = ["parquet-align.*"]
    parquet_vec = fs.FeatureVector("parquet-align-vector", features)
    resp = fs.get_offline_features(parquet_vec)
    parquet_vec_df = resp.to_dataframe()

    # both target formats should yield identically indexed dataframes
    assert all(csv_df == parquet_df)
    assert all(csv_vec_df == parquet_vec_df)

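
# `prepare_feature_set` is a shared helper not included in this section. Its
# signature and return value are pinned by the call sites above; the body below
# is a minimal sketch, assuming it only needs to build the feature set, register
# the given targets, and ingest the dataframe:
def prepare_feature_set(name, entity, data, timestamp_key=None, targets=None):
    fset = fs.FeatureSet(
        name, entities=[fs.Entity(entity)], timestamp_key=timestamp_key
    )
    # fall back to the default targets when none are specified
    fset.set_targets(targets=targets, with_defaults=(targets is None))
    df = fs.ingest(fset, data, infer_options=fs.InferOptions.default())
    return fset, df
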
def test_split_graph(self):
    quotes_set = fs.FeatureSet("stock-quotes", entities=[fs.Entity("ticker")])

    quotes_set.graph.to("MyMap", "somemap1", field="multi1", multiplier=3).to(
        "storey.Extend", _fn="({'extra': event['bid'] * 77})"
    ).to("storey.Filter", "filter", _fn="(event['bid'] > 70)").to(
        FeaturesetValidator()
    )

    # add a side branch; until it leads to a target, the graph is invalid
    side_step_name = "side-step"
    quotes_set.graph.to(
        "storey.Extend", name=side_step_name, _fn="({'extra2': event['bid'] * 17})"
    )
    with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
        fs.infer_metadata(quotes_set, quotes)

    non_default_target_name = "side-target"
    quotes_set.set_targets(
        targets=[CSVTarget(name=non_default_target_name, after_state=side_step_name)],
        default_final_state="FeaturesetValidator",
    )
    quotes_set.plot(with_targets=True)

    inf_out = fs.infer_metadata(quotes_set, quotes)
    ing_out = fs.ingest(quotes_set, quotes, return_df=True)

    default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
    side_file_path = quotes_set.get_target_path(non_default_target_name)

    side_file_out = pd.read_csv(side_file_path)
    default_file_out = pd.read_parquet(default_file_path)
    self._split_graph_expected_default.set_index("ticker", inplace=True)

    assert all(self._split_graph_expected_default == default_file_out.round(2))
    assert all(self._split_graph_expected_default == ing_out.round(2))
    assert all(self._split_graph_expected_default == inf_out.round(2))

    assert all(
        self._split_graph_expected_side.sort_index(axis=1)
        == side_file_out.sort_index(axis=1).round(2)
    )

def test_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

    features = ["overwrite-fs.*"]
    fvec = fs.FeatureVector("overwrite-vec", features=features)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0]["value"] == 3
    svc.close()

    # a second ingest with the default targets overwrites parquet and nosql;
    # the csv target is not part of this ingest, so it still holds df1
    fs.ingest(fset, df2)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0] is None
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()