# Imports assumed for the tests below (exact module paths may vary across mlrun
# versions). Helpers such as init_store, run_ingestion_job, results_dir, local_dir,
# stocks, verify_purge and _generate_random_name are defined elsewhere in the test
# module and are not shown in this section.
import os
import uuid
from datetime import datetime

import fsspec
import pandas as pd
import pyarrow.parquet as pq
from pandas.testing import assert_frame_equal

import mlrun.feature_store as fs
from mlrun.data_types import ValueType
from mlrun.datastore.sources import CSVSource
from mlrun.datastore.targets import (
    CSVTarget,
    NoSqlTarget,
    ParquetTarget,
    TargetTypes,
    get_default_prefix_for_target,
)
from mlrun.feature_store import Entity


def test_serverless_ingest():
    init_store()

    key = "patient_id"
    measurements = fs.FeatureSet(
        "measurements", entities=[Entity(key)], timestamp_key="timestamp"
    )
    target_path = os.path.relpath(results_dir + "mycsv.csv")
    source = CSVSource("mycsv", path=os.path.relpath(local_dir + "testdata.csv"))
    targets = [CSVTarget("mycsv", path=target_path)]
    if os.path.exists(target_path):
        os.remove(target_path)

    run_ingestion_job(
        measurements,
        source,
        targets,
        name="test_ingest",
        infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
        parameters={},
        function=None,
        local=True,
    )
    assert os.path.exists(target_path), "result file was not generated"

    features = sorted(measurements.spec.features.keys())
    stats = sorted(measurements.status.stats.keys())
    print(features)
    print(stats)
    stats.remove("timestamp")
    assert features == stats, "didn't infer stats for all features"
    print(measurements.to_yaml())
    def test_serverless_ingest(self):
        key = "patient_id"
        measurements = fs.FeatureSet(
            "measurements", entities=[Entity(key)], timestamp_key="timestamp"
        )
        target_path = os.path.relpath(str(self.results_path / "mycsv.csv"))
        source = CSVSource(
            "mycsv", path=os.path.relpath(str(self.assets_path / "testdata.csv"))
        )
        targets = [CSVTarget("mycsv", path=target_path)]
        if os.path.exists(target_path):
            os.remove(target_path)

        fs.ingest(
            measurements,
            source,
            targets,
            infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
            run_config=fs.RunConfig(local=True),
        )
        assert os.path.exists(target_path), "result file was not generated"

        features = sorted(measurements.spec.features.keys())
        stats = sorted(measurements.status.stats.keys())
        print(features)
        print(stats)
        stats.remove("timestamp")
        assert features == stats, "didn't infer stats for all features"
    def test_read_csv(self):
        from storey import CSVSource, ReduceToDataFrame, build_flow

        csv_path = str(self.results_path / _generate_random_name() / ".csv")
        targets = [CSVTarget("mycsv", path=csv_path)]
        stocks_set = fs.FeatureSet(
            "tests", entities=[Entity("ticker", ValueType.STRING)]
        )
        fs.ingest(
            stocks_set, stocks, infer_options=fs.InferOptions.default(), targets=targets
        )

        # reading csv file
        controller = build_flow([CSVSource(csv_path), ReduceToDataFrame()]).run()
        termination_result = controller.await_termination()
        expected = pd.DataFrame(
            {
                0: ["ticker", "MSFT", "GOOG", "AAPL"],
                1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
                2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
            }
        )
        assert termination_result.equals(
            expected
        ), f"{termination_result}\n!=\n{expected}"
        os.remove(csv_path)
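    # The `stocks` object ingested above is a module-level test fixture that is not
    # shown in this section. Judging from the expected dataframe in test_read_csv, it
    # presumably looks roughly like the sketch below (an assumption for illustration,
    # not the repo's actual fixture definition):
    @staticmethod
    def _example_stocks_frame():
        return pd.DataFrame(
            {
                "ticker": ["MSFT", "GOOG", "AAPL"],
                "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
                "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"],
            }
        )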
    def test_csv_time_columns(self):
        df = pd.DataFrame(
            {
                "key": ["key1", "key2"],
                "time_stamp": [
                    datetime(2020, 11, 1, 17, 33, 15),
                    datetime(2020, 10, 1, 17, 33, 15),
                ],
                "another_time_column": [
                    datetime(2020, 9, 1, 17, 33, 15),
                    datetime(2020, 8, 1, 17, 33, 15),
                ],
            }
        )

        csv_path = "/tmp/multiple_time_columns.csv"
        df.to_csv(path_or_buf=csv_path, index=False)
        source = CSVSource(
            path=csv_path, time_field="time_stamp", parse_dates=["another_time_column"]
        )

        measurements = fs.FeatureSet(
            "fs", entities=[Entity("key")], timestamp_key="time_stamp"
        )
        try:
            resp = fs.ingest(measurements, source)
            df.set_index("key", inplace=True)
            assert_frame_equal(df, resp)
        finally:
            os.remove(csv_path)
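    # The test above suggests that `time_field` is parsed automatically while any extra
    # datetime column must be listed in `parse_dates`; otherwise it would come back as a
    # plain string and the assert_frame_equal check would likely fail. A minimal sketch
    # of the equivalent check with pandas alone (helper name is illustrative, column
    # names follow the test above):
    @staticmethod
    def _read_time_columns_with_pandas(csv_path):
        df = pd.read_csv(csv_path, parse_dates=["time_stamp", "another_time_column"])
        # both columns are parsed into datetime64 dtypes rather than object/str
        assert df["time_stamp"].dtype.kind == "M"
        assert df["another_time_column"].dtype.kind == "M"
        return df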
    def test_purge(self):
        key = "patient_id"
        fset = fs.FeatureSet("purge", entities=[Entity(key)], timestamp_key="timestamp")
        path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource(
            "mycsv",
            path=path,
            time_field="timestamp",
        )
        targets = [
            CSVTarget(),
            CSVTarget(name="specified-path", path="v3io:///bigdata/csv-purge-test.csv"),
            ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
            NoSqlTarget(),
        ]
        fset.set_targets(
            targets=targets,
            with_defaults=False,
        )
        fs.ingest(fset, source)

        verify_purge(fset, targets)

        fs.ingest(fset, source)

        targets_to_purge = targets[:-1]
        verify_purge(fset, targets_to_purge)
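    # `verify_purge` is a helper defined elsewhere in the test module and is not shown
    # here. A hypothetical sketch of the kind of check such a helper might perform,
    # assuming purging removes the data files behind each target (the helper name and
    # the path handling below are illustrative, not the repo's implementation):
    @staticmethod
    def _assert_target_paths_purged(paths):
        for path in paths:
            protocol = path.split("://", 1)[0] if "://" in path else "file"
            filesystem = fsspec.filesystem(protocol)
            assert not filesystem.exists(path), f"target data was not purged: {path}"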
    def test_ingest_with_timestamp(self):
        key = "patient_id"
        measurements = fs.FeatureSet(
            "measurements", entities=[Entity(key)], timestamp_key="timestamp"
        )
        source = CSVSource(
            "mycsv",
            path=os.path.relpath(str(self.assets_path / "testdata.csv")),
            time_field="timestamp",
        )
        resp = fs.ingest(measurements, source)
        assert resp["timestamp"].head(n=1)[0] == datetime.fromisoformat(
            "2020-12-01 17:24:15.906352"
        )
    def test_non_partitioned_target_in_dir(self):
        source = CSVSource(
            "mycsv", path=os.path.relpath(str(self.assets_path / "testdata.csv"))
        )
        path = str(self.results_path / _generate_random_name())
        target = ParquetTarget(path=path)

        fset = fs.FeatureSet(
            name="test", entities=[Entity("patient_id")], timestamp_key="timestamp"
        )
        fs.ingest(fset, source, targets=[target])

        list_files = os.listdir(path)
        assert len(list_files) == 1 and not os.path.isdir(path + "/" + list_files[0])
        os.remove(path + "/" + list_files[0])
    def test_ingest_partitioned_by_key_and_time(
        self, key_bucketing_number, partition_cols, time_partitioning_granularity
    ):
        key = "patient_id"
        name = f"measurements_{uuid.uuid4()}"
        measurements = fs.FeatureSet(name, entities=[Entity(key)])
        source = CSVSource(
            "mycsv",
            path=os.path.relpath(str(self.assets_path / "testdata.csv")),
            time_field="timestamp",
        )
        measurements.set_targets(
            targets=[
                ParquetTarget(
                    partitioned=True,
                    key_bucketing_number=key_bucketing_number,
                    partition_cols=partition_cols,
                    time_partitioning_granularity=time_partitioning_granularity,
                )
            ],
            with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp = fs.get_offline_features(vector)
        resp2 = resp.to_dataframe()

        assert resp1.to_dict() == resp2.to_dict()

        file_system = fsspec.filesystem("v3io")
        kind = TargetTypes.parquet
        path = f"{get_default_prefix_for_target(kind)}/sets/{name}-latest"
        path = path.format(name=name, kind=kind, project="system-test-project")
        dataset = pq.ParquetDataset(path, filesystem=file_system)
        partitions = [key for key, _ in dataset.pieces[0].partition_keys]

        if key_bucketing_number is None:
            expected_partitions = []
        elif key_bucketing_number == 0:
            expected_partitions = ["igzpart_key"]
        else:
            expected_partitions = [f"igzpart_hash{key_bucketing_number}_key"]
        expected_partitions += partition_cols or []
        if all(
            value is None
            for value in [
                key_bucketing_number,
                partition_cols,
                time_partitioning_granularity,
            ]
        ):
            time_partitioning_granularity = "hour"
        if time_partitioning_granularity:
            for unit in ["year", "month", "day", "hour"]:
                expected_partitions.append(f"igzpart_{unit}")
                if unit == time_partitioning_granularity:
                    break

        assert partitions == expected_partitions

        resp = fs.get_offline_features(
            vector,
            start_time=datetime(2020, 12, 1, 17, 33, 15),
            end_time=datetime(2020, 12, 1, 17, 33, 16),
            entity_timestamp_column="timestamp",
        )
        resp2 = resp.to_dataframe()
        assert len(resp2) == 10
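    # A minimal end-to-end sketch consolidating the feature-store API exercised by the
    # tests above: define a FeatureSet, ingest a CSV source into a Parquet target, then
    # read the features back through a FeatureVector. The helper name, feature-set and
    # vector names, and the paths are illustrative assumptions, not values used by the
    # tests.
    @staticmethod
    def _example_ingest_and_read_back(csv_path, out_path):
        fset = fs.FeatureSet(
            "example-set", entities=[Entity("patient_id")], timestamp_key="timestamp"
        )
        source = CSVSource("mycsv", path=csv_path, time_field="timestamp")
        # write the ingested data to a parquet target under out_path
        fs.ingest(fset, source, targets=[ParquetTarget(path=out_path)])

        # query all features of the set back as a dataframe
        vector = fs.FeatureVector("example-vector", ["example-set.*"])
        return fs.get_offline_features(vector).to_dataframe()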