def test_read_csv(self):
    from storey import ReadCSV, ReduceToDataFrame, build_flow

    csv_path = str(self.results_path / _generate_random_name() / ".csv")
    targets = [CSVTarget("mycsv", path=csv_path)]
    stocks_set = fs.FeatureSet(
        "tests", entities=[Entity("ticker", ValueType.STRING)]
    )
    fs.ingest(
        stocks_set,
        stocks,
        infer_options=fs.InferOptions.default(),
        targets=targets,
    )

    # Read the CSV back with storey and reduce the stream to a DataFrame
    controller = build_flow([ReadCSV(csv_path), ReduceToDataFrame()]).run()
    termination_result = controller.await_termination()

    # ReadCSV is used without header parsing here, so the header row comes
    # back as the first data row and the columns are positional (0, 1, 2)
    expected = pd.DataFrame(
        {
            0: ["ticker", "MSFT", "GOOG", "AAPL"],
            1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
            2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
        }
    )

    assert termination_result.equals(expected), f"{termination_result}\n!=\n{expected}"
    os.remove(csv_path)
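# --- A minimal, self-contained read-back sketch (not part of the test above).
# The header=True and build_dict=True flags are an assumption based on recent
# storey ReadCSV/CSVSource signatures: with them each row arrives as a dict
# keyed by the header row, so ReduceToDataFrame yields named columns instead
# of the positional 0/1/2 columns the test asserts on.
import tempfile

from storey import ReadCSV, ReduceToDataFrame, build_flow

with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as f:
    f.write("ticker,name,exchange\nMSFT,Microsoft Corporation,NASDAQ\n")
    sketch_path = f.name

controller = build_flow([
    ReadCSV(sketch_path, header=True, build_dict=True),
    ReduceToDataFrame(),
]).run()
df = controller.await_termination()  # expected columns: ticker, name, exchange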
self._flow = build_flow([
    SyncEmitSource(),
    ProcessEndpointEvent(self.kv_container, self.kv_path),
    FilterNotNone(),
    FlatMap(lambda x: x),
    MapFeatureNames(self.kv_container, self.kv_path),
    # Branch 1: Aggregate events, compute counts and averages, and update TSDB and KV
    [
        AggregateByKey(
            aggregates=[
                FieldAggregator(
                    PREDICTIONS,
                    ENDPOINT_ID,
                    ["count"],
                    SlidingWindows(
                        self.aggregate_count_windows,
                        self.aggregate_count_period,
                    ),
                ),
                FieldAggregator(
                    LATENCY,
                    LATENCY,
                    ["avg"],
                    SlidingWindows(
                        self.aggregate_avg_windows,
                        self.aggregate_avg_period,
                    ),
                ),
            ],
            table=Table("notable", NoopDriver()),
        ),
        SampleWindow(
            self.sample_window
        ),  # Add the required gap between events to apply sampling
        Map(self.compute_predictions_per_second),
        # Branch 1.1: Update KV
        [
            Map(self.process_before_kv),
            WriteToKV(container=self.kv_container, table=self.kv_path),
            InferSchema(
                v3io_access_key=self.v3io_access_key,
                v3io_framesd=self.v3io_framesd,
                container=self.kv_container,
                table=self.kv_path,
            ),
        ],
        # Branch 1.2: Update TSDB
        [
            # Map the event into taggable fields and add a record type to each field
            Map(self.process_before_events_tsdb),
            [
                FilterKeys(BASE_METRICS),
                UnpackValues(BASE_METRICS),
                TSDBTarget(
                    path=self.tsdb_path,
                    rate="10/m",
                    time_col=TIMESTAMP,
                    container=self.tsdb_container,
                    access_key=self.v3io_access_key,
                    v3io_frames=self.v3io_framesd,
                    index_cols=[ENDPOINT_ID, RECORD_TYPE],
                    # Settings for _Batching
                    max_events=self.tsdb_batching_max_events,
                    timeout_secs=self.tsdb_batching_timeout_secs,
                    key=ENDPOINT_ID,
                ),
            ],
            [
                FilterKeys(ENDPOINT_FEATURES),
                UnpackValues(ENDPOINT_FEATURES),
                TSDBTarget(
                    path=self.tsdb_path,
                    rate="10/m",
                    time_col=TIMESTAMP,
                    container=self.tsdb_container,
                    access_key=self.v3io_access_key,
                    v3io_frames=self.v3io_framesd,
                    index_cols=[ENDPOINT_ID, RECORD_TYPE],
                    # Settings for _Batching
                    max_events=self.tsdb_batching_max_events,
                    timeout_secs=self.tsdb_batching_timeout_secs,
                    key=ENDPOINT_ID,
                ),
            ],
            [
                FilterKeys(CUSTOM_METRICS),
                FilterNotNone(),
                UnpackValues(CUSTOM_METRICS),
                TSDBTarget(
                    path=self.tsdb_path,
                    rate="10/m",
                    time_col=TIMESTAMP,
                    container=self.tsdb_container,
                    access_key=self.v3io_access_key,
                    v3io_frames=self.v3io_framesd,
                    index_cols=[ENDPOINT_ID, RECORD_TYPE],
                    # Settings for _Batching
                    max_events=self.tsdb_batching_max_events,
                    timeout_secs=self.tsdb_batching_timeout_secs,
                    key=ENDPOINT_ID,
                ),
            ],
        ],
    ],
    # Branch 2: Batch events and write them to parquet
    [
        Map(self.process_before_parquet),
        ParquetTarget(
            path=self.parquet_path,
            partition_cols=["$key", "$year", "$month", "$day", "$hour"],
            infer_columns_from_data=True,
            # Settings for _Batching
            max_events=self.parquet_batching_max_events,
            timeout_secs=self.parquet_batching_timeout_secs,
        ),
    ],
]).run()
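# --- A minimal sketch (separate from the pipeline above) of the branching
# semantics build_flow relies on: each nested list opens a parallel branch
# that receives every event reaching the fork, which is how Branch 1
# (aggregation, KV, TSDB) and Branch 2 (parquet) above both see the same
# stream. The toy steps below are illustrative only, not part of the
# monitoring flow.
from storey import Filter, Map, Reduce, SyncEmitSource, build_flow

controller = build_flow([
    SyncEmitSource(),
    # Branch A: keep even numbers and sum them
    [
        Filter(lambda x: x % 2 == 0),
        Reduce(0, lambda acc, x: acc + x),
    ],
    # Branch B: double every number and collect the results
    [
        Map(lambda x: x * 2),
        Reduce([], lambda acc, x: acc + [x]),
    ],
]).run()

for i in range(5):
    controller.emit(i)
controller.terminate()
controller.await_termination()  # waits for both branches to finish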