def ingest(
    self,
    feature_set: Union[str, FeatureSet],
    source: Union[pd.DataFrame, str],
    chunk_size: int = 10000,
    version: Optional[int] = None,
    force_update: bool = False,
    max_workers: int = max(CPU_COUNT - 1, 1),
    disable_progress_bar: bool = False,
    timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
) -> None:
    """
    Loads feature data into Feast for a specific feature set.

    Args:
        feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
            Feature set object or the string name of the feature set
            (without a version).

        source (typing.Union[pd.DataFrame, str]):
            Either a file path or a Pandas DataFrame to ingest into Feast.
            Files that are currently supported:
                * parquet
                * csv
                * json

        chunk_size (int):
            Amount of rows to load and ingest at a time.

        version (int):
            Feature set version.

        force_update (bool):
            Automatically update the feature set based on the source data
            prior to ingesting. This will also register changes to Feast.

        max_workers (int):
            Number of worker processes to use to encode values.

        disable_progress_bar (bool):
            Disable printing of progress statistics.

        timeout (int):
            Timeout in seconds to wait for completion.

    Returns:
        None
    """

    if isinstance(feature_set, FeatureSet):
        name = feature_set.name
        if version is None:
            version = feature_set.version
    elif isinstance(feature_set, str):
        name = feature_set
    else:
        raise Exception("Feature set name must be provided")

    # Read table and get row count
    dir_path, dest_path = _read_table_from_source(source, chunk_size, max_workers)

    pq_file = pq.ParquetFile(dest_path)

    row_count = pq_file.metadata.num_rows

    # Update the feature set based on the PyArrow table of the first row
    # group (note that this requires feature_set to have been passed as a
    # FeatureSet object rather than a string name)
    if force_update:
        feature_set.infer_fields_from_pa(
            table=pq_file.read_row_group(0),
            discard_unused_fields=True,
            replace_existing_features=True,
        )
        self.apply(feature_set)

    current_time = time.time()

    print("Waiting for feature set to be ready for ingestion...")
    while True:
        if timeout is not None and time.time() - current_time >= timeout:
            raise TimeoutError("Timed out waiting for feature set to be ready")
        feature_set = self.get_feature_set(name, version)
        if (
            feature_set is not None
            and feature_set.status == FeatureSetStatus.STATUS_READY
        ):
            break
        time.sleep(3)

    if timeout is not None:
        timeout = timeout - int(time.time() - current_time)

    try:
        # Kafka configs
        brokers = feature_set.get_kafka_source_brokers()
        topic = feature_set.get_kafka_source_topic()
        producer = get_producer(brokers, row_count, disable_progress_bar)

        # Loop optimization declarations
        produce = producer.produce
        flush = producer.flush

        # Transform and push data to Kafka
        if feature_set.source.source_type == "Kafka":
            for chunk in get_feature_row_chunks(
                file=dest_path,
                row_groups=list(range(pq_file.num_row_groups)),
                fs=feature_set,
                max_workers=max_workers,
            ):
                # Push FeatureRows one chunk at a time to Kafka
                for serialized_row in chunk:
                    produce(topic=topic, value=serialized_row)

                # Force a flush after each chunk
                flush(timeout=timeout)

                # Remove chunk from memory
                del chunk

        else:
            raise Exception(
                f"Could not determine source type for feature set "
                f'"{feature_set.name}" with source type '
                f'"{feature_set.source.source_type}"'
            )

        # Print ingestion statistics
        producer.print_results()
    finally:
        # Remove parquet file(s) that were created earlier
        print("Removing temporary file(s)...")
        shutil.rmtree(dir_path)

    return None
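
# A minimal usage sketch for the ingest() variant above. This is illustrative
# only: it assumes a Feast Core running at localhost:6565 and a registered
# "driver" feature set, and the DataFrame columns mirror the docstring example
# in the newer variant below.
#
#     client = Client(core_url="localhost:6565")
#     df = pd.DataFrame(
#         {"datetime": [pd.Timestamp.now()], "driver": [1001], "rating": [4.3]}
#     )
#     # force_update infers field changes from the source data and registers
#     # them with Feast before ingesting
#     client.ingest("driver", df, force_update=True)
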
def ingest(
    self,
    feature_set: Union[str, FeatureSet],
    source: Union[pd.DataFrame, str],
    chunk_size: int = 10000,
    max_workers: int = max(CPU_COUNT - 1, 1),
    disable_progress_bar: bool = False,
    timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
) -> str:
    """
    Loads feature data into Feast for a specific feature set.

    Args:
        feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
            Feature set object or the string name of the feature set.

        source (typing.Union[pd.DataFrame, str]):
            Either a file path or a Pandas DataFrame to ingest into Feast.
            Files that are currently supported:
                * parquet
                * csv
                * json

        chunk_size (int):
            Amount of rows to load and ingest at a time.

        max_workers (int):
            Number of worker processes to use to encode values.

        disable_progress_bar (bool):
            Disable printing of progress statistics.

        timeout (int):
            Timeout in seconds to wait for completion.

    Returns:
        str:
            Ingestion id for this dataset.

    Examples:
        >>> from feast import Client
        >>>
        >>> client = Client(core_url="localhost:6565")
        >>> fs_df = pd.DataFrame(
        >>>     {
        >>>         "datetime": [pd.Timestamp.now()],
        >>>         "driver": [1001],
        >>>         "rating": [4.3],
        >>>     }
        >>> )
        >>> client.set_project("project1")
        >>> client.ingest("driver", fs_df)
        >>>
        >>> driver_fs = client.get_feature_set(name="driver", project="project1")
        >>> client.ingest(driver_fs, fs_df)
    """

    if isinstance(feature_set, FeatureSet):
        name = feature_set.name
        project = feature_set.project
    elif isinstance(feature_set, str):
        if self.project is not None:
            project = self.project
        else:
            project = "default"
        name = feature_set
    else:
        raise Exception("Feature set name must be provided")

    # Read table and get row count
    dir_path, dest_path = _read_table_from_source(source, chunk_size, max_workers)

    pq_file = pq.ParquetFile(dest_path)

    row_count = pq_file.metadata.num_rows

    current_time = time.time()

    print("Waiting for feature set to be ready for ingestion...")
    while True:
        if timeout is not None and time.time() - current_time >= timeout:
            raise TimeoutError("Timed out waiting for feature set to be ready")
        fetched_feature_set: Optional[FeatureSet] = self.get_feature_set(
            name, project
        )
        if (
            fetched_feature_set is not None
            and fetched_feature_set.status == FeatureSetStatus.STATUS_READY
        ):
            feature_set = fetched_feature_set
            break
        time.sleep(3)

    if timeout is not None:
        timeout = timeout - int(time.time() - current_time)

    try:
        # Kafka configs
        brokers = feature_set.get_kafka_source_brokers()
        topic = feature_set.get_kafka_source_topic()
        producer = get_producer(brokers, row_count, disable_progress_bar)

        # Loop optimization declarations
        produce = producer.produce
        flush = producer.flush

        ingestion_id = _generate_ingestion_id(feature_set)

        # Transform and push data to Kafka
        if feature_set.source.source_type == "Kafka":
            for chunk in get_feature_row_chunks(
                file=dest_path,
                row_groups=list(range(pq_file.num_row_groups)),
                fs=feature_set,
                ingestion_id=ingestion_id,
                max_workers=max_workers,
            ):
                # Push FeatureRows one chunk at a time to Kafka
                for serialized_row in chunk:
                    produce(topic=topic, value=serialized_row)

                # Force a flush after each chunk
                flush(timeout=timeout)

                # Remove chunk from memory
                del chunk

        else:
            raise Exception(
                f"Could not determine source type for feature set "
                f'"{feature_set.name}" with source type '
                f'"{feature_set.source.source_type}"'
            )

        # Print ingestion statistics
        producer.print_results()
    finally:
        # Remove parquet file(s) that were created earlier
        print("Removing temporary file(s)...")
        shutil.rmtree(dir_path)

    return ingestion_id
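
# _generate_ingestion_id() is called above but not defined in this excerpt. A
# plausible sketch is shown here, under the assumption that the id only needs
# to uniquely tag the FeatureRows produced by one ingestion run; the real
# helper may derive the id differently.
#
#     import time
#     import uuid
#
#     def _generate_ingestion_id(feature_set: FeatureSet) -> str:
#         # Hypothetical sketch: combine the feature set name, wall-clock
#         # time, and a random suffix so that concurrent ingestions into the
#         # same feature set remain distinguishable downstream.
#         return f"{feature_set.name}-{int(time.time())}-{uuid.uuid4().hex[:8]}"
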