import logging
import math
import multiprocessing
import time
from itertools import repeat
from multiprocessing import Pool, Process, Queue
from typing import Optional

import numpy as np
import pandas as pd
import pyarrow as pa
from tqdm import tqdm

_logger = logging.getLogger(__name__)

# NOTE: FeatureSet, DATETIME_COLUMN, _validate_dataframe, _encode_chunk,
# _encode_pa_chunks, _kafka_feature_row_chunk_producer and
# _kafka_feature_row_producer are assumed to be defined elsewhere in this module.


def ingest_kafka(
    feature_set: FeatureSet,
    dataframe: pd.DataFrame,
    max_workers: int,
    timeout: Optional[int] = None,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
):
    """
    Ingest a pandas DataFrame to a Kafka topic for a Feature Set

    Args:
        feature_set: FeatureSet describing the DataFrame.
        dataframe: DataFrame to be ingested.
        max_workers: Maximum number of encoding workers.
        timeout: Maximum time to wait for ingestion to complete.
        chunk_size: Maximum number of rows per encoded chunk.
        disable_pbar: Flag to indicate if tqdm progress bar should be disabled.
    """
    pbar = tqdm(unit="rows", total=dataframe.shape[0], disable=disable_pbar)

    # Validate feature set schema
    _validate_dataframe(dataframe, feature_set)

    # Split dataframe into chunks of at most chunk_size rows
    # (chunk_size is clamped to a minimum of 100 rows)
    num_chunks = max(math.ceil(dataframe.shape[0] / max(chunk_size, 100)), 1)
    df_chunks = np.array_split(dataframe, num_chunks)

    # Create queue through which encoding and production will coordinate
    chunk_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict(
        {"success_count": 0, "error_count": 0, "last_exception": ""}
    )

    # Create producer to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_chunk_producer,
        args=(
            chunk_queue,
            num_chunks,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start ingestion process
        print(f"\nIngestion started for {feature_set.name}:{feature_set.version}")
        ingestion_process.start()

        # Create a pool of workers to convert df chunks into feature row chunks
        # and push them onto the queue for the ingestion process to pick up
        with Pool(processes=max_workers) as pool:
            chunks_done = 0
            while chunks_done < num_chunks:
                chunks_to = min(chunks_done + max_workers, len(df_chunks))
                results = pool.starmap_async(
                    _encode_chunk,
                    zip(df_chunks[chunks_done:chunks_to], repeat(feature_set)),
                )

                # Push feature row encoded chunks onto the queue
                for result in results.get():
                    chunk_queue.put(result)
                chunks_done += max_workers
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for ingestion to complete, or time out
        ingestion_process.join(timeout=timeout)
        failed_message = (
            ""
            if ctx["error_count"] == 0
            else f"\nFail: {ctx['error_count']}/{dataframe.shape[0]}"
        )
        last_exception_message = (
            ""
            if ctx["last_exception"] == ""
            else f"\nLast exception:\n{ctx['last_exception']}"
        )
        print(
            f"\nIngestion statistics:"
            f"\nSuccess: {ctx['success_count']}/{dataframe.shape[0]}"
            f"{failed_message}"
            f"{last_exception_message}"
        )
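

# --- Illustrative usage of ingest_kafka (a minimal sketch, not part of the
# library). Assumes `my_feature_set` is an already-applied FeatureSet with a
# Kafka source whose datetime column is named "datetime"; the entity and
# feature column names below are hypothetical.
#
#   from datetime import datetime, timezone
#   import pandas as pd
#
#   df = pd.DataFrame(
#       {
#           "datetime": [datetime.now(timezone.utc)] * 3,
#           "driver_id": [1001, 1002, 1003],
#           "trips_today": [5, 2, 8],
#       }
#   )
#   ingest_kafka(
#       feature_set=my_feature_set,
#       dataframe=df,
#       max_workers=4,
#       timeout=600,
#   )
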

def ingest_table_to_kafka(
    feature_set: FeatureSet,
    table: pa.lib.Table,
    max_workers: int,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
    timeout: Optional[int] = None,
) -> None:
    """
    Ingest a PyArrow Table to a Kafka topic for a Feature Set

    Args:
        feature_set: FeatureSet describing the PyArrow table.
        table: PyArrow table to be processed.
        max_workers: Maximum number of workers.
        chunk_size: Maximum size of each chunk when the PyArrow table is batched.
        disable_pbar: Flag to indicate if tqdm progress bar should be disabled.
        timeout: Maximum time before the method times out.
    """
    pbar = tqdm(unit="rows", total=table.num_rows, disable=disable_pbar)

    # Use a small DataFrame to validate the feature set schema
    ref_df = table.to_batches(max_chunksize=100)[0].to_pandas()
    df_datetime_dtype = ref_df[DATETIME_COLUMN].dtype

    # Validate feature set schema
    _validate_dataframe(ref_df, feature_set)

    # Create queue through which encoding and production will coordinate
    row_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict(
        {"success_count": 0, "error_count": 0, "last_exception": ""}
    )

    # Create producer to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_producer,
        args=(
            row_queue,
            table.num_rows,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start ingestion process
        print(f"\nIngestion started for {feature_set.name}:{feature_set.version}")
        ingestion_process.start()

        # Iterate over chunks in the table and yield feature rows
        for row in _encode_pa_chunks(
            tbl=table,
            fs=feature_set,
            max_workers=max_workers,
            chunk_size=chunk_size,
            df_datetime_dtype=df_datetime_dtype,
        ):
            # Push rows onto a queue for the production process to pick up
            row_queue.put(row)
            # Apply back-pressure so the queue does not grow unboundedly
            while row_queue.qsize() > chunk_size:
                time.sleep(0.1)
        # Signal the producer that all rows have been enqueued
        row_queue.put(None)
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for the Kafka production to complete
        ingestion_process.join(timeout=timeout)
        failed_message = (
            ""
            if ctx["error_count"] == 0
            else f"\nFail: {ctx['error_count']}/{table.num_rows}"
        )
        last_exception_message = (
            ""
            if ctx["last_exception"] == ""
            else f"\nLast exception:\n{ctx['last_exception']}"
        )
        print(
            f"\nIngestion statistics:"
            f"\nSuccess: {ctx['success_count']}/{table.num_rows}"
            f"{failed_message}"
            f"{last_exception_message}"
        )
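

# --- Illustrative usage of ingest_table_to_kafka (a minimal sketch, not part
# of the library). Assumes `my_feature_set` is an already-applied FeatureSet
# with a Kafka source; the DataFrame `df` is the same hypothetical frame used
# in the ingest_kafka example above, converted to a PyArrow table.
#
#   import pyarrow as pa
#
#   table = pa.Table.from_pandas(df)
#   ingest_table_to_kafka(
#       feature_set=my_feature_set,
#       table=table,
#       max_workers=4,
#       chunk_size=5000,
#       timeout=600,
#   )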