Example 1
def ingest_kafka(
    feature_set: FeatureSet,
    dataframe: pd.DataFrame,
    max_workers: int,
    timeout: int = None,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
):
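    """
    Ingest a pandas DataFrame to a Kafka topic for a Feature Set

    Args:
        feature_set: FeatureSet describing the DataFrame.
        dataframe: Pandas DataFrame to be ingested.
        max_workers: Maximum number of workers.
        timeout: Maximum time (in seconds) to wait for the ingestion process
            to complete before the method times out.
        chunk_size: Maximum size of each chunk when the DataFrame is split.
        disable_pbar: Flag to indicate if the tqdm progress bar should be disabled.
    """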
    pbar = tqdm(unit="rows", total=dataframe.shape[0], disable=disable_pbar)

    # Validate feature set schema
    validate_dataframe(dataframe, feature_set)

    # Split dataframe into chunks of roughly chunk_size rows (at least one chunk)
    num_chunks = int(max(dataframe.shape[0] / max(chunk_size, 100), 1))
    df_chunks = np.array_split(dataframe, num_chunks)

    # Create queue through which encoding and production will coordinate
    chunk_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict({
        "success_count": 0,
        "error_count": 0,
        "last_exception": ""
    })

    # Create producer to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_chunk_producer,
        args=(
            chunk_queue,
            num_chunks,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start ingestion process
        print(
            f"\nIngestion started for {feature_set.name}:{feature_set.version}"
        )
        ingestion_process.start()

        # Create a pool of workers to convert df chunks into feature row chunks
        # and push them into the queue for ingestion to pick up
        with Pool(processes=max_workers) as pool:
            chunks_done = 0
            while chunks_done < num_chunks:
                chunks_to = min(chunks_done + max_workers, len(df_chunks))
                results = pool.starmap_async(
                    _encode_chunk,
                    zip(df_chunks[chunks_done:chunks_to], repeat(feature_set)),
                )

                # Push feature row encoded chunks onto queue
                for result in results.get():
                    chunk_queue.put(result)
                chunks_done += max_workers
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for ingestion to complete, or time out
        ingestion_process.join(timeout=timeout)
        failed_message = ("" if ctx["error_count"] == 0 else
                          f"\nFail: {ctx['error_count']}/{dataframe.shape[0]}")

        last_exception_message = (
            "" if ctx["last_exception"] == "" else
            f"\nLast exception:\n{ctx['last_exception']}")
        print(f"\nIngestion statistics:"
              f"\nSuccess: {ctx['success_count']}/{dataframe.shape[0]}"
              f"{failed_message}"
              f"{last_exception_message}")
Example 2
File: ingest.py Project: wzpy/feast
def ingest_table_to_kafka(
    feature_set: FeatureSet,
    table: pa.lib.Table,
    max_workers: int,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
    timeout: int = None,
) -> None:
    """
    Ingest a PyArrow Table to a Kafka topic based for a Feature Set

    Args:
        feature_set: FeatureSet describing PyArrow table.
        table: PyArrow table to be processed.
        max_workers: Maximum number of workers.
        chunk_size:  Maximum size of each chunk when PyArrow table is batched.
        disable_pbar: Flag to indicate if tqdm progress bar should be disabled.
        timeout: Maximum time before method times out
    """

    pbar = tqdm(unit="rows", total=table.num_rows, disable=disable_pbar)

    # Use a small DataFrame to validate feature set schema
    ref_df = table.to_batches(max_chunksize=100)[0].to_pandas()
    df_datetime_dtype = ref_df[DATETIME_COLUMN].dtype

    # Validate feature set schema
    _validate_dataframe(ref_df, feature_set)

    # Create queue through which encoding and production will coordinate
    row_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict({
        "success_count": 0,
        "error_count": 0,
        "last_exception": ""
    })

    # Create producer to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_producer,
        args=(
            row_queue,
            table.num_rows,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start ingestion process
        print(
            f"\n(ingest table to kafka) Ingestion started for {feature_set.name}:{feature_set.version}"
        )
        ingestion_process.start()

        # Iterate over chunks in the table and return feature rows
        for row in _encode_pa_chunks(
                tbl=table,
                fs=feature_set,
                max_workers=max_workers,
                chunk_size=chunk_size,
                df_datetime_dtype=df_datetime_dtype,
        ):
            # Push rows onto a queue for the production process to pick up
            row_queue.put(row)
            while row_queue.qsize() > chunk_size:
                time.sleep(0.1)
        row_queue.put(None)
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for the Kafka production to complete
        ingestion_process.join(timeout=timeout)
        failed_message = ("" if ctx["error_count"] == 0 else
                          f"\nFail: {ctx['error_count']}/{table.num_rows}")

        last_exception_message = (
            "" if ctx["last_exception"] == "" else
            f"\nLast exception:\n{ctx['last_exception']}")
        print(f"\nIngestion statistics:"
              f"\nSuccess: {ctx['success_count']}/{table.num_rows}"
              f"{failed_message}"
              f"{last_exception_message}")