Example #1
def _encode_pa_chunks(
    tbl: pa.lib.Table,
    fs: FeatureSet,
    max_workers: int,
    df_datetime_dtype: pd.DataFrame.dtypes,
    chunk_size: int = 5000,
) -> Iterable[FeatureRow]:
    """
    Generator function to encode the rows of a PyArrow table into FeatureRows
    by breaking the table up into batches.

    Each batch will have its rows spread across a pool of workers to be
    transformed into FeatureRow objects.

    Args:
        tbl: PyArrow table to be processed.
        fs: FeatureSet describing PyArrow table.
        max_workers: Maximum number of workers.
        df_datetime_dtype: Pandas dtype of datetime column.
        chunk_size: Maximum size of each chunk when PyArrow table is batched.

    Returns:
        An iterable of FeatureRow objects.
    """

    pool = Pool(max_workers)

    # Create a partial function with static non-iterable arguments
    func = partial(
        convert_dict_to_proto_values,
        df_datetime_dtype=df_datetime_dtype,
        feature_set=fs,
    )

    for batch in tbl.to_batches(max_chunksize=chunk_size):
        m_df = batch.to_pandas()
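        # map_async fans the batch's rows out across the worker pool; .get()
        # blocks until every row in this batch has been encoded.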
        results = pool.map_async(func, m_df.to_dict("records"))
        yield from results.get()

    pool.close()
    pool.join()
    return
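
A minimal usage sketch for the generator above, assuming this Feast version lets a FeatureSet be constructed with just a name and populated via infer_fields_from_pa (shown in Example #2); the DataFrame contents and column names are illustrative placeholders, not part of the original code:

import pandas as pd
import pyarrow as pa
from feast import FeatureSet  # assumed import path for this version of the SDK

# Hypothetical input: a small frame with the mandatory 'datetime' column.
df = pd.DataFrame({
    "datetime": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "customer_id": [1, 2],
    "daily_transactions": [3.5, 7.0],
})
table = pa.Table.from_pandas(df)

# fs is assumed to be a FeatureSet whose fields match the columns above,
# e.g. populated via infer_fields_from_pa() as in Example #2.
fs = FeatureSet(name="customer_transactions")
fs.infer_fields_from_pa(table)

for feature_row in _encode_pa_chunks(
    tbl=table,
    fs=fs,
    max_workers=4,
    df_datetime_dtype=df["datetime"].dtype,
    chunk_size=1000,
):
    print(feature_row)
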
Example #2
    def infer_fields_from_pa(
        self,
        table: pa.lib.Table,
        entities: Optional[List[Entity]] = None,
        features: Optional[List[Feature]] = None,
        replace_existing_features: bool = False,
        replace_existing_entities: bool = False,
        discard_unused_fields: bool = False,
    ) -> None:
        """
        Adds fields (Features or Entities) to a feature set based on the schema
        of a PyArrow table. Only PyArrow tables are supported. All columns are
        detected as features, so setting at least one entity manually is
        advised.

        Args:
            table (pyarrow.lib.Table):
                PyArrow table to read schema from.

            entities (Optional[List[Entity]]):
                List of entities that will be set manually and not inferred.
                These will take precedence over any existing entities or
                entities found in the PyArrow table.

            features (Optional[List[Feature]]):
                List of features that will be set manually and not inferred.
                These will take precedence over any existing feature or features
                found in the PyArrow table.

            replace_existing_features (bool):
                Boolean flag. If true, will replace existing features in this
                feature set with features found in the PyArrow table. If false,
                will skip conflicting features.

            replace_existing_entities (bool):
                Boolean flag. If true, will replace existing entities in this
                feature set with entities found in the PyArrow table. If false,
                will skip conflicting entities.

            discard_unused_fields (bool):
                Boolean flag. Setting this to True will discard any existing
                fields that are not found in the dataset or provided by the
                user.

        Returns:
            None
        """
        if entities is None:
            entities = list()
        if features is None:
            features = list()

        # Validate whether the datetime column exists with the right name
        if DATETIME_COLUMN not in table.column_names:
            raise Exception("No column 'datetime'")

        # Validate the date type for the datetime column
        if not isinstance(table.column(DATETIME_COLUMN).type, TimestampType):
            raise Exception(
                "Column 'datetime' does not have the correct type: datetime64[ms]"
            )

        # Create dictionary of fields that will not be inferred (manually set)
        provided_fields = OrderedDict()

        for field in entities + features:
            if not isinstance(field, Field):
                raise Exception(
                    f"Invalid field object type provided {type(field)}")
            if field.name not in provided_fields:
                provided_fields[field.name] = field
            else:
                raise Exception(f"Duplicate field name detected {field.name}.")

        new_fields = self._fields.copy()
        output_log = ""

        # Add in provided fields
        for name, field in provided_fields.items():
            if name in new_fields.keys():
                upsert_message = "updated (replacing an existing field)"
            else:
                upsert_message = "created"

            output_log += (f"{type(field).__name__} {field.name} "
                           f"({field.dtype}) manually {upsert_message}.\n")
            new_fields[name] = field

        # Iterate over all of the column names and create features
        for column in table.column_names:
            column = column.strip()

            # Skip datetime column
            if column == DATETIME_COLUMN:
                continue

            # Skip user provided fields
            if column in provided_fields.keys():
                continue

            # Only overwrite conflicting fields if replacement is allowed
            if column in new_fields:
                if (isinstance(self._fields[column], Feature)
                        and not replace_existing_features):
                    continue

                if (isinstance(self._fields[column], Entity)
                        and not replace_existing_entities):
                    continue

            # Store this field as a feature
            # TODO: (Minor) Change the parameter name from dtype to patype
            new_fields[column] = Feature(name=column,
                                         dtype=self._infer_pa_column_type(
                                             table.column(column)))

            output_log += (
                f"{type(new_fields[column]).__name__} {new_fields[column].name} "
                f"({new_fields[column].dtype}) added from PyArrow Table.\n"
            )

        # Discard unused fields from feature set
        if discard_unused_fields:
            keys_to_remove = []
            for key in new_fields.keys():
                if not (key in table.column_names
                        or key in provided_fields.keys()):
                    output_log += (
                        f"{type(new_fields[key]).__name__} {new_fields[key].name} "
                        f"({new_fields[key].dtype}) removed because it is unused.\n"
                    )
                    keys_to_remove.append(key)
            for key in keys_to_remove:
                del new_fields[key]

        # Update feature set
        self._fields = new_fields
        print(output_log)
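
A brief, hedged sketch of how this method might be called, assuming the Feast SDK of this era exposes Entity, FeatureSet, and ValueType at the package root; the column names and values are illustrative only:

import pandas as pd
import pyarrow as pa
from feast import Entity, FeatureSet, ValueType  # assumed import paths

# Hypothetical data: a 'datetime' column is required by the schema check above.
df = pd.DataFrame({
    "datetime": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "customer_id": [1, 2],
    "daily_transactions": [3.5, 7.0],
})
table = pa.Table.from_pandas(df)

feature_set = FeatureSet(name="customer_transactions")

# Mark customer_id as an entity explicitly; the remaining non-datetime columns
# are inferred as features from the PyArrow schema.
feature_set.infer_fields_from_pa(
    table,
    entities=[Entity(name="customer_id", dtype=ValueType.INT64)],
    replace_existing_features=True,
)
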
Example #3
def ingest_table_to_kafka(
    feature_set: FeatureSet,
    table: pa.lib.Table,
    max_workers: int,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
    timeout: Optional[int] = None,
) -> None:
    """
    Ingest a PyArrow Table into a Kafka topic for a given Feature Set.

    Args:
        feature_set: FeatureSet describing PyArrow table.
        table: PyArrow table to be processed.
        max_workers: Maximum number of workers.
        chunk_size: Maximum size of each chunk when PyArrow table is batched.
        disable_pbar: Flag to indicate if tqdm progress bar should be disabled.
        timeout: Maximum time in seconds before the method times out.
    """

    pbar = tqdm(unit="rows", total=table.num_rows, disable=disable_pbar)

    # Use a small DataFrame to validate feature set schema
    ref_df = table.to_batches(max_chunksize=100)[0].to_pandas()
    df_datetime_dtype = ref_df[DATETIME_COLUMN].dtype

    # Validate feature set schema
    _validate_dataframe(ref_df, feature_set)

    # Create queue through which encoding and production will coordinate
    row_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict({
        "success_count": 0,
        "error_count": 0,
        "last_exception": ""
    })

    # Create producer to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_producer,
        args=(
            row_queue,
            table.num_rows,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start ingestion process
        print(
            f"\n(ingest table to kafka) Ingestion started for {feature_set.name}:{feature_set.version}"
        )
        ingestion_process.start()

        # Iterate over chunks in the table and return feature rows
        for row in _encode_pa_chunks(
                tbl=table,
                fs=feature_set,
                max_workers=max_workers,
                chunk_size=chunk_size,
                df_datetime_dtype=df_datetime_dtype,
        ):
            # Push rows onto a queue for the production process to pick up
            row_queue.put(row)
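            # Simple backpressure: wait while the queue is more than one chunk
            # deep so the Kafka producer process can catch up.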
            while row_queue.qsize() > chunk_size:
                time.sleep(0.1)
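        # Enqueue a None sentinel to mark the end of the row stream for the
        # producer process.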
        row_queue.put(None)
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for the Kafka production to complete
        ingestion_process.join(timeout=timeout)
        failed_message = ("" if ctx["error_count"] == 0 else
                          f"\nFail: {ctx['error_count']}/{table.num_rows}")

        last_exception_message = (
            "" if ctx["last_exception"] == "" else
            f"\nLast exception:\n{ctx['last_exception']}")
        print(f"\nIngestion statistics:"
              f"\nSuccess: {ctx['success_count']}/{table.num_rows}"
              f"{failed_message}"
              f"{last_exception_message}")