def _encode_pa_chunks(
    tbl: pa.lib.Table,
    fs: FeatureSet,
    max_workers: int,
    df_datetime_dtype: pd.DataFrame.dtypes,
    chunk_size: int = 5000,
) -> Iterable[FeatureRow]:
    """
    Generator function to encode rows in a PyArrow table to FeatureRows by
    breaking the table up into batches. The rows of each batch are spread
    across a pool of workers to be transformed into FeatureRow objects.

    Args:
        tbl: PyArrow table to be processed.
        fs: FeatureSet describing the PyArrow table.
        max_workers: Maximum number of workers.
        df_datetime_dtype: Pandas dtype of the datetime column.
        chunk_size: Maximum size of each chunk when the PyArrow table is batched.

    Returns:
        Iterable of FeatureRow objects.
    """
    pool = Pool(max_workers)

    # Create a partial function with static non-iterable arguments
    func = partial(
        convert_dict_to_proto_values,
        df_datetime_dtype=df_datetime_dtype,
        feature_set=fs,
    )

    for batch in tbl.to_batches(max_chunksize=chunk_size):
        m_df = batch.to_pandas()
        results = pool.map_async(func, m_df.to_dict("records"))
        yield from results.get()

    pool.close()
    pool.join()
    return
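# Illustrative usage sketch (not part of the original module): the generator is
# consumed lazily, so rows are encoded one batch at a time. The names `table`
# and `feature_set` below are placeholders.
#
#     df_datetime_dtype = (
#         table.to_batches(max_chunksize=100)[0].to_pandas()[DATETIME_COLUMN].dtype
#     )
#     for feature_row in _encode_pa_chunks(
#         tbl=table,
#         fs=feature_set,
#         max_workers=4,
#         df_datetime_dtype=df_datetime_dtype,
#         chunk_size=5000,
#     ):
#         ...  # e.g. push each FeatureRow onto a queue, as ingest_table_to_kafka does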
def infer_fields_from_pa(
    self,
    table: pa.lib.Table,
    entities: Optional[List[Entity]] = None,
    features: Optional[List[Feature]] = None,
    replace_existing_features: bool = False,
    replace_existing_entities: bool = False,
    discard_unused_fields: bool = False,
) -> None:
    """
    Adds fields (Features or Entities) to a feature set based on the schema of
    a PyArrow table. Only PyArrow tables are supported. All columns are
    detected as features, so setting at least one entity manually is advised.

    Args:
        table (pyarrow.lib.Table): PyArrow table to read the schema from.
        entities (Optional[List[Entity]]): List of entities that will be set
            manually and not inferred. These will take precedence over any
            existing entities or entities found in the PyArrow table.
        features (Optional[List[Feature]]): List of features that will be set
            manually and not inferred. These will take precedence over any
            existing features or features found in the PyArrow table.
        replace_existing_features (bool): Boolean flag. If True, existing
            features in this feature set are replaced by features found in the
            table. If False, conflicting features are skipped.
        replace_existing_entities (bool): Boolean flag. If True, existing
            entities in this feature set are replaced by entities found in the
            table. If False, conflicting entities are skipped.
        discard_unused_fields (bool): Boolean flag. If True, any existing
            fields that are not found in the table or provided by the user are
            discarded.

    Returns:
        None: None
    """
    if entities is None:
        entities = list()
    if features is None:
        features = list()

    # Validate whether the datetime column exists with the right name
    if DATETIME_COLUMN not in table.column_names:
        raise Exception("No column 'datetime'")

    # Validate the type of the datetime column
    if not isinstance(table.column(DATETIME_COLUMN).type, TimestampType):
        raise Exception(
            "Column 'datetime' does not have the correct type: datetime64[ms]"
        )

    # Create a dictionary of fields that will not be inferred (manually set)
    provided_fields = OrderedDict()

    for field in entities + features:
        if not isinstance(field, Field):
            raise Exception(f"Invalid field object type provided {type(field)}")
        if field.name not in provided_fields:
            provided_fields[field.name] = field
        else:
            raise Exception(f"Duplicate field name detected {field.name}.")

    new_fields = self._fields.copy()
    output_log = ""

    # Add in provided fields
    for name, field in provided_fields.items():
        if name in new_fields.keys():
            upsert_message = "updated (replacing an existing field)"
        else:
            upsert_message = "created"
        output_log += (
            f"{type(field).__name__} {field.name} "
            f"({field.dtype}) manually {upsert_message}.\n"
        )
        new_fields[name] = field

    # Iterate over all of the column names and create features
    for column in table.column_names:
        column = column.strip()

        # Skip the datetime column
        if DATETIME_COLUMN in column:
            continue

        # Skip user-provided fields
        if column in provided_fields.keys():
            continue

        # Only overwrite conflicting fields if replacement is allowed
        if column in new_fields:
            if (
                isinstance(self._fields[column], Feature)
                and not replace_existing_features
            ):
                continue
            if (
                isinstance(self._fields[column], Entity)
                and not replace_existing_entities
            ):
                continue

        # Store this field as a feature
        # TODO: (Minor) Change the parameter name from dtype to patype
        new_fields[column] = Feature(
            name=column, dtype=self._infer_pa_column_type(table.column(column))
        )
        output_log += (
            f"{type(new_fields[column]).__name__} {new_fields[column].name} "
            f"({new_fields[column].dtype}) added from PyArrow Table.\n"
        )

    # Discard unused fields from the feature set
    if discard_unused_fields:
        keys_to_remove = []
        for key in new_fields.keys():
            if not (key in table.column_names or key in provided_fields.keys()):
                output_log += (
                    f"{type(new_fields[key]).__name__} {new_fields[key].name} "
                    f"({new_fields[key].dtype}) removed because it is unused.\n"
                )
                keys_to_remove.append(key)
        for key in keys_to_remove:
            del new_fields[key]

    # Update the feature set
    self._fields = new_fields
    print(output_log)
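# Illustrative usage sketch (not part of the original module): the entity name
# "driver_id" and the FeatureSet/Entity constructor arguments shown here are
# assumptions for the example only.
#
#     feature_set = FeatureSet("driver_features")
#     feature_set.infer_fields_from_pa(
#         table,
#         entities=[Entity(name="driver_id", dtype=ValueType.INT64)],
#         replace_existing_features=True,
#     )
#
# Because every non-datetime column is inferred as a Feature, passing at least
# one Entity (as above) is the only way to mark key columns correctly.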
def ingest_table_to_kafka(
    feature_set: FeatureSet,
    table: pa.lib.Table,
    max_workers: int,
    chunk_size: int = 5000,
    disable_pbar: bool = False,
    timeout: int = None,
) -> None:
    """
    Ingest a PyArrow table to a Kafka topic for a feature set.

    Args:
        feature_set: FeatureSet describing the PyArrow table.
        table: PyArrow table to be processed.
        max_workers: Maximum number of workers.
        chunk_size: Maximum size of each chunk when the PyArrow table is batched.
        disable_pbar: Flag to indicate whether the tqdm progress bar should be disabled.
        timeout: Maximum time before the method times out.
    """
    pbar = tqdm(unit="rows", total=table.num_rows, disable=disable_pbar)

    # Use a small DataFrame to validate the feature set schema
    ref_df = table.to_batches(max_chunksize=100)[0].to_pandas()
    df_datetime_dtype = ref_df[DATETIME_COLUMN].dtype

    # Validate feature set schema
    _validate_dataframe(ref_df, feature_set)

    # Create a queue through which encoding and production will coordinate
    row_queue = Queue()

    # Create a context object to send and receive information across processes
    ctx = multiprocessing.Manager().dict(
        {"success_count": 0, "error_count": 0, "last_exception": ""}
    )

    # Create a producer process to push feature rows to Kafka
    ingestion_process = Process(
        target=_kafka_feature_row_producer,
        args=(
            row_queue,
            table.num_rows,
            feature_set.get_kafka_source_brokers(),
            feature_set.get_kafka_source_topic(),
            ctx,
            pbar,
        ),
    )

    try:
        # Start the ingestion process
        print(
            f"\n(ingest table to kafka) Ingestion started for "
            f"{feature_set.name}:{feature_set.version}"
        )
        ingestion_process.start()

        # Iterate over chunks in the table and yield feature rows
        for row in _encode_pa_chunks(
            tbl=table,
            fs=feature_set,
            max_workers=max_workers,
            chunk_size=chunk_size,
            df_datetime_dtype=df_datetime_dtype,
        ):
            # Push rows onto a queue for the producer process to pick up
            row_queue.put(row)
            while row_queue.qsize() > chunk_size:
                time.sleep(0.1)
        row_queue.put(None)
    except Exception as ex:
        _logger.error(f"Exception occurred: {ex}")
    finally:
        # Wait for the Kafka production to complete
        ingestion_process.join(timeout=timeout)
        failed_message = (
            ""
            if ctx["error_count"] == 0
            else f"\nFail: {ctx['error_count']}/{table.num_rows}"
        )
        last_exception_message = (
            ""
            if ctx["last_exception"] == ""
            else f"\nLast exception:\n{ctx['last_exception']}"
        )
        print(
            f"\nIngestion statistics:"
            f"\nSuccess: {ctx['success_count']}/{table.num_rows}"
            f"{failed_message}"
            f"{last_exception_message}"
        )
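# Illustrative usage sketch (not part of the original module): reading the
# table from a Parquet file with pyarrow.parquet is an assumption, and the file
# name is a placeholder. `feature_set` is expected to already have a Kafka
# source configured so get_kafka_source_brokers()/get_kafka_source_topic()
# return usable values.
#
#     import pyarrow.parquet as pq
#
#     table = pq.read_table("driver_features.parquet")
#     ingest_table_to_kafka(
#         feature_set=feature_set,
#         table=table,
#         max_workers=4,
#         chunk_size=5000,
#         timeout=600,
#     )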