Example #1
def produce_feature_rows(entity_name, feature_infos, feature_values_filepath,
                         bootstrap_servers, topic):
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
    feature_values = pd.read_csv(
        feature_values_filepath,
        names=["id", "event_timestamp"] + [f["name"] for f in feature_infos],
        dtype=dict([("id", np.string_)] + [(f["name"], f["dtype"])
                                           for f in feature_infos]),
        parse_dates=["event_timestamp"],
    )

    # Convert each CSV row into a FeatureRow protobuf and publish it to the topic
    for i, row in feature_values.iterrows():
        feature_row = FeatureRow()
        feature_row.entityKey = row["id"]
        feature_row.entityName = entity_name

        timestamp = Timestamp()
        timestamp.FromJsonString(
            row["event_timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ"))
        feature_row.eventTimestamp.CopyFrom(timestamp)

        for info in feature_infos:
            feature = Feature()
            feature.id = info["id"]
            feature_value = Value()
            feature_name = info["name"]
            if info["dtype"] is "Int64":
                feature_value.int64Val = row[feature_name]
            elif info["dtype"] is "Int32":
                feature_value.int32Val = row[feature_name]
            elif info["dtype"] is np.float64:
                feature_value.doubleVal = row[feature_name]
            else:
                raise RuntimeError(
                    f"Unsupported dtype: {info['dtype']}\n"
                    "Supported valueType: INT32, INT64, FLOAT, DOUBLE\n"
                    "Please update your feature specs in testdata/feature_specs folder"
                )
            feature.value.CopyFrom(feature_value)
            feature_row.features.extend([feature])

        producer.send(topic, feature_row.SerializeToString())
        producer.flush()
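A minimal invocation sketch for the helper above; the feature spec entries, CSV path, broker address, and topic name below are illustrative assumptions, not part of the original example.

import numpy as np

# Hypothetical feature specs; each entry mirrors the keys the helper reads
# ("id", "name", "dtype"). The values are placeholders for illustration.
feature_infos = [
    {"id": "customer.total_orders", "name": "total_orders", "dtype": "Int64"},
    {"id": "customer.avg_basket_value", "name": "avg_basket_value", "dtype": np.float64},
]

produce_feature_rows(
    entity_name="customer",
    feature_infos=feature_infos,
    feature_values_filepath="testdata/customer_feature_values.csv",  # assumed path
    bootstrap_servers="localhost:9092",  # assumed broker address
    topic="feast-customer-features",     # assumed topic name
)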
Example #2
File: stores.py Project: wzpy/feast
def upsert_feature_row(
    self,
    feature_set: FeatureSetProto.FeatureSetSpec,
    feature_row: FeatureRowProto.FeatureRow,
):
    # Build the row tuple: event timestamp first, then one value per entity
    # key, and finally the serialized FeatureRow itself.
    values = (feature_row.event_timestamp,)
    for entity in list(feature_set.entities):
        values = values + (get_feature_row_value_by_name(feature_row, entity.name),)
    values = values + (feature_row.SerializeToString(),)
    self._c.execute(build_sqlite_insert_feature_row_query(feature_set), values)
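The snippet relies on build_sqlite_insert_feature_row_query, which is defined elsewhere in stores.py. Below is a sketch of what such a builder could look like, inferred only from the tuple layout above; the table and column naming are assumptions, not Feast's actual implementation.

def build_sqlite_insert_feature_row_query(feature_set):
    # Assumed layout: one column for the event timestamp, one per entity
    # key, and a trailing column holding the serialized FeatureRow bytes.
    table_name = feature_set.name  # assumption: table named after the feature set
    columns = ["event_timestamp"] + [e.name for e in feature_set.entities] + ["value"]
    placeholders = ", ".join("?" for _ in columns)
    return (
        f"INSERT OR REPLACE INTO {table_name} ({', '.join(columns)}) "
        f"VALUES ({placeholders})"
    )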
Example #3
def _encode_pa_tables(
    file: str, feature_set: str, fields: dict, ingestion_id: str, row_group_idx: int
) -> List[bytes]:
    """
    Helper function to encode a PyArrow table read from a parquet file into
    FeatureRows.

    This function accepts the path to a parquet file and the index of a row
    group to read. When several parquet files are encoded, they must all share
    the same schema.

    Each parquet file is read in as a table and encoded into FeatureRows,
    typically using a pool of max_workers workers.

    Args:
        file (str):
            Path to the parquet file to encode.
            The parquet file must have more than one row group.

        feature_set (str):
            Feature set reference in the format f"{project}/{name}".

        fields (dict[str, enum.Enum.ValueType]):
            A mapping of field names to their value types.

        ingestion_id (str):
            UUID unique to this ingestion job.

        row_group_idx (int):
            Index of the row group to read and encode into byte-encoded
            FeatureRow protobuf objects.

    Returns:
        List[bytes]:
            List of byte-encoded FeatureRows from the parquet file.
    """
    pq_file = pq.ParquetFile(file)
    # Read parquet file as a PyArrow table
    table = pq_file.read_row_group(row_group_idx)

    # Add datetime column
    datetime_col = pa_column_to_timestamp_proto_column(table.column(DATETIME_COLUMN))

    # Preprocess the columns by converting all their values to Proto values
    proto_columns = {
        field_name: pa_column_to_proto_column(dtype, table.column(field_name))
        for field_name, dtype in fields.items()
    }

    # List to store result
    feature_rows: List[bytes] = []

    # Loop optimization declaration(s)
    field = FieldProto.Field
    proto_items = proto_columns.items()
    append = feature_rows.append

    # Iterate through the rows
    for row_idx in range(table.num_rows):
        feature_row = FeatureRow(
            event_timestamp=datetime_col[row_idx],
            feature_set=feature_set,
            ingestion_id=ingestion_id,
        )
        # Loop optimization declaration
        ext = feature_row.fields.extend

        # Insert field from each column
        for k, v in proto_items:
            ext([field(name=k, value=v[row_idx])])

        # Append FeatureRow in byte string form
        append(feature_row.SerializeToString())

    return feature_rows
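A hedged driver sketch showing how this helper might be fanned out over every row group of a file with a process pool, as the docstring suggests; the file path, feature set reference, fields mapping, and worker count are assumptions.

import uuid
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import pyarrow.parquet as pq

parquet_path = "testdata/driver_features.parquet"  # assumed path
fields = {}  # assumed: field name -> Feast value-type enum, normally built from the feature set spec

encode = partial(
    _encode_pa_tables,
    parquet_path,
    "my_project/driver_features",  # assumed feature set reference
    fields,
    str(uuid.uuid4()),             # ingestion id for this job
)

num_row_groups = pq.ParquetFile(parquet_path).num_row_groups
with ProcessPoolExecutor(max_workers=4) as pool:
    feature_rows = [
        row for rows in pool.map(encode, range(num_row_groups)) for row in rows
    ]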
Example #4
def log_feature_row_messages(bootstrap_servers, topic):
    consumer = KafkaConsumer(topic, bootstrap_servers=bootstrap_servers)
    for record in consumer:
        # Deserialize the Kafka message payload back into a FeatureRow proto
        feature_row = FeatureRow()
        feature_row.ParseFromString(record.value)
        print(feature_row)
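Paired with Example #1, this consumer can be pointed at the same topic to inspect what was published; the broker address and topic name here are assumptions.

log_feature_row_messages(bootstrap_servers="localhost:9092", topic="feast-customer-features")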