Example 1
    def from_proto(cls, feature_table_proto: FeatureTableProto):
        """
        Creates a feature table from a protobuf representation of a feature table.

        Args:
            feature_table_proto: A protobuf representation of a feature table

        Returns:
            A FeatureTable object based on the feature table protobuf.
        """

        feature_table = cls(
            name=feature_table_proto.spec.name,
            entities=list(feature_table_proto.spec.entities),
            features=[
                Feature.from_proto(feature)
                for feature in feature_table_proto.spec.features
            ],
            labels=feature_table_proto.spec.labels,
            max_age=(None if feature_table_proto.spec.max_age.seconds == 0
                     and feature_table_proto.spec.max_age.nanos == 0 else
                     feature_table_proto.spec.max_age),
            batch_source=DataSource.from_proto(
                feature_table_proto.spec.batch_source),
            stream_source=(
                None
                if not feature_table_proto.spec.stream_source.ByteSize() else
                DataSource.from_proto(feature_table_proto.spec.stream_source)),
        )

        feature_table._created_timestamp = feature_table_proto.meta.created_timestamp

        return feature_table
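
A quick round-trip sketch to go with this example (hypothetical names; it assumes the class also provides the matching to_proto, which Feast pairs with from_proto):

    # Hypothetical round trip: serialize a feature table, then rebuild it.
    proto = my_table.to_proto()                  # -> FeatureTableProto
    rebuilt = FeatureTable.from_proto(proto)     # -> FeatureTable
    assert rebuilt.name == my_table.name         # fields survive the round trip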
Example 2
    def from_proto(cls, feature_view_proto: FeatureViewProto):
        """
        Creates a feature view from a protobuf representation of a feature view.

        Args:
            feature_view_proto: A protobuf representation of a feature view.

        Returns:
            A FeatureView object based on the feature view protobuf.
        """
        batch_source = DataSource.from_proto(feature_view_proto.spec.batch_source)
        stream_source = (
            DataSource.from_proto(feature_view_proto.spec.stream_source)
            if feature_view_proto.spec.HasField("stream_source")
            else None
        )
        feature_view = cls(
            name=feature_view_proto.spec.name,
            entities=list(feature_view_proto.spec.entities),
            features=[
                Feature(
                    name=feature.name,
                    dtype=ValueType(feature.value_type),
                    labels=dict(feature.labels),
                )
                for feature in feature_view_proto.spec.features
            ],
            tags=dict(feature_view_proto.spec.tags),
            online=feature_view_proto.spec.online,
            ttl=(
                None
                if feature_view_proto.spec.ttl.seconds == 0
                and feature_view_proto.spec.ttl.nanos == 0
                else feature_view_proto.spec.ttl
            ),
            batch_source=batch_source,
            stream_source=stream_source,
        )

        if feature_view_proto.meta.HasField("created_timestamp"):
            feature_view.created_timestamp = (
                feature_view_proto.meta.created_timestamp.ToDatetime()
            )
        if feature_view_proto.meta.HasField("last_updated_timestamp"):
            feature_view.last_updated_timestamp = (
                feature_view_proto.meta.last_updated_timestamp.ToDatetime()
            )

        for interval in feature_view_proto.meta.materialization_intervals:
            feature_view.materialization_intervals.append(
                (
                    utils.make_tzaware(interval.start_time.ToDatetime()),
                    utils.make_tzaware(interval.end_time.ToDatetime()),
                )
            )

        return feature_view
Example 3
    def from_proto(cls, feature_view_proto: FeatureViewProto):
        """
        Creates a feature view from a protobuf representation of a feature view.

        Args:
            feature_view_proto: A protobuf representation of a feature view.

        Returns:
            A FeatureView object based on the feature view protobuf.
        """
        batch_source = DataSource.from_proto(
            feature_view_proto.spec.batch_source)
        stream_source = (
            DataSource.from_proto(feature_view_proto.spec.stream_source)
            if feature_view_proto.spec.HasField("stream_source") else None)
        feature_view = cls(
            name=feature_view_proto.spec.name,
            entities=list(feature_view_proto.spec.entities),
            schema=[
                Field.from_proto(field_proto)
                for field_proto in feature_view_proto.spec.features
            ],
            description=feature_view_proto.spec.description,
            tags=dict(feature_view_proto.spec.tags),
            owner=feature_view_proto.spec.owner,
            online=feature_view_proto.spec.online,
            ttl=(timedelta(
                days=0) if feature_view_proto.spec.ttl.ToNanoseconds() == 0
                 else feature_view_proto.spec.ttl.ToTimedelta()),
            source=batch_source,
        )
        if stream_source:
            feature_view.stream_source = stream_source

        # FeatureViewProjections are not saved in the FeatureView proto.
        # Create the default projection.
        feature_view.projection = FeatureViewProjection.from_definition(
            feature_view)

        if feature_view_proto.meta.HasField("created_timestamp"):
            feature_view.created_timestamp = (
                feature_view_proto.meta.created_timestamp.ToDatetime())
        if feature_view_proto.meta.HasField("last_updated_timestamp"):
            feature_view.last_updated_timestamp = (
                feature_view_proto.meta.last_updated_timestamp.ToDatetime())

        for interval in feature_view_proto.meta.materialization_intervals:
            feature_view.materialization_intervals.append((
                utils.make_tzaware(interval.start_time.ToDatetime()),
                utils.make_tzaware(interval.end_time.ToDatetime()),
            ))

        return feature_view
Example 4
    def apply_data_source(self,
                          data_source: DataSource,
                          project: str,
                          commit: bool = True):
        """
        Registers a single data source with Feast

        Args:
            data_source: A data source that will be registered
            project: Feast project that this data source belongs to
            commit: Whether to immediately commit to the registry
        """
        registry = self._prepare_registry_for_changes()
        for idx, existing_data_source_proto in enumerate(
                registry.data_sources):
            if existing_data_source_proto.name == data_source.name:
                del registry.data_sources[idx]
                break
        data_source_proto = data_source.to_proto()
        data_source_proto.data_source_class_type = (
            f"{data_source.__class__.__module__}.{data_source.__class__.__name__}"
        )
        data_source_proto.project = project
        registry.data_sources.append(data_source_proto)
        if commit:
            self.commit()
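
A hedged usage sketch (FileSource keyword arguments vary across Feast versions, and registry construction is elided):

    # Register a file-backed source, replacing any existing one with the same name.
    source = FileSource(
        path="data/driver_stats.parquet",
        event_timestamp_column="event_timestamp",
    )
    registry.apply_data_source(source, project="my_project", commit=True)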
Example 5
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
        user: str = "user",
        auth: Optional[Authentication] = None,
        http_scheme: Optional[str] = None,
    ) -> RetrievalJob:
        if not isinstance(data_source, TrinoSource):
            raise ValueError(
                f"The data_source object is not a TrinoSource object but is instead a {type(data_source)}"
            )
        from_expression = data_source.get_table_query_string()

        client = _get_trino_client(config=config,
                                   user=user,
                                   auth=auth,
                                   http_scheme=http_scheme)
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 [event_timestamp_column])
        query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        """
        return TrinoRetrievalJob(
            query=query,
            client=client,
            config=config,
            full_feature_names=False,
        )
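
A call sketch under assumed names (repo_config and trino_source are placeholders, and the enclosing class is assumed to be TrinoOfflineStore; in practice Feast's retrieval machinery calls this method itself):

    job = TrinoOfflineStore.pull_all_from_table_or_query(
        config=repo_config,                      # RepoConfig with a Trino offline store
        data_source=trino_source,                # must be a TrinoSource, else ValueError
        join_key_columns=["driver_id"],
        feature_name_columns=["trips_today"],
        event_timestamp_column="event_timestamp",
        start_date=datetime(2022, 1, 1),
        end_date=datetime(2022, 1, 2),
    )
    df = job.to_df()                             # RetrievalJob results as a DataFrame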
Example 6
    def from_proto(cls, feature_view_proto: FeatureViewProto):
        """
        Creates a feature view from a protobuf representation of a feature view.

        Args:
            feature_view_proto: A protobuf representation of a feature view

        Returns:
            A FeatureView object based on the feature view protobuf.
        """

        feature_view = cls(
            name=feature_view_proto.spec.name,
            entities=list(feature_view_proto.spec.entities),
            features=[
                Feature(
                    name=feature.name,
                    dtype=ValueType(feature.value_type),
                    labels=feature.labels,
                ) for feature in feature_view_proto.spec.features
            ],
            tags=dict(feature_view_proto.spec.tags),
            online=feature_view_proto.spec.online,
            ttl=(None if feature_view_proto.spec.ttl.seconds == 0
                 and feature_view_proto.spec.ttl.nanos == 0 else
                 feature_view_proto.spec.ttl),
            input=DataSource.from_proto(feature_view_proto.spec.input),
        )

        feature_view.created_timestamp = feature_view_proto.meta.created_timestamp

        return feature_view
Example 7
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, RedshiftSource)
        from_expression = data_source.get_table_query_string()

        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 [event_timestamp_column])

        redshift_client = aws_utils.get_redshift_data_client(
            config.offline_store.region)
        s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

        start_date = start_date.astimezone(tz=utc)
        end_date = end_date.astimezone(tz=utc)

        query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        """

        return RedshiftRetrievalJob(
            query=query,
            redshift_client=redshift_client,
            s3_resource=s3_resource,
            config=config,
            full_feature_names=False,
        )
Example 8
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, BigQuerySource)
        from_expression = data_source.get_table_query_string()

        client = _get_bigquery_client(
            project=config.offline_store.project_id,
            location=config.offline_store.location,
        )
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 [event_timestamp_column])
        query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
        """
        return BigQueryRetrievalJob(
            query=query,
            client=client,
            config=config,
            full_feature_names=False,
        )
Example 9
    def pull_latest_from_table_or_query(
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> pyarrow.Table:
        assert isinstance(data_source, BigQuerySource)
        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamps)

        query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
            )
            WHERE _feast_row = 1
            """

        return BigQueryOfflineStore._pull_query(query)
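
With, say, join_key_columns=["driver_id"], feature_name_columns=["trips"], event_timestamp_column="ts", and no created_timestamp_column, the template above renders to roughly the following (table name and dates are placeholders):

    SELECT driver_id, trips, ts
    FROM (
        SELECT driver_id, trips, ts,
        ROW_NUMBER() OVER(PARTITION BY driver_id ORDER BY ts DESC) AS _feast_row
        FROM `project.dataset.table`
        WHERE ts BETWEEN TIMESTAMP('2022-01-01 00:00:00') AND TIMESTAMP('2022-01-02 00:00:00')
    )
    WHERE _feast_row = 1

ROW_NUMBER() numbers each partition's rows from newest to oldest, so the outer filter keeps exactly the latest row per join key.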
Example 10
    def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest,
                              context):
        """Produce a training dataset, return a job id that will provide a file reference"""

        if not self.is_whitelisted(request.project):
            raise ValueError(
                f"Project {request.project} is not whitelisted. Please contact your Feast administrator to whitelist it."
            )

        job = start_historical_feature_retrieval_job(
            client=self.client,
            project=request.project,
            entity_source=DataSource.from_proto(request.entity_source),
            feature_tables=self.client._get_feature_tables_from_feature_refs(
                list(request.feature_refs), request.project),
            output_format=request.output_format,
            output_path=request.output_location,
        )

        output_file_uri = job.get_output_file_uri(block=False)

        job_start_timestamp = Timestamp()
        job_start_timestamp.FromDatetime(job.get_start_time())

        return GetHistoricalFeaturesResponse(
            id=job.get_id(),
            output_file_uri=output_file_uri,
            job_start_time=job_start_timestamp,
        )
Example 11
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
        user: str = "user",
        auth: Optional[Authentication] = None,
        http_scheme: Optional[str] = None,
    ) -> TrinoRetrievalJob:
        if not isinstance(data_source, TrinoSource):
            raise ValueError(
                f"The data_source object is not a TrinoSource but is instead '{type(data_source)}'"
            )
        if not isinstance(config.offline_store, TrinoOfflineStoreConfig):
            raise ValueError(
                f"The config.offline_store object is not a TrinoOfflineStoreConfig but is instead '{type(config.offline_store)}'"
            )

        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamps)

        client = _get_trino_client(config=config,
                                   user=user,
                                   auth=auth,
                                   http_scheme=http_scheme)

        query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            )
            WHERE _feast_row = 1
            """

        # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized
        return TrinoRetrievalJob(
            query=query,
            client=client,
            config=config,
            full_feature_names=False,
        )
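
When join_key_columns is empty, the template injects a constant dummy-entity column so the result still carries a join key. The fragment logic in isolation (the constant values below are assumptions for illustration, not necessarily Feast's actual DUMMY_ENTITY_* values):

    # Assumed stand-ins for Feast's DUMMY_ENTITY_* constants.
    DUMMY_ENTITY_ID = "__dummy_id"
    DUMMY_ENTITY_VAL = ""

    join_key_columns = []
    fragment = (f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}"
                if not join_key_columns else "")
    print(fragment)  # -> , '' AS __dummy_id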
Example 12
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, PostgreSQLSource)
        from_expression = data_source.get_table_query_string()

        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 [event_timestamp_column])

        start_date = start_date.astimezone(tz=utc)
        end_date = end_date.astimezone(tz=utc)

        query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE "{event_timestamp_column}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz
        """

        return PostgreSQLRetrievalJob(
            query=query,
            config=config,
            full_feature_names=False,
            on_demand_feature_views=None,
        )
Example 13
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, SnowflakeSource)
        assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig)

        # Returns schema.table as a string
        from_expression = data_source.get_table_query_string()

        if join_key_columns:
            partition_by_join_key_string = '"' + '", "'.join(
                join_key_columns) + '"'
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        else:
            partition_by_join_key_string = ""

        timestamp_columns = [event_timestamp_column]
        if created_timestamp_column:
            timestamp_columns.append(created_timestamp_column)

        timestamp_desc_string = '"' + '" DESC, "'.join(
            timestamp_columns) + '" DESC'
        field_string = ('"' +
                        '", "'.join(join_key_columns + feature_name_columns +
                                    timestamp_columns) + '"')

        if data_source.snowflake_options.warehouse:
            config.offline_store.warehouse = data_source.snowflake_options.warehouse

        snowflake_conn = get_snowflake_conn(config.offline_store)

        query = f"""
            SELECT
                {field_string}
                {f''', TRIM({repr(DUMMY_ENTITY_VAL)}::VARIANT,'"') AS "{DUMMY_ENTITY_ID}"''' if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS "_feast_row"
                FROM {from_expression}
                WHERE "{event_timestamp_column}" BETWEEN TO_TIMESTAMP_NTZ({start_date.timestamp()}) AND TO_TIMESTAMP_NTZ({end_date.timestamp()})
            )
            WHERE "_feast_row" = 1
            """

        return SnowflakeRetrievalJob(
            query=query,
            snowflake_conn=snowflake_conn,
            config=config,
            full_feature_names=False,
            on_demand_feature_views=None,
        )
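
Snowflake folds unquoted identifiers to upper case, which is why every column above is individually double-quoted. The same joining trick as a standalone helper:

    def quote_and_join(columns):
        # ["driver_id", "ts"] -> '"driver_id", "ts"'
        return '"' + '", "'.join(columns) + '"'

    assert quote_and_join(["driver_id", "ts"]) == '"driver_id", "ts"'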
Example 14
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        spark_session = get_spark_session_or_start_new_with_repoconfig(
            config.offline_store)
        assert isinstance(config.offline_store, SparkOfflineStoreConfig)
        assert isinstance(data_source, SparkSource)

        warnings.warn(
            "The spark offline store is an experimental feature in alpha development. "
            "Some functionality may still be unstable so functionality can change in the future.",
            RuntimeWarning,
        )

        print("Pulling latest features from spark offline store")

        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamps)

        start_date_str = _format_datetime(start_date)
        end_date_str = _format_datetime(end_date)
        query = f"""
                SELECT
                    {field_string}
                    {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
                FROM (
                    SELECT {field_string},
                    ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_
                    FROM {from_expression} t1
                    WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}')
                ) t2
                WHERE feast_row_ = 1
                """

        return SparkRetrievalJob(
            spark_session=spark_session,
            query=query,
            full_feature_names=False,
            on_demand_feature_views=None,
        )
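
_format_datetime is a module-level helper not shown here; judging from the TIMESTAMP('{...}') literals it feeds, a plausible sketch is the following (an assumption, not the verified implementation):

    from datetime import datetime
    from pytz import utc

    def _format_datetime(t: datetime) -> str:
        # Normalize to naive UTC and render as a Spark-friendly timestamp literal.
        if t.tzinfo is not None:
            t = t.astimezone(tz=utc).replace(tzinfo=None)
        return t.strftime("%Y-%m-%d %H:%M:%S.%f")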
Example 15
    def _to_data_source(cls, data_source):
        """
        Converts a dict-like data source definition into a DataSource proto.
        """

        source_type = SourceType(data_source.type).name

        if (
            source_type == "BATCH_FILE"
            and data_source.file_options.file_format
            and data_source.file_options.file_url
        ):
            data_source_options = FileOptions(
                file_format=data_source.file_options.file_format,
                file_url=data_source.file_options.file_url,
            )
        elif source_type == "BATCH_BIGQUERY" and data_source.bigquery_options.table_ref:
            data_source_options = BigQueryOptions(
                table_ref=data_source.bigquery_options.table_ref,
            )
        elif (
            source_type == "STREAM_KAFKA"
            and data_source.kafka_options.bootstrap_servers
            and data_source.kafka_options.topic
            and data_source.kafka_options.class_path
        ):
            data_source_options = KafkaOptions(
                bootstrap_servers=data_source.kafka_options.bootstrap_servers,
                class_path=data_source.kafka_options.class_path,
                topic=data_source.kafka_options.topic,
            )
        elif (
            source_type == "STREAM_KINESIS"
            and data_source.kinesis_options.class_path
            and data_source.kinesis_options.region
            and data_source.kinesis_options.stream_name
        ):
            data_source_options = KinesisOptions(
                class_path=data_source.kinesis_options.class_path,
                region=data_source.kinesis_options.region,
                stream_name=data_source.kinesis_options.stream_name,
            )
        else:
            raise ValueError("Could not identify the source type being added")

        data_source_proto = DataSource(
            type=data_source.type,
            field_mapping=data_source.field_mapping,
            options=data_source_options,
            timestamp_column=data_source.timestamp_column,
            date_partition_column=data_source.date_partition_column,
        ).to_proto()

        return data_source_proto
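
A hedged usage sketch: the input only needs the attributes the branches above read, so SimpleNamespace can stand in for whatever the caller actually parses (enum value 1 for BATCH_FILE mirrors the SourceType usage in the YAML test example below; the enclosing class name is hypothetical):

    from types import SimpleNamespace

    parsed = SimpleNamespace(
        type=1,  # SourceType(1) is the batch-file source type here
        file_options=SimpleNamespace(file_format="avro",
                                     file_url="data/test.avro"),
        field_mapping={},
        timestamp_column="ts_col",
        date_partition_column="date_partition_col",
    )
    data_source_proto = SomeRegistryClass._to_data_source(parsed)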
Example 16
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, RedshiftSource)
        assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = (
                "PARTITION BY " + partition_by_join_key_string
            )
        timestamp_columns = [event_timestamp_column]
        if created_timestamp_column:
            timestamp_columns.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamp_columns) + " DESC"
        field_string = ", ".join(
            join_key_columns + feature_name_columns + timestamp_columns
        )

        redshift_client = aws_utils.get_redshift_data_client(
            config.offline_store.region
        )
        s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

        query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            )
            WHERE _feast_row = 1
            """
        # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized
        return RedshiftRetrievalJob(
            query=query,
            redshift_client=redshift_client,
            s3_resource=s3_resource,
            config=config,
            full_feature_names=False,
            on_demand_feature_views=None,
        )
Example 17
    def GetHistoricalFeatures(self, request, context):
        """Produce a training dataset, return a job id that will provide a file reference"""
        job = self.client.get_historical_features(
            request.feature_refs,
            entity_source=DataSource.from_proto(request.entity_source),
            project=request.project,
            output_location=request.output_location,
        )

        output_file_uri = job.get_output_file_uri(block=False)

        return GetHistoricalFeaturesResponse(id=job.get_id(),
                                             output_file_uri=output_file_uri)
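
A hedged sketch of the calling side of this RPC (the stub class name and channel address are assumptions; the request fields mirror exactly what the servicer reads):

    import grpc

    channel = grpc.insecure_channel("localhost:6566")  # placeholder address
    stub = JobServiceStub(channel)                     # assumed generated stub
    request = GetHistoricalFeaturesRequest(
        feature_refs=["driver:trips_today"],
        entity_source=entity_source_proto,             # a DataSource proto
        project="my_project",
        output_location="s3://bucket/output",
    )
    response = stub.GetHistoricalFeatures(request)
    print(response.id, response.output_file_uri)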
Example 18
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, PostgreSQLSource)
        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(
            _append_alias(join_key_columns, "a"))
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(_append_alias(timestamps,
                                                             "a")) + " DESC"
        a_field_string = ", ".join(
            _append_alias(join_key_columns + feature_name_columns + timestamps,
                          "a"))
        b_field_string = ", ".join(
            _append_alias(join_key_columns + feature_name_columns + timestamps,
                          "b"))

        query = f"""
            SELECT
                {b_field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {a_field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM ({from_expression}) a
                WHERE a."{event_timestamp_column}" BETWEEN '{start_date}'::timestamptz AND '{end_date}'::timestamptz
            ) b
            WHERE _feast_row = 1
            """

        return PostgreSQLRetrievalJob(
            query=query,
            config=config,
            full_feature_names=False,
            on_demand_feature_views=None,
        )
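
_append_alias is a helper from the same module; judging from how its output lands in the query (alias-prefixed, double-quoted identifiers), a plausible sketch is (an assumption, not the verified source):

    def _append_alias(field_names, alias):
        # ["driver_id", "ts"], "a" -> ['a."driver_id"', 'a."ts"']
        return [f'{alias}."{field_name}"' for field_name in field_names]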
Example 19
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, BigQuerySource)
        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamps)

        client = _get_bigquery_client(
            project=config.offline_store.project_id,
            location=config.offline_store.location,
        )
        query = f"""
            SELECT
                {field_string}
                {f", {repr(DUMMY_ENTITY_VAL)} AS {DUMMY_ENTITY_ID}" if not join_key_columns else ""}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
            )
            WHERE _feast_row = 1
            """

        # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized
        return BigQueryRetrievalJob(
            query=query,
            client=client,
            config=config,
            full_feature_names=False,
        )
Example 20
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, RedshiftSource)
        assert isinstance(config.offline_store, RedshiftOfflineStoreConfig)

        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamp_columns = [event_timestamp_column]
        if created_timestamp_column:
            timestamp_columns.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamp_columns) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamp_columns)

        redshift_client = aws_utils.get_redshift_data_client(
            config.offline_store.region)
        s3_resource = aws_utils.get_s3_resource(config.offline_store.region)

        query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
            )
            WHERE _feast_row = 1
            """
        return RedshiftRetrievalJob(
            query=query,
            redshift_client=redshift_client,
            s3_resource=s3_resource,
            config=config,
        )
Example 21
    def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest,
                              context):
        """Produce a training dataset, return a job id that will provide a file reference"""
        job = start_historical_feature_retrieval_job(
            client=self.client,
            project=request.project,
            entity_source=DataSource.from_proto(request.entity_source),
            feature_tables=self.client._get_feature_tables_from_feature_refs(
                list(request.feature_refs), request.project),
            output_format=request.output_format,
            output_path=request.output_location,
        )

        output_file_uri = job.get_output_file_uri(block=False)

        return GetHistoricalFeaturesResponse(id=job.get_id(),
                                             output_file_uri=output_file_uri)
Example 22
    def list_data_sources(self,
                          project: str,
                          allow_cache: bool = False) -> List[DataSource]:
        """
        Retrieves a list of data sources from the registry.

        Args:
            project: Filter data sources based on project name
            allow_cache: Whether to allow returning data sources from a cached registry

        Returns:
            List of data sources
        """
        registry_proto = self._get_registry_proto(allow_cache=allow_cache)
        data_sources = []
        for data_source_proto in registry_proto.data_sources:
            if data_source_proto.project == project:
                data_sources.append(DataSource.from_proto(data_source_proto))
        return data_sources
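
A quick usage sketch (registry construction is elided; names are illustrative):

    # List every data source registered under a project, serving from the cache.
    for source in registry.list_data_sources(project="my_project",
                                             allow_cache=True):
        print(source.name)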
Example 23
    def test_feature_table_import_export_yaml(self):

        batch_source = DataSource(
            type=SourceType(1).name,
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            options=FileOptions(file_format="avro", file_url="data/test.avro"),
            timestamp_column="ts_col",
            date_partition_column="date_partition_col",
        )

        stream_source = DataSource(
            type=SourceType(3).name,
            field_mapping={
                "ride_distance": "ride_distance",
                "ride_duration": "ride_duration",
            },
            options=KafkaOptions(
                bootstrap_servers="localhost:9094",
                class_path="random/path/to/class",
                topic="test_topic",
            ),
            timestamp_column="ts_col",
        )

        test_feature_table = FeatureTable(
            name="car_driver",
            features=[
                FeatureV2(name="ride_distance",
                          dtype=ValueType.FLOAT).to_proto(),
                FeatureV2(name="ride_duration",
                          dtype=ValueType.STRING).to_proto(),
            ],
            entities=["car_driver_entity"],
            labels={"team": "matchmaking"},
            batch_source=batch_source.to_proto(),
            stream_source=stream_source.to_proto(),
        )

        # Create a string YAML representation of the feature table
        string_yaml = test_feature_table.to_yaml()

        # Create a new feature table object from the YAML string
        actual_feature_table_from_string = FeatureTable.from_yaml(string_yaml)

        # Ensure equality is upheld to original feature table
        assert test_feature_table == actual_feature_table_from_string
Example 24
    def get_data_source(self,
                        name: str,
                        project: str,
                        allow_cache: bool = False) -> DataSource:
        """
        Retrieves a data source.

        Args:
            name: Name of data source
            project: Feast project that this data source belongs to
            allow_cache: Whether to allow returning this data source from a cached registry

        Returns:
            The specified data source, if found; otherwise raises DataSourceObjectNotFoundException.
        """
        registry = self._get_registry_proto(allow_cache=allow_cache)

        for data_source in registry.data_sources:
            if data_source.project == project and data_source.name == name:
                return DataSource.from_proto(data_source)
        raise DataSourceObjectNotFoundException(name, project=project)
Example 25
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        """
        Note that join_key_columns, feature_name_columns, and
        event_timestamp_column have all already been mapped to column names of
        the source table, and those column names are the values passed into
        this function.
        """
        assert isinstance(data_source, SparkSource)
        warnings.warn(
            "The spark offline store is an experimental feature in alpha development. "
            "This API is unstable and it could and most probably will be changed in the future.",
            RuntimeWarning,
        )

        spark_session = get_spark_session_or_start_new_with_repoconfig(
            store_config=config.offline_store)

        fields = ", ".join(join_key_columns + feature_name_columns +
                           [event_timestamp_column])
        from_expression = data_source.get_table_query_string()
        start_date = start_date.astimezone(tz=utc)
        end_date = end_date.astimezone(tz=utc)

        query = f"""
            SELECT {fields}
            FROM {from_expression}
            WHERE {event_timestamp_column} BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        """

        return SparkRetrievalJob(spark_session=spark_session,
                                 query=query,
                                 full_feature_names=False)
Example 26
    def pull_latest_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        created_timestamp_column: Optional[str],
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, BigQuerySource)
        from_expression = data_source.get_table_query_string()

        partition_by_join_key_string = ", ".join(join_key_columns)
        if partition_by_join_key_string != "":
            partition_by_join_key_string = ("PARTITION BY " +
                                            partition_by_join_key_string)
        timestamps = [event_timestamp_column]
        if created_timestamp_column:
            timestamps.append(created_timestamp_column)
        timestamp_desc_string = " DESC, ".join(timestamps) + " DESC"
        field_string = ", ".join(join_key_columns + feature_name_columns +
                                 timestamps)

        client = _get_bigquery_client(project=config.offline_store.project_id)
        query = f"""
            SELECT {field_string}
            FROM (
                SELECT {field_string},
                ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS _feast_row
                FROM {from_expression}
                WHERE {event_timestamp_column} BETWEEN TIMESTAMP('{start_date}') AND TIMESTAMP('{end_date}')
            )
            WHERE _feast_row = 1
            """
        return BigQueryRetrievalJob(query=query, client=client, config=config)
Example 27
    def pull_all_from_table_or_query(
        config: RepoConfig,
        data_source: DataSource,
        join_key_columns: List[str],
        feature_name_columns: List[str],
        event_timestamp_column: str,
        start_date: datetime,
        end_date: datetime,
    ) -> RetrievalJob:
        assert isinstance(data_source, SnowflakeSource)
        from_expression = data_source.get_table_query_string()

        field_string = ('"' +
                        '", "'.join(join_key_columns + feature_name_columns +
                                    [event_timestamp_column]) + '"')

        if data_source.snowflake_options.warehouse:
            config.offline_store.warehouse = data_source.snowflake_options.warehouse

        snowflake_conn = get_snowflake_conn(config.offline_store)

        start_date = start_date.astimezone(tz=utc)
        end_date = end_date.astimezone(tz=utc)

        query = f"""
            SELECT {field_string}
            FROM {from_expression}
            WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}'
        """

        return SnowflakeRetrievalJob(
            query=query,
            snowflake_conn=snowflake_conn,
            config=config,
            full_feature_names=False,
        )