Example #1
    def test_remote_ingestion_job(self):
        """ Test wating for the remote ingestion job to complete """
        class MockServicer(JobServiceServicer):
            """
            The RemoteRetrievalJob is expected to call GetJob until the job is done.
            This mock JobService returns RUNNING status on the first call, and DONE on the second.
            """

            # Statuses are listed in reverse order because GetJob pops from the end
            # of the list: RUNNING is returned first, then DONE.
            _job_statuses = [
                JobStatus.JOB_STATUS_DONE, JobStatus.JOB_STATUS_RUNNING
            ]
            _call_count = defaultdict(int)

            def GetJob(self, request, context):

                self._call_count["GetJob"] += 1
                return GetJobResponse(job=JobProto(
                    id="test",
                    type=JobType.RETRIEVAL_JOB,
                    status=self._job_statuses.pop(),
                    retrieval=JobProto.RetrievalJobMeta(output_location="foo"),
                ))

        mock_servicer = MockServicer()
        with mock_server(mock_servicer) as service:
            remote_job = RemoteRetrievalJob(service, lambda: {}, "test", "foo",
                                            datetime.now(), None)

            assert remote_job.get_output_file_uri(timeout_sec=2) == "foo"
            assert mock_servicer._call_count["GetJob"] == 2
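A minimal sketch of the polling pattern this test exercises, independent of the Feast protos: keep calling a GetJob-style callable until a terminal status comes back or a deadline passes. The helper name poll_until_done and the "DONE" sentinel below are illustrative, not part of the Feast API.

import time


def poll_until_done(get_job, timeout_sec: float, interval_sec: float = 0.1):
    """Call get_job() repeatedly until the returned job reports a terminal status."""
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        job = get_job()
        if job.status == "DONE":  # stand-in for JobStatus.JOB_STATUS_DONE
            return job
        time.sleep(interval_sec)
    raise TimeoutError("job did not reach a terminal status within timeout_sec")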
Example #2
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        output_location: Optional[str] = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a pandas DataFrame, the dataframe will be staged
                so that it is accessible to the Spark workers.
                If any of the feature tables' sources is a BigQuerySource, the entities will be
                uploaded to BigQuery; otherwise they are staged to remote file storage
                (derived from the configured staging location).
                It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            output_location: Specifies the path in a bucket to write the exported feature data files.
                If not provided, a unique subdirectory under the configured historical feature
                output location is used.

        Returns:
            A retrieval job object that can be used to monitor retrieval
            progress asynchronously, and to materialize the results.

        Examples:
            >>> from feast import Client, FileSource
            >>> from feast.data_format import ParquetFormat
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            ...     feature_refs, entity_source)
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, self.project)

        assert all(
            ft.batch_source.created_timestamp_column
            for ft in feature_tables), (
                "All BatchSources attached to retrieved FeatureTables "
                "must have specified `created_timestamp_column` to be used in "
                "historical dataset generation.")

        if output_location is None:
            output_location = os.path.join(
                self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_LOCATION),
                str(uuid.uuid4()),
            )
        output_format = self._config.get(opt.HISTORICAL_FEATURE_OUTPUT_FORMAT)
        feature_sources = [
            feature_table.batch_source for feature_table in feature_tables
        ]

        if isinstance(entity_source, pd.DataFrame):
            if any(
                    isinstance(source, BigQuerySource)
                    for source in feature_sources):
                first_bq_source = [
                    source for source in feature_sources
                    if isinstance(source, BigQuerySource)
                ][0]
                source_ref = table_reference_from_string(
                    first_bq_source.bigquery_options.table_ref)
                entity_source = stage_entities_to_bq(entity_source,
                                                     source_ref.project,
                                                     source_ref.dataset_id)
            else:
                entity_source = stage_entities_to_fs(
                    entity_source,
                    staging_location=self._config.get(
                        opt.SPARK_STAGING_LOCATION),
                    config=self._config,
                )

        if self._use_job_service:
            response = self._job_service.GetHistoricalFeatures(
                GetHistoricalFeaturesRequest(
                    feature_refs=feature_refs,
                    entity_source=entity_source.to_proto(),
                    project=self.project,
                    output_format=output_format,
                    output_location=output_location,
                ),
                **self._extra_grpc_params(),
            )
            return RemoteRetrievalJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
                output_file_uri=response.output_file_uri,
            )
        else:
            return start_historical_feature_retrieval_job(
                client=self,
                project=self.project,
                entity_source=entity_source,
                feature_tables=feature_tables,
                output_format=output_format,
                output_path=output_location,
            )
Example #3
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        project: str = None,
        output_location: str = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a pandas DataFrame, the dataframe will be exported to the staging
                location as a parquet file. It is also assumed that the column event_timestamp is present
                in the dataframe, and is of type datetime without timezone information.

                The user needs to make sure that the source (or staging location, if entity_source is
                a pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            project: Specifies the project that contains the feature tables
                which the requested features belong to.
            output_location: Specifies the path in a bucket to write the exported feature data files.

        Returns:
            A retrieval job object that can be used to monitor retrieval
            progress asynchronously, and to materialize the results.

        Examples:
            >>> from feast import Client, FileSource
            >>> from feast.data_format import ParquetFormat
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", ParquetFormat(), "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            ...     feature_refs, entity_source, project="my_project")
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
            "gs://some-bucket/output/"
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, project)

        if output_location is None:
            output_location = os.path.join(
                self._config.get(
                    CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
                str(uuid.uuid4()),
            )
        output_format = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

        if isinstance(entity_source, pd.DataFrame):
            staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
            entity_staging_uri = urlparse(
                os.path.join(staging_location, str(uuid.uuid4())))
            staging_client = get_staging_client(entity_staging_uri.scheme)
            with tempfile.NamedTemporaryFile() as df_export_path:
                entity_source.to_parquet(df_export_path.name)
                bucket = (None if entity_staging_uri.scheme == "file" else
                          entity_staging_uri.netloc)
                staging_client.upload_file(df_export_path.name, bucket,
                                           entity_staging_uri.path.lstrip("/"))
                entity_source = FileSource(
                    "event_timestamp",
                    ParquetFormat(),
                    entity_staging_uri.geturl(),
                )

        if self._use_job_service:
            response = self._job_service.GetHistoricalFeatures(
                GetHistoricalFeaturesRequest(
                    feature_refs=feature_refs,
                    entity_source=entity_source.to_proto(),
                    project=project,
                    output_location=output_location,
                ),
                **self._extra_grpc_params(),
            )
            return RemoteRetrievalJob(
                self._job_service,
                self._extra_grpc_params,
                response.id,
                output_file_uri=response.output_file_uri,
            )
        else:
            return start_historical_feature_retrieval_job(
                self,
                entity_source,
                feature_tables,
                output_format,
                os.path.join(output_location, str(uuid.uuid4())),
            )