Example No. 1
def stage_dataframe(df, event_timestamp_column: str, config: Config) -> FileSource:
    """
    Helper function to upload a pandas dataframe in parquet format to a temporary location (under
    SPARK_STAGING_LOCATION) and return it wrapped in a FileSource.

    Args:
        df: the pandas dataframe to upload.
        event_timestamp_column (str): the name of the timestamp column in the dataframe.
        config (Config): feast config.
    """
    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    with tempfile.NamedTemporaryFile() as f:
        df.to_parquet(f)

        file_url = urlunparse(
            get_staging_client(staging_uri.scheme, config).upload_fileobj(
                f,
                f.name,
                remote_path_prefix=os.path.join(staging_location, "dataframes"),
                remote_path_suffix=".parquet",
            )
        )

    return FileSource(
        event_timestamp_column=event_timestamp_column,
        file_format=ParquetFormat(),
        file_url=file_url,
    )
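A minimal usage sketch (not part of the original listing), staging a small pandas dataframe through stage_dataframe. The Config import path and the option key are assumptions; the helper itself only needs the staging location that opt.SPARK_STAGING_LOCATION resolves to.

import pandas as pd

from feast.config import Config  # assumed import path for the Config used above

df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        "event_timestamp": pd.to_datetime(["2021-06-01 10:00", "2021-06-01 11:00"]),
    }
)

# Assumption: Config accepts a dict of option overrides and the key below is
# what opt.SPARK_STAGING_LOCATION resolves to.
config = Config(options={"spark_staging_location": "s3://my-bucket/staging/"})

# Returns a FileSource pointing at <staging>/dataframes/<generated>.parquet
source = stage_dataframe(df, event_timestamp_column="event_timestamp", config=config)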
Example No. 2
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast_spark.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        generic_resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH),
        batch_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_BATCH_INGESTION_TEMPLATE_PATH, None
        ),
        stream_ingestion_resource_template_path=config.get(
            opt.SPARK_K8S_STREAM_INGESTION_TEMPLATE_PATH, None
        ),
        historical_retrieval_resource_template_path=config.get(
            opt.SPARK_K8S_HISTORICAL_RETRIEVAL_TEMPLATE_PATH, None
        ),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # azure-related arguments are None if not using Azure blob storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME, None),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY, None),
    )
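For orientation, a hedged sketch of the configuration this factory reads. The Config and ConfigOptions import paths are assumptions; the option constants are the ones referenced above, and the values are illustrative.

from feast.config import Config  # assumed import path
from feast.constants import ConfigOptions as opt  # assumed import path

config = Config(
    options={
        opt.SPARK_STAGING_LOCATION: "gs://my-bucket/staging/",
        opt.SPARK_K8S_NAMESPACE: "spark-operator",
        opt.SPARK_K8S_JOB_TEMPLATE_PATH: "/etc/feast/sparkapp-template.yaml",
        opt.SPARK_K8S_USE_INCLUSTER_CONFIG: "false",
    }
)

launcher = _k8s_launcher(config)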
Example No. 3
def stage_entities_to_fs(entity_source: pd.DataFrame, staging_location: str,
                         config: Config) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme, config)
    with tempfile.NamedTemporaryFile() as df_export_path:
        # prevent casting ns -> ms exception inside pyarrow
        entity_source["event_timestamp"] = entity_source[
            "event_timestamp"].dt.floor("ms")

        entity_source.to_parquet(df_export_path.name)

        with open(df_export_path.name, "rb") as f:
            staging_client.upload_fileobj(f,
                                          df_export_path.name,
                                          remote_uri=entity_staging_uri)

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
Example No. 4
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:

        with open(job_params.get_main_file_path()) as f:
            pyspark_script = f.read()

        pyspark_script_path = urlunparse(
            get_staging_client("s3").upload_fileobj(
                BytesIO(pyspark_script.encode("utf8")),
                local_path="historical_retrieval.py",
                remote_path_prefix=self._staging_location,
                remote_path_suffix=".py",
            ))

        step = _historical_retrieval_step(
            pyspark_script_path,
            args=job_params.get_arguments(),
            output_file_uri=job_params.get_destination_path(),
        )

        job_ref = self._submit_emr_job(step)

        return EmrRetrievalJob(
            self._emr_client(),
            job_ref,
            job_params.get_destination_path(),
        )
Example No. 5
def _upload_to_file_source(file_url: str, with_partitions: bool,
                           dest_path: str) -> None:
    """
    Uploads data into a FileSource. Currently supports GCS, S3 and Local FS.

    Args:
        file_url: file url of FileSource defined for FeatureTable
        with_partitions: whether to treat dest_path as a dir with a partitioned table
        dest_path: path to the file or dir to be uploaded
    """
    from urllib.parse import urlparse

    uri = urlparse(file_url)
    staging_client = get_staging_client(uri.scheme)

    if with_partitions:
        for path in glob.glob(os.path.join(dest_path, "**/*")):
            file_name = path.split("/")[-1]
            partition_col = path.split("/")[-2]
            staging_client.upload_file(
                path,
                uri.hostname,
                str(uri.path).strip("/") + "/" + partition_col + "/" +
                file_name,
            )
    else:
        file_name = dest_path.split("/")[-1]
        staging_client.upload_file(
            dest_path,
            uri.hostname,
            str(uri.path).strip("/") + "/" + file_name,
        )
Example No. 6
def _upload_jar(jar_s3_prefix: str, local_path: str) -> str:
    with open(local_path, "rb") as f:
        uri = urlparse(os.path.join(jar_s3_prefix, os.path.basename(local_path)))
        return urlunparse(
            get_staging_client(uri.scheme).upload_fileobj(
                f, local_path, remote_uri=uri,
            )
        )
Example No. 7
    def _stage_file(self, file_path: str, job_id: str) -> str:
        if not os.path.isfile(file_path):
            return file_path

        staging_client = get_staging_client("gs")
        blob_path = os.path.join(self.remote_path, job_id, os.path.basename(file_path),)
        staging_client.upload_file(file_path, self.staging_bucket, blob_path)

        return f"gs://{self.staging_bucket}/{blob_path}"
Example No. 8
def _upload_jar(jar_s3_prefix: str, jar_path: str) -> str:
    if jar_path.startswith("https://"):
        return jar_path
    with open(jar_path, "rb") as f:
        uri = urlparse(os.path.join(jar_s3_prefix, os.path.basename(jar_path)))
        return urlunparse(
            get_staging_client(uri.scheme).upload_fileobj(f,
                                                          jar_path,
                                                          remote_uri=uri))
Example No. 9
    def _stage_files(self, pyspark_script: str, job_id: str) -> str:
        staging_client = get_staging_client("gs")
        blob_path = os.path.join(
            self.remote_path,
            job_id,
            os.path.basename(pyspark_script),
        )
        staging_client.upload_file(pyspark_script, self.staging_bucket,
                                   blob_path)

        return f"gs://{self.staging_bucket}/{blob_path}"
Example No. 10
def _download_jar(remote_jar: str) -> str:
    remote_jar_parts = urlparse(remote_jar)

    local_temp_jar = tempfile.NamedTemporaryFile(suffix=".jar", delete=False)
    with local_temp_jar:
        shutil.copyfileobj(
            get_staging_client(remote_jar_parts.scheme).download_file(remote_jar_parts),
            local_temp_jar,
        )

    return local_temp_jar.name
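The jar helpers above are mirror images of each other; a minimal round-trip sketch with illustrative bucket and jar paths:

# Stage a local ingestion jar under an S3 prefix, then fetch it back to a
# temporary local file.
remote_jar = _upload_jar("s3://my-bucket/artifacts", "/opt/feast/ingestion-job.jar")
local_jar = _download_jar(remote_jar)  # e.g. /tmp/tmpXXXXXXXX.jar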
Example No. 11
    def _stage_file(self, file_path: str, job_id: str) -> str:
        if not os.path.isfile(file_path):
            return file_path

        staging_client = get_staging_client("gs")
        blob_path = os.path.join(
            self.remote_path, job_id, os.path.basename(file_path),
        ).lstrip("/")
        blob_uri_str = f"gs://{self.staging_bucket}/{blob_path}"
        with open(file_path, "rb") as f:
            staging_client.upload_fileobj(
                f, file_path, remote_uri=urlparse(blob_uri_str)
            )

        return blob_uri_str
Example No. 12
def _k8s_launcher(config: Config) -> JobLauncher:
    from feast.pyspark.launchers import k8s

    staging_location = config.get(opt.SPARK_STAGING_LOCATION)
    staging_uri = urlparse(staging_location)

    return k8s.KubernetesJobLauncher(
        namespace=config.get(opt.SPARK_K8S_NAMESPACE),
        resource_template_path=config.get(opt.SPARK_K8S_JOB_TEMPLATE_PATH,
                                          None),
        staging_location=staging_location,
        incluster=config.getboolean(opt.SPARK_K8S_USE_INCLUSTER_CONFIG),
        staging_client=get_staging_client(staging_uri.scheme, config),
        # azure-related arguments are None if not using Azure blob storage
        azure_account_name=config.get(opt.AZURE_BLOB_ACCOUNT_NAME),
        azure_account_key=config.get(opt.AZURE_BLOB_ACCOUNT_ACCESS_KEY),
    )
Example No. 13
def _upload_to_file_source(
    file_url: str, with_partitions: bool, dest_path: str, config: Config
) -> None:
    """
    Uploads data into a FileSource. Currently supports GCS, S3 and Local FS.

    Args:
        file_url: file url of FileSource defined for FeatureTable
        with_partitions: whether to treat dest_path as dir with partitioned table
        dest_path: path to file or dir to be uploaded
        config: Config instance to configure FileSource
    """
    from urllib.parse import urlparse

    uri = urlparse(file_url)
    staging_client = get_staging_client(uri.scheme, config)

    if with_partitions:
        for path in glob.glob(os.path.join(dest_path, "**/*")):
            file_name = path.split("/")[-1]
            partition_col = path.split("/")[-2]
            with open(path, "rb") as f:
                staging_client.upload_fileobj(
                    f,
                    path,
                    remote_uri=uri._replace(
                        path=str(uri.path).rstrip("/")
                        + "/"
                        + partition_col
                        + "/"
                        + file_name
                    ),
                )
    else:
        file_name = dest_path.split("/")[-1]
        with open(dest_path, "rb") as f:
            staging_client.upload_fileobj(
                f,
                dest_path,
                remote_uri=uri._replace(
                    path=str(uri.path).rstrip("/") + "/" + file_name
                ),
            )
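A usage sketch for the partitioned branch, with an illustrative local layout and bucket name. The glob above walks exactly one directory level, so dest_path is expected to contain one sub-directory per partition value.

from feast.config import Config  # assumed import path for the Config used above

# Illustrative local layout:
#   /tmp/export/date=2021-06-01/part-0000.parquet
#   /tmp/export/date=2021-06-02/part-0000.parquet
_upload_to_file_source(
    file_url="s3://my-bucket/feature-tables/drivers",
    with_partitions=True,
    dest_path="/tmp/export",
    config=Config(),  # default config; scheme-specific credentials come from it
)
# The files land under s3://my-bucket/feature-tables/drivers/date=.../part-0000.parquet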
Example No. 14
    def stage_dataframe(self, df: pandas.DataFrame, event_timestamp: str) -> FileSource:
        with tempfile.NamedTemporaryFile() as f:
            df.to_parquet(f)

            file_url = urlunparse(
                get_staging_client("s3").upload_fileobj(
                    f,
                    f.name,
                    remote_path_prefix=os.path.join(
                        self._staging_location, "dataframes"
                    ),
                    remote_path_suffix=".parquet",
                )
            )

        return FileSource(
            event_timestamp_column=event_timestamp,
            file_format=ParquetFormat(),
            file_url=file_url,
        )
Example No. 15
File: job.py Project: vjrkr/feast
    def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
        """
        Wait until the job is done and return an iterable over the rows of the
        result. In Feast 0.3 each row can only be an Avro record.

        Args:
            timeout_sec (int):
                Maximum number of seconds to wait for the job to finish. If
                "timeout_sec" is exceeded, an exception will be raised.

        Returns:
            Iterable of Avro rows.
        """
        uris = self.get_avro_files(timeout_sec)
        for file_uri in uris:
            file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
            file_obj.seek(0)
            avro_reader = fastavro.reader(file_obj)

            for record in avro_reader:
                yield record
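Since result() is a generator over Avro records, callers can stream it or materialize it; a minimal sketch, assuming job is an instance of the class that defines result() above:

import pandas as pd

rows = list(job.result(timeout_sec=600))  # fastavro yields dict-like records
df = pd.DataFrame(rows)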
Example No. 16
def apply_validation(
    client: "Client",
    feature_table: "FeatureTable",
    udf: ValidationUDF,
    validation_window_secs: int,
    include_py_libs=_UNSET,
):
    """
    Uploads the validation UDF code to the staging location and stores the path
    to the UDF code and the required Python libraries as FeatureTable labels.
    """
    include_py_libs = (
        include_py_libs if include_py_libs is not _UNSET else GE_PACKED_ARCHIVE
    )

    staging_location = client._config.get(ConfigOptions.SPARK_STAGING_LOCATION).rstrip(
        "/"
    )
    staging_scheme = urlparse(staging_location).scheme
    staging_client = get_staging_client(staging_scheme, client._config)

    pickled_code_fp = io.BytesIO(udf.pickled_code)
    remote_path = f"{staging_location}/udfs/{feature_table.name}/{udf.name}.pickle"
    staging_client.upload_fileobj(
        pickled_code_fp, f"{udf.name}.pickle", remote_uri=urlparse(remote_path)
    )

    feature_table.labels.update(
        {
            "_validation": json.dumps(
                dict(
                    name=udf.name,
                    pickled_code_path=remote_path,
                    include_archive_path=include_py_libs,
                )
            ),
            "_streaming_trigger_secs": str(validation_window_secs),
        }
    )
    client.apply_feature_table(feature_table)
Example No. 17
def stage_entities_to_fs(entity_source: pd.DataFrame,
                         staging_location: str) -> FileSource:
    """
    Dumps the given (entities) dataframe as a parquet file and stages it to remote file storage (a subdirectory of staging_location).

    :return: FileSource with remote destination path
    """
    entity_staging_uri = urlparse(
        os.path.join(staging_location, str(uuid.uuid4())))
    staging_client = get_staging_client(entity_staging_uri.scheme)
    with tempfile.NamedTemporaryFile() as df_export_path:
        entity_source.to_parquet(df_export_path.name)
        bucket = (None if entity_staging_uri.scheme == "file" else
                  entity_staging_uri.netloc)
        staging_client.upload_file(df_export_path.name, bucket,
                                   entity_staging_uri.path.lstrip("/"))

    # ToDo: support custom event_timestamp_column
    return FileSource(
        event_timestamp_column="event_timestamp",
        file_format=ParquetFormat(),
        file_url=entity_staging_uri.geturl(),
    )
Example No. 18
def export_source_to_staging_location(
    source: Union[pd.DataFrame, str], staging_location_uri: str
) -> List[str]:
    """
    Uploads a DataFrame as an Avro file to a remote staging location.

    The local staging location supported by this function is used for E2E
    tests; please do not use it.

    Args:
        source (Union[pd.DataFrame, str]):
            Source of data to be staged. Can be a pandas DataFrame or a file
            path.

            Only the following types of source are allowed:
                * Pandas DataFrame
                * Local Avro file
                * GCS Avro file
                * S3 Avro file
                * Azure Blob storage Avro file


        staging_location_uri (str):
            Remote staging location where DataFrame should be written.
            Examples:
                * gs://bucket/path/
                * s3://bucket/path/
                * wasbs://bucket@account_name.blob.core.windows.net/path/
                * file:///data/subfolder/

    Returns:
        List[str]:
            Returns a list containing the full path to the file(s) in the
            remote staging location.
    """

    uri = urlparse(staging_location_uri)

    # Prepare Avro file to be exported to staging location
    if isinstance(source, pd.DataFrame):
        # Remote gs staging location provided by serving
        dir_path, file_name, source_path = export_dataframe_to_local(df=source)
    elif isinstance(source, str):
        source_uri = urlparse(source)
        if source_uri.scheme in ["", "file"]:
            # Local file provided as a source
            dir_path = ""
            file_name = os.path.basename(source)
            source_path = os.path.abspath(
                os.path.join(source_uri.netloc, source_uri.path)
            )
        else:
            # gs, s3, azure blob file provided as a source.
            assert source_uri.hostname is not None
            return get_staging_client(source_uri.scheme).list_files(uri=source_uri)
    else:
        raise Exception(
            f"Only string and DataFrame types are allowed as a "
            f"source, {type(source)} was provided."
        )

    # Push data to required staging location
    with open(source_path, "rb") as f:
        get_staging_client(uri.scheme).upload_fileobj(
            f,
            source_path,
            remote_uri=uri._replace(path=str(uri.path).strip("/") + "/" + file_name),
        )

    # Clean up, remove local staging file
    if dir_path and isinstance(source, pd.DataFrame) and len(dir_path) > 4:
        shutil.rmtree(dir_path)

    return [staging_location_uri.rstrip("/") + "/" + file_name]
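A minimal usage sketch for the DataFrame branch, with an illustrative GCS staging URI:

import pandas as pd

df = pd.DataFrame(
    {
        "customer_id": [1, 2],
        "event_timestamp": pd.to_datetime(["2021-06-01", "2021-06-02"]),
    }
)

# The dataframe is written locally as Avro and then pushed under the staging
# prefix; the returned list holds the full remote path(s).
staged_uris = export_source_to_staging_location(df, "gs://my-bucket/staging")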
Example No. 19
    def _get_staging_client(self):
        uri = urlparse(self._staging_location)
        return get_staging_client(uri.scheme)
Example No. 20
    def get_historical_features(
        self,
        feature_refs: List[str],
        entity_source: Union[pd.DataFrame, FileSource, BigQuerySource],
        project: str = None,
    ) -> RetrievalJob:
        """
        Launch a historical feature retrieval job.

        Args:
            feature_refs: List of feature references that will be returned for each entity.
                Each feature reference should have the following format:
                "feature_table:feature" where "feature_table" & "feature" refer to
                the feature and feature table names respectively.
            entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
                If entity_source is a pandas DataFrame, the dataframe will be exported to the staging
                location as a parquet file. It is also assumed that the column event_timestamp is present
                in the dataframe and is of type datetime without timezone information.

                The user needs to make sure that the source (or the staging location, if entity_source is
                a pandas DataFrame) is accessible from the Spark cluster that will be used for the
                retrieval job.
            project: Specifies the project that contains the feature tables
                which the requested features belong to.

        Returns:
                A retrieval job object that can be used to monitor retrieval
                progress asynchronously and to materialize the results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> feast_client = Client(core_url="localhost:6565")
            >>> feature_refs = ["bookings:bookings_7d", "bookings:booking_14d"]
            >>> entity_source = FileSource("event_timestamp", "parquet", "gs://some-bucket/customer")
            >>> feature_retrieval_job = feast_client.get_historical_features(
            >>>     feature_refs, entity_source, project="my_project")
            >>> output_file_uri = feature_retrieval_job.get_output_file_uri()
                "gs://some-bucket/output/
        """
        feature_tables = self._get_feature_tables_from_feature_refs(
            feature_refs, project)
        output_location = os.path.join(
            self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_LOCATION),
            str(uuid.uuid4()),
        )
        output_format = self._config.get(
            CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)

        if isinstance(entity_source, pd.DataFrame):
            staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
            entity_staging_uri = urlparse(
                os.path.join(staging_location, str(uuid.uuid4())))
            staging_client = get_staging_client(entity_staging_uri.scheme)
            with tempfile.NamedTemporaryFile() as df_export_path:
                entity_source.to_parquet(df_export_path.name)
                bucket = (None if entity_staging_uri.scheme == "file" else
                          entity_staging_uri.netloc)
                staging_client.upload_file(df_export_path.name, bucket,
                                           entity_staging_uri.path.lstrip("/"))
                entity_source = FileSource(
                    "event_timestamp",
                    "created_timestamp",
                    ParquetFormat(),
                    entity_staging_uri.geturl(),
                )

        return start_historical_feature_retrieval_job(
            self,
            entity_source,
            feature_tables,
            output_format,
            os.path.join(output_location, str(uuid.uuid4())),
        )
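Complementing the doctest in the docstring above, a hedged sketch that passes a pandas DataFrame as the entity source (core URL, feature references and timestamps are illustrative):

import pandas as pd
from feast import Client

feast_client = Client(core_url="localhost:6565")

# Per the docstring, the entity dataframe must carry a timezone-naive
# event_timestamp column.
entity_df = pd.DataFrame(
    {
        "customer_id": [1001, 1002],
        "event_timestamp": pd.to_datetime(["2021-06-01 10:00", "2021-06-01 11:00"]),
    }
)

job = feast_client.get_historical_features(
    ["bookings:bookings_7d", "bookings:booking_14d"],
    entity_df,
    project="my_project",
)
output_file_uri = job.get_output_file_uri()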