Example #1
    def apply(
        self,
        objects: Union[
            Entity,
            FeatureView,
            FeatureService,
            List[Union[FeatureView, Entity, FeatureService]],
        ],
    ) -> None:
        fs = self._build_feast_feature_store()
        fs.apply(objects)

        # Applying also initializes the sqlite tables in the online store
        FlyteContext.current_context().file_access.upload(
            self.config.online_store_path,
            f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
        )
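A minimal usage sketch, assuming `store` is an instance of the Feast-backed wrapper class that defines `apply` above:

from feast import Entity, ValueType

# Register a single entity; `store` is assumed to be an instance of the wrapper class shown above.
driver = Entity(name="driver_id", value_type=ValueType.INT64, description="Driver ID")
store.apply(driver)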
Example #2
 def all(self, **kwargs) -> pyspark.sql.DataFrame:
     if self._fmt == SchemaFormat.PARQUET:
         ctx = FlyteContext.current_context().user_space_params
         return ctx.spark_session.read.parquet(self.from_path)
     raise AssertionError(
         "Only Parquet type files are supported for spark dataframe currently"
     )
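A usage sketch, assuming this `all` method belongs to a Spark schema reader such as flytekit's SparkDataFrameSchemaReader (the constructor arguments here are assumptions):

# Hypothetical reader pointing at a Parquet dataset; cols=None reads every column.
reader = SparkDataFrameSchemaReader(from_path="s3://bucket/data", cols=None, fmt=SchemaFormat.PARQUET)
df = reader.all()  # returns a pyspark.sql.DataFrame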
Example #3
    def pre_execute(self,
                    user_params: ExecutionParameters) -> ExecutionParameters:
        import pyspark as _pyspark

        ctx = FlyteContext.current_context()
        sess_builder = _pyspark.sql.SparkSession.builder.appName(
            f"FlyteSpark: {user_params.execution_id}")
        if not (ctx.execution_state and ctx.execution_state.mode
                == ExecutionState.Mode.TASK_EXECUTION):
            # Neither condition holds, so this is a local execution of the task.
            # Add system spark-conf for local/notebook-based execution.
            spark_conf = _pyspark.SparkConf()
            for k, v in self.task_config.spark_conf.items():
                spark_conf.set(k, v)
            # In local execution, propagate PYTHONPATH to executors too. This makes the spark
            # execution hermetic to the execution environment. For example, it allows running
            # Spark applications using Bazel, without major changes.
            if "PYTHONPATH" in os.environ:
                spark_conf.setExecutorEnv("PYTHONPATH",
                                          os.environ["PYTHONPATH"])
            sess_builder = sess_builder.config(conf=spark_conf)

        self.sess = sess_builder.getOrCreate()
        return user_params.builder().add_attr("SPARK_SESSION",
                                              self.sess).build()
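Once `pre_execute` has attached the session, a task body can retrieve it from the current execution parameters. A sketch, assuming the flytekitplugins-spark `Spark` task config and the `SPARK_SESSION` attribute added above:

import flytekit
from flytekit import task
from flytekitplugins.spark import Spark

@task(task_config=Spark(spark_conf={"spark.executor.instances": "2"}))
def count_rows(n: int) -> int:
    sess = flytekit.current_context().spark_session  # populated by pre_execute
    return sess.sparkContext.parallelize(range(n)).count()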
Example #4
    def get_online_features(
        self,
        features: Union[List[str], FeatureService],
        entity_rows: List[Dict[str, Any]],
        feature_refs: Optional[List[str]] = None,
        full_feature_names: bool = False,
    ) -> Dict[str, Any]:
        FlyteContext.current_context().file_access.download(
            f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
            self.config.online_store_path,
        )
        fs = self._build_feast_feature_store()

        online_response = fs.get_online_features(features, entity_rows,
                                                 feature_refs,
                                                 full_feature_names)
        return online_response.to_dict()
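A usage sketch with placeholder feature and entity names:

# Hypothetical call: fetch the latest values of two features for a single driver.
result = store.get_online_features(
    features=["driver_hourly_stats:conv_rate", "driver_hourly_stats:avg_daily_trips"],
    entity_rows=[{"driver_id": 1001}],
)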
Example #5
 def materialize(
     self,
     start_date: datetime,
     end_date: datetime,
     feature_views: Optional[List[str]] = None,
 ) -> None:
     FlyteContext.current_context().file_access.download(
         f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
         self.config.online_store_path,
     )
     fs = self._build_feast_feature_store()
     fs.materialize(
         start_date=start_date,
         end_date=end_date,
         feature_views=feature_views,
     )
     FlyteContext.current_context().file_access.upload(
         self.config.online_store_path,
         f"s3://{self.config.s3_bucket}/{self.config.online_store_path}",
     )
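A usage sketch that materializes the most recent day of data (the `store` name is an assumption):

from datetime import datetime, timedelta

# Load the last 24 hours of feature data into the online store; the method then syncs it back to S3.
now = datetime.utcnow()
store.materialize(start_date=now - timedelta(days=1), end_date=now)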
Example #6
 def __init__(self, base_dir: Optional[str] = None):
     if base_dir is None:
         self._base_dir = (
             FlyteContext.current_context().user_space_params.working_directory
         )
         self._files = []
     else:
         self._base_dir = base_dir
         files = os.listdir(base_dir)
         self._files = [os.path.join(base_dir, f) for f in files]
Example #7
def test_parameter_ranges_transformer():
    t = ParameterRangesTransformer()
    assert t.get_literal_type(ParameterRangeOneOf) == Generic.to_flyte_literal_type()
    o = ParameterRangeOneOf(param=IntegerParameterRange(10, 0, 1))
    ctx = FlyteContext.current_context()
    lit = t.to_literal(ctx, python_val=o, python_type=ParameterRangeOneOf, expected=None)
    assert lit is not None
    assert lit.scalar.generic is not None
    ro = t.to_python_value(ctx, lit, ParameterRangeOneOf)
    assert ro is not None
    assert ro == o
Example #8
    def execute(self, **kwargs) -> typing.Any:
        with tempfile.TemporaryDirectory() as temp_dir:
            ctx = FlyteContext.current_context()
            file_name = os.path.basename(self.task_config.uri)
            local_path = os.path.join(temp_dir, file_name)
            ctx.file_access.download(self.task_config.uri, local_path)
            if self.task_config.compressed:
                local_path = unarchive_file(local_path, temp_dir)

            print(f"Connecting to db {local_path}")
            with contextlib.closing(sqlite3.connect(local_path)) as con:
                df = pd.read_sql_query(self.get_query(**kwargs), con)
                return df
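For reference, a sketch of how such a task is typically constructed, mirroring flytekit's documented SQLite3Task usage (the URI and query are placeholders):

from flytekit import kwtypes
from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task

example_task = SQLite3Task(
    name="my_sqlite_query",
    query_template="select * from tracks limit {{ .inputs.limit }}",
    inputs=kwtypes(limit=int),
    task_config=SQLite3Config(uri="https://example.com/data.zip", compressed=True),
)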
Example #9
 def decode(
     self,
     ctx: FlyteContext,
     flyte_value: literals.StructuredDataset,
     current_task_metadata: StructuredDatasetMetadata,
 ) -> DataFrame:
     user_ctx = FlyteContext.current_context().user_space_params
     if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
         columns = [
             c.name
             for c in current_task_metadata.structured_dataset_type.columns
         ]
         return user_ctx.spark_session.read.parquet(
             flyte_value.uri).select(*columns)
     return user_ctx.spark_session.read.parquet(flyte_value.uri)
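A sketch of how a decoder like this gets registered so the type engine can hand Spark DataFrames to tasks, assuming flytekit's StructuredDatasetTransformerEngine API and that this method belongs to a ParquetToSparkDecodingHandler-style class:

from flytekit.types.structured.structured_dataset import StructuredDatasetTransformerEngine

# Register the handler so structured datasets stored as Parquet decode to pyspark DataFrames.
StructuredDatasetTransformerEngine.register(ParquetToSparkDecodingHandler())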
Example #10
    def execute_from_model(self, tt: task_models.TaskTemplate, **kwargs) -> typing.Any:
        with tempfile.TemporaryDirectory() as temp_dir:
            ctx = FlyteContext.current_context()
            file_name = os.path.basename(tt.custom["uri"])
            local_path = os.path.join(temp_dir, file_name)
            ctx.file_access.download(tt.custom["uri"], local_path)
            if tt.custom["compressed"]:
                local_path = unarchive_file(local_path, temp_dir)

            print(f"Connecting to db {local_path}")
            interpolated_query = SQLite3Task.interpolate_query(tt.custom["query_template"], **kwargs)
            print(f"Interpolated query {interpolated_query}")
            with contextlib.closing(sqlite3.connect(local_path)) as con:
                df = pd.read_sql_query(interpolated_query, con)
                return df
Example #11
    def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
        import pyspark as _pyspark

        ctx = FlyteContext.current_context()
        sess_builder = _pyspark.sql.SparkSession.builder.appName(f"FlyteSpark: {user_params.execution_id}")
        if not (ctx.execution_state and ctx.execution_state.mode == ExecutionState.Mode.TASK_EXECUTION):
            # This is a local execution of the task: add system spark-conf for local/notebook-based runs.
            spark_conf = _pyspark.SparkConf()
            for k, v in self.task_config.spark_conf.items():
                spark_conf.set(k, v)
            spark_conf.set("spark.master", "local")
            # The conf must actually be handed to the builder, otherwise it is silently discarded.
            sess_builder = sess_builder.config(conf=spark_conf)

        sess = sess_builder.getOrCreate()
        return user_params.builder().add_attr("SPARK_SESSION", sess).build()
Example #12
def test_hpoconfig_transformer():
    t = HPOTuningJobConfigTransformer()
    assert t.get_literal_type(HyperparameterTuningJobConfig) == Generic.to_flyte_literal_type()
    o = HyperparameterTuningJobConfig(
        tuning_strategy=1,
        tuning_objective=HyperparameterTuningObjective(
            objective_type=HyperparameterTuningObjectiveType.MINIMIZE,
            metric_name="x",
        ),
        training_job_early_stopping_type=TrainingJobEarlyStoppingType.OFF,
    )
    ctx = FlyteContext.current_context()
    lit = t.to_literal(ctx, python_val=o, python_type=HyperparameterTuningJobConfig, expected=None)
    assert lit is not None
    assert lit.scalar.generic is not None
    ro = t.to_python_value(ctx, lit, HyperparameterTuningJobConfig)
    assert ro is not None
    assert ro == o
Example #13
    def pre_execute(self, user_params: ExecutionParameters) -> ExecutionParameters:
        """
        Pre-execute for Sagemaker will automatically add the distributed context to the execution params, only
        if the number of execution instances is > 1. Otherwise this is considered to be a single node execution
        """
        if self._is_distributed():
            logging.info("Distributed context detected!")
            exec_state = FlyteContext.current_context().execution_state
            if exec_state and exec_state.mode == ExecutionState.Mode.TASK_EXECUTION:
                """
                This mode indicates we are actually in a remote execute environment (within sagemaker in this case)
                """
                dist_ctx = DistributedTrainingContext.from_env()
            else:
                dist_ctx = DistributedTrainingContext.local_execute()
            return user_params.builder().add_attr("DISTRIBUTED_TRAINING_CONTEXT", dist_ctx).build()

        return user_params
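A sketch of reading the attached context from a task body. The property name is taken from the `add_attr` call above, and the field names are assumptions based on the `from_env` constructor; availability depends on the flytekit version:

import flytekit

dist_ctx = flytekit.current_context().distributed_training_context
print(dist_ctx.current_host, dist_ctx.hosts)  # field names assumed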
Example #14
def record_outputs(**kwargs) -> str:
    """
    Use this method to record outputs from a notebook.
    It will convert all outputs to a Flyte understandable format. For Files, Directories, please use FlyteFile or
    FlyteDirectory, or wrap up your paths in these decorators.
    """
    if kwargs is None:
        return ""

    m = {}
    ctx = FlyteContext.current_context()
    for k, v in kwargs.items():
        expected = TypeEngine.to_literal_type(type(v))
        lit = TypeEngine.to_literal(ctx,
                                    python_type=type(v),
                                    python_val=v,
                                    expected=expected)
        m[k] = lit
    return LiteralMap(literals=m).to_flyte_idl()
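Typical usage is in the final, tagged cell of a papermill-executed notebook, recording the values that the wrapping task should expose as outputs (the names here are placeholders):

# In the notebook's output cell:
record_outputs(square=x * x, label="done")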
Example #15
    def execute(self, **kwargs) -> Any:
        """
        TODO: Figure out how to share FlyteContext ExecutionParameters with the notebook kernel (as notebook kernel
             is executed in a separate python process)
        For Spark, the notebooks today need to use the new_session or just getOrCreate session and get a handle to the
        singleton
        """
        logger.info(
            f"Hijacking the call for task-type {self.task_type}, to call notebook."
        )
        # Execute Notebook via Papermill.
        pm.execute_notebook(self._notebook_path,
                            self.output_notebook_path,
                            parameters=kwargs)  # type: ignore

        outputs = self.extract_outputs(self.output_notebook_path)
        self.render_nb_html(self.output_notebook_path,
                            self.rendered_output_path)

        m = {}
        if outputs:
            m = outputs.literals
        output_list = []
        for k, type_v in self.python_interface.outputs.items():
            if k == self._IMPLICIT_OP_NOTEBOOK:
                output_list.append(self.output_notebook_path)
            elif k == self._IMPLICIT_RENDERED_NOTEBOOK:
                output_list.append(self.rendered_output_path)
            elif k in m:
                v = TypeEngine.to_python_value(
                    ctx=FlyteContext.current_context(),
                    lv=m[k],
                    expected_python_type=type_v)
                output_list.append(v)
            else:
                raise RuntimeError(
                    f"Expected output {k} of type {v} not found in the notebook outputs"
                )

        return tuple(output_list)
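For completeness, a sketch of constructing a notebook-backed task whose `execute` is shown above, following the flytekitplugins-papermill pattern (the path and signatures are placeholders):

from flytekit import kwtypes
from flytekitplugins.papermill import NotebookTask

nb_task = NotebookTask(
    name="my_notebook_task",
    notebook_path="./demo.ipynb",
    inputs=kwtypes(n=int),
    outputs=kwtypes(square=int),
)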