Example 1
    def __init__(
        self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None,
    ):
        """ Initializes a new FeatureStore object. Used to manage a feature store.

        Args:
            repo_path: Path to a `feature_store.yaml` used to configure the feature store
            config (RepoConfig): Configuration object used to configure the feature store
        """
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config")
        if config is not None:
            self.repo_path = Path(os.getcwd())
            self.config = config
        elif repo_path is not None:
            self.repo_path = Path(repo_path)
            self.config = load_repo_config(Path(repo_path))
        else:
            raise ValueError("Please specify one of repo_path or config")

        registry_config = self.config.get_registry_config()
        self._registry = Registry(
            registry_path=registry_config.path,
            repo_path=self.repo_path,
            cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
        )
        self._tele = Telemetry()
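A minimal usage sketch of the constructor above, assuming feast exposes the FeatureStore class shown here and that ./my_feature_repo is a hypothetical repository directory containing a feature_store.yaml:

from feast import FeatureStore

# Build from a repository path; the config is loaded from
# ./my_feature_repo/feature_store.yaml (hypothetical location).
store = FeatureStore(repo_path="./my_feature_repo")

# Alternatively, build from an in-memory RepoConfig. Passing both repo_path
# and config, or neither, raises ValueError as shown above.
# store = FeatureStore(config=my_repo_config)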
Example 2
    def __init__(
        self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None,
    ):
        self.repo_path = repo_path
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config")
        if config is not None:
            self.config = config
        elif repo_path is not None:
            self.config = load_repo_config(Path(repo_path))
        else:
            self.config = RepoConfig(
                registry="./registry.db",
                project="default",
                provider="local",
                online_store=OnlineStoreConfig(
                    local=LocalOnlineStoreConfig(path="online_store.db")
                ),
            )

        registry_config = self.config.get_registry_config()
        self._registry = Registry(
            registry_path=registry_config.path,
            cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
        )
        self._tele = Telemetry()
Example 3
    def __init__(
        self,
        repo_path: Optional[str] = None,
        config: Optional[RepoConfig] = None,
    ):
        """
        Creates a FeatureStore object.

        Raises:
            ValueError: If both or neither of repo_path and config are specified.
        """
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config.")
        if config is not None:
            self.repo_path = Path(os.getcwd())
            self.config = config
        elif repo_path is not None:
            self.repo_path = Path(repo_path)
            self.config = load_repo_config(Path(repo_path))
        else:
            raise ValueError("Please specify one of repo_path or config.")

        registry_config = self.config.get_registry_config()
        self._registry = Registry(
            registry_path=registry_config.path,
            repo_path=self.repo_path,
            cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
        )
Example 4
def registry_dump(repo_config: RepoConfig, repo_path: Path):
    """For debugging only: output contents of the metadata registry"""
    registry_config = repo_config.get_registry_config()
    project = repo_config.project
    registry = Registry(registry_config=registry_config, repo_path=repo_path)
    registry_dict = registry.to_dict(project=project)

    click.echo(json.dumps(registry_dict, indent=2, sort_keys=True))
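A hedged sketch of how registry_dump might be invoked, assuming repo_path points at a hypothetical repository directory containing a feature_store.yaml and using the load_repo_config loader seen in the constructor examples above:

repo_path = Path("./my_feature_repo")  # hypothetical repository location
repo_config = load_repo_config(repo_path)
registry_dump(repo_config, repo_path=repo_path)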
Example 5
def teardown(repo_config: RepoConfig, repo_path: Path):
    registry = Registry(repo_config.metadata_store)
    project = repo_config.project
    registry_tables: List[Union[FeatureTable, FeatureView]] = []
    registry_tables.extend(registry.list_feature_tables(project=project))
    registry_tables.extend(registry.list_feature_views(project=project))
    infra_provider = get_provider(repo_config)
    infra_provider.teardown_infra(project, tables=registry_tables)
Example 6
def registry_dump(repo_config: RepoConfig, repo_path: Path):
    """ For debugging only: output contents of the metadata registry """
    registry_config = repo_config.get_registry_config()
    project = repo_config.project
    registry = Registry(registry_config=registry_config, repo_path=repo_path)

    for entity in registry.list_entities(project=project):
        print(entity)
    for feature_view in registry.list_feature_views(project=project):
        print(feature_view)
Example 7
def registry_dump(repo_config: RepoConfig):
    """ For debugging only: output contents of the metadata registry """

    project = repo_config.project
    registry = Registry(repo_config.metadata_store)

    for entity in registry.list_entities(project=project):
        print(entity)
    for table in registry.list_feature_tables(project=project):
        print(table)
Example 8
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
) -> List[FeatureViewQueryContext]:
    """Build a query context containing all information required to template a BigQuery and Redshift point-in-time SQL query"""

    (
        feature_views_to_feature_map,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views, registry.list_on_demand_feature_views(project)
    )

    query_context = []
    for feature_view, features in feature_views_to_feature_map.items():
        join_keys = []
        entity_selections = []
        reverse_field_mapping = {
            v: k for k, v in feature_view.input.field_mapping.items()
        }
        for entity_name in feature_view.entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(join_key)
            entity_selections.append(f"{entity.join_key} AS {join_key}")

        if isinstance(feature_view.ttl, timedelta):
            ttl_seconds = int(feature_view.ttl.total_seconds())
        else:
            ttl_seconds = 0

        event_timestamp_column = feature_view.input.event_timestamp_column
        created_timestamp_column = feature_view.input.created_timestamp_column

        context = FeatureViewQueryContext(
            name=feature_view.projection.name_to_use(),
            ttl=ttl_seconds,
            entities=join_keys,
            features=features,
            event_timestamp_column=reverse_field_mapping.get(
                event_timestamp_column, event_timestamp_column
            ),
            created_timestamp_column=reverse_field_mapping.get(
                created_timestamp_column, created_timestamp_column
            ),
            # TODO: Make created column optional and not hardcoded
            table_subquery=feature_view.input.get_table_query_string(),
            entity_selections=entity_selections,
        )
        query_context.append(context)
    return query_context
Example 9
def apply_total(repo_config: RepoConfig, repo_path: Path):
    os.chdir(repo_path)
    sys.path.append("")

    project = repo_config.project
    registry = Registry(repo_config.metadata_store)
    repo = parse_repo(repo_path)

    for entity in repo.entities:
        registry.apply_entity(entity, project=project)

    repo_table_names = set(t.name for t in repo.feature_tables)
    tables_to_delete = []
    for registry_table in registry.list_feature_tables(project=project):
        if registry_table.name not in repo_table_names:
            tables_to_delete.append(registry_table)

    # Delete tables that should not exist
    for registry_table in tables_to_delete:
        registry.delete_feature_table(registry_table.name, project=project)

    for table in repo.feature_tables:
        registry.apply_feature_table(table, project)

    infra_provider = get_provider(repo_config)
    infra_provider.update_infra(project,
                                tables_to_delete=tables_to_delete,
                                tables_to_keep=repo.feature_tables)

    print("Done!")
Example 10
def teardown(repo_config: RepoConfig, repo_path: Path):
    registry_config = repo_config.get_registry_config()
    registry = Registry(
        registry_path=registry_config.path,
        cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
    )
    project = repo_config.project
    registry_tables: List[Union[FeatureTable, FeatureView]] = []
    registry_tables.extend(registry.list_feature_tables(project=project))
    registry_tables.extend(registry.list_feature_views(project=project))
    infra_provider = get_provider(repo_config)
    infra_provider.teardown_infra(project, tables=registry_tables)
Example 11
def registry_dump(repo_config: RepoConfig):
    """ For debugging only: output contents of the metadata registry """
    registry_config = repo_config.get_registry_config()
    project = repo_config.project
    registry = Registry(
        registry_path=registry_config.path,
        cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
    )

    for entity in registry.list_entities(project=project):
        print(entity)
    for table in registry.list_feature_tables(project=project):
        print(table)
Example 12
def registry_dump(repo_config: RepoConfig, repo_path: Path):
    """ For debugging only: output contents of the metadata registry """
    from colorama import Fore, Style

    registry_config = repo_config.get_registry_config()
    project = repo_config.project
    registry = Registry(registry_config=registry_config, repo_path=repo_path)
    registry_dict = registry.to_dict(project=project)

    warning = (
        "Warning: The registry-dump command is for debugging only and may contain "
        "breaking changes in the future. No guarantees are made on this interface."
    )
    click.echo(f"{Style.BRIGHT}{Fore.YELLOW}{warning}{Style.RESET_ALL}")
    click.echo(json.dumps(registry_dict, indent=2))
Example 13
def s3_registry():
    registry_config = RegistryConfig(
        path=f"s3://feast-integration-tests/registries/{int(time.time() * 1000)}/registry.db",
        cache_ttl_seconds=600,
    )
    return Registry(registry_config, None)
Example 14
    def refresh_registry(self):
        """Fetches and caches a copy of the feature registry in memory.

        Explicitly calling this method allows for direct control of the state of the registry cache. Every time this
        method is called the complete registry state will be retrieved from the remote registry store backend
        (e.g., GCS, S3), and the cache timer will be reset. If refresh_registry() is run before get_online_features()
        is called, then get_online_features() will use the cached registry instead of retrieving (and caching) the
        registry itself.

        Additionally, the TTL for the registry cache can be set to infinity (by setting it to 0), which means that
        refresh_registry() will become the only way to update the cached registry. If the TTL is set to a value
        greater than 0, then once the cache becomes stale (more time than the TTL has passed), a new cache will be
        downloaded synchronously, which may increase latencies if the triggering method is get_online_features().
        """
        registry_config = self.config.get_registry_config()
        self._registry = Registry(registry_config, repo_path=self.repo_path)
        self._registry.refresh()
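A short sketch of the refresh pattern described in the docstring above, assuming a hypothetical repository at "." whose registry cache TTL is set to 0 so the cache never expires on its own:

store = FeatureStore(repo_path=".")
store.refresh_registry()  # fetch the registry from the remote store backend and reset the cache timer
# Subsequent get_online_features() calls read from the freshly cached registry.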
Example 15
File: gcp.py Project: smarthi/feast
    def materialize_single_feature_view(
        self,
        feature_view: FeatureView,
        start_date: datetime,
        end_date: datetime,
        registry: Registry,
        project: str,
        tqdm_builder: Callable[[int], tqdm],
    ) -> None:
        entities = []
        for entity_name in feature_view.entities:
            entities.append(registry.get_entity(entity_name, project))

        (
            join_key_columns,
            feature_name_columns,
            event_timestamp_column,
            created_timestamp_column,
        ) = _get_column_names(feature_view, entities)

        start_date = utils.make_tzaware(start_date)
        end_date = utils.make_tzaware(end_date)

        table = self.offline_store.pull_latest_from_table_or_query(
            data_source=feature_view.input,
            join_key_columns=join_key_columns,
            feature_name_columns=feature_name_columns,
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            start_date=start_date,
            end_date=end_date,
        )

        if feature_view.input.field_mapping is not None:
            table = _run_field_mapping(table, feature_view.input.field_mapping)

        join_keys = [entity.join_key for entity in entities]
        rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

        with tqdm_builder(len(rows_to_write)) as pbar:
            self.online_write_batch(
                project, feature_view, rows_to_write, lambda x: pbar.update(x)
            )

        feature_view.materialization_intervals.append((start_date, end_date))
        registry.apply_feature_view(feature_view, project)
Example 16
    def materialize_single_feature_view(
        self,
        feature_view: FeatureView,
        start_date: datetime,
        end_date: datetime,
        registry: Registry,
        project: str,
    ) -> None:
        assert isinstance(feature_view.input, BigQuerySource)

        entities = []
        for entity_name in feature_view.entities:
            entities.append(registry.get_entity(entity_name, project))

        (
            join_key_columns,
            feature_name_columns,
            event_timestamp_column,
            created_timestamp_column,
        ) = _get_column_names(feature_view, entities)

        start_date = utils.make_tzaware(start_date)
        end_date = utils.make_tzaware(end_date)

        offline_store = get_offline_store_from_sources([feature_view.input])
        table = offline_store.pull_latest_from_table_or_query(
            data_source=feature_view.input,
            join_key_columns=join_key_columns,
            feature_name_columns=feature_name_columns,
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            start_date=start_date,
            end_date=end_date,
        )

        if feature_view.input.field_mapping is not None:
            table = _run_field_mapping(table, feature_view.input.field_mapping)

        join_keys = [entity.join_key for entity in entities]
        rows_to_write = _convert_arrow_to_proto(table, feature_view, join_keys)

        self.online_write_batch(project, feature_view, rows_to_write, None)

        feature_view.materialization_intervals.append((start_date, end_date))
        registry.apply_feature_view(feature_view, project)
Example 17
def _get_join_keys(project: str, feature_views: List[FeatureView],
                   registry: Registry) -> Set[str]:
    join_keys = set()
    for feature_view in feature_views:
        entities = feature_view.entities
        for entity_name in entities:
            entity = registry.get_entity(entity_name, project)
            join_keys.add(entity.join_key)
    return join_keys
Example 18
def _tag_registry_services_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[FeatureService], List[FeatureService]]:
    services_to_keep: List[FeatureService] = repo.feature_services
    services_to_delete: List[FeatureService] = []
    repo_feature_service_names = set(t.name for t in repo.feature_services)
    for registry_service in registry.list_feature_services(project=project):
        if registry_service.name not in repo_feature_service_names:
            services_to_delete.append(registry_service)
    return services_to_keep, services_to_delete
Example 19
def _tag_registry_tables_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[FeatureTable], List[FeatureTable]]:
    tables_to_keep: List[FeatureTable] = repo.feature_tables
    tables_to_delete: List[FeatureTable] = []
    repo_table_names = set(t.name for t in repo.feature_tables)
    for registry_table in registry.list_feature_tables(project=project):
        if registry_table.name not in repo_table_names:
            tables_to_delete.append(registry_table)
    return tables_to_keep, tables_to_delete
Example 20
def _tag_registry_views_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[FeatureView], List[FeatureView]]:
    views_to_keep: List[FeatureView] = repo.feature_views
    views_to_delete: List[FeatureView] = []
    repo_feature_view_names = set(t.name for t in repo.feature_views)
    for registry_view in registry.list_feature_views(project=project):
        if registry_view.name not in repo_feature_view_names:
            views_to_delete.append(registry_view)
    return views_to_keep, views_to_delete
Example 21
def _tag_registry_entities_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[Entity], List[Entity]]:
    entities_to_keep: List[Entity] = repo.entities
    entities_to_delete: List[Entity] = []
    repo_entities_names = set([e.name for e in repo.entities])
    for registry_entity in registry.list_entities(project=project):
        if registry_entity.name not in repo_entities_names:
            entities_to_delete.append(registry_entity)
    return entities_to_keep, entities_to_delete
Example 22
    def __init__(
        self, repo_path: Optional[str] = None, config: Optional[RepoConfig] = None,
    ):
        if repo_path is not None and config is not None:
            raise ValueError("You cannot specify both repo_path and config")
        if config is not None:
            self.repo_path = Path(os.getcwd())
            self.config = config
        elif repo_path is not None:
            self.repo_path = Path(repo_path)
            self.config = load_repo_config(Path(repo_path))
        else:
            raise ValueError("Please specify one of repo_path or config")

        registry_config = self.config.get_registry_config()
        self._registry = Registry(
            registry_path=registry_config.path,
            repo_path=self.repo_path,
            cache_ttl=timedelta(seconds=registry_config.cache_ttl_seconds),
        )
Example 23
def _tag_registry_on_demand_feature_views_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[List[OnDemandFeatureView], List[OnDemandFeatureView]]:
    odfvs_to_keep: List[OnDemandFeatureView] = repo.on_demand_feature_views
    odfvs_to_delete: List[OnDemandFeatureView] = []
    repo_on_demand_feature_view_names = set(
        t.name for t in repo.on_demand_feature_views
    )
    for registry_odfv in registry.list_on_demand_feature_views(project=project):
        if registry_odfv.name not in repo_on_demand_feature_view_names:
            odfvs_to_delete.append(registry_odfv)
    return odfvs_to_keep, odfvs_to_delete
Example 24
def get_expected_join_keys(project: str,
                           feature_views: List["feast.FeatureView"],
                           registry: Registry) -> Set[str]:
    join_keys = set()
    for feature_view in feature_views:
        entities = feature_view.entities
        for entity_name in entities:
            entity = registry.get_entity(entity_name, project)
            join_key = feature_view.projection.join_key_map.get(
                entity.join_key, entity.join_key)
            join_keys.add(join_key)
    return join_keys
Example 25
def _tag_registry_views_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[Set[BaseFeatureView], Set[BaseFeatureView]]:
    views_to_keep: Set[BaseFeatureView] = cast(Set[BaseFeatureView], repo.feature_views)
    for request_fv in repo.request_feature_views:
        views_to_keep.add(request_fv)
    views_to_delete: Set[BaseFeatureView] = set()
    repo_feature_view_names = set(t.name for t in repo.feature_views)
    for registry_view in registry.list_feature_views(project=project):
        if registry_view.name not in repo_feature_view_names:
            views_to_delete.add(registry_view)
    return views_to_keep, views_to_delete
Example 26
def gcs_registry():
    from google.cloud import storage

    storage_client = storage.Client()
    bucket_name = f"feast-registry-test-{int(time.time() * 1000)}"
    bucket = storage_client.bucket(bucket_name)
    bucket = storage_client.create_bucket(bucket)
    bucket.add_lifecycle_delete_rule(
        age=14)  # delete buckets automatically after 14 days
    bucket.patch()
    bucket.blob("registry.db")
    return Registry(f"gs://{bucket_name}/registry.db", None, timedelta(600))
Example 27
    def materialize_single_feature_view(
        self,
        config: RepoConfig,
        feature_view: FeatureView,
        start_date: datetime,
        end_date: datetime,
        registry: Registry,
        project: str,
        tqdm_builder: Callable[[int], tqdm],
    ) -> None:
        set_usage_attribute("provider", self.__class__.__name__)

        entities = []
        for entity_name in feature_view.entities:
            entities.append(registry.get_entity(entity_name, project))

        (
            join_key_columns,
            feature_name_columns,
            event_timestamp_column,
            created_timestamp_column,
        ) = _get_column_names(feature_view, entities)

        offline_job = self.offline_store.pull_latest_from_table_or_query(
            config=config,
            data_source=feature_view.batch_source,
            join_key_columns=join_key_columns,
            feature_name_columns=feature_name_columns,
            event_timestamp_column=event_timestamp_column,
            created_timestamp_column=created_timestamp_column,
            start_date=start_date,
            end_date=end_date,
        )

        table = offline_job.to_arrow()

        if feature_view.batch_source.field_mapping is not None:
            table = _run_field_mapping(table,
                                       feature_view.batch_source.field_mapping)

        join_keys = {entity.join_key: entity.value_type for entity in entities}

        with tqdm_builder(table.num_rows) as pbar:
            for batch in table.to_batches(DEFAULT_BATCH_SIZE):
                rows_to_write = _convert_arrow_to_proto(
                    batch, feature_view, join_keys)
                self.online_write_batch(
                    self.repo_config,
                    feature_view,
                    rows_to_write,
                    lambda x: pbar.update(x),
                )
Example 28
def apply_feature_services(registry: Registry, project: str, repo: ParsedRepo):
    from colorama import Fore, Style

    # Determine which feature services should be deleted.
    existing_feature_services = registry.list_feature_services(project)
    for feature_service in repo.feature_services:
        if feature_service in existing_feature_services:
            existing_feature_services.remove(feature_service)

    # The remaining feature services in the list should be deleted.
    for feature_service_to_delete in existing_feature_services:
        registry.delete_feature_service(feature_service_to_delete.name,
                                        project)
        click.echo(
            f"Deleted feature service {Style.BRIGHT + Fore.GREEN}{feature_service_to_delete.name}{Style.RESET_ALL} "
            f"from registry")

    for feature_service in repo.feature_services:
        registry.apply_feature_service(feature_service, project=project)
        click.echo(
            f"Registered feature service {Style.BRIGHT + Fore.GREEN}{feature_service.name}{Style.RESET_ALL}"
        )
Example 29
def _tag_registry_entities_for_keep_delete(
    project: str, registry: Registry, repo: ParsedRepo
) -> Tuple[Set[Entity], Set[Entity]]:
    entities_to_keep: Set[Entity] = repo.entities
    entities_to_delete: Set[Entity] = set()
    repo_entities_names = set([e.name for e in repo.entities])
    for registry_entity in registry.list_entities(project=project):
        # Do not delete dummy entity.
        if (
            registry_entity.name not in repo_entities_names
            and registry_entity.name != DUMMY_ENTITY_NAME
        ):
            entities_to_delete.add(registry_entity)
    return entities_to_keep, entities_to_delete
Example 30
def gcs_registry() -> Registry:
    from google.cloud import storage

    storage_client = storage.Client()
    bucket_name = f"feast-registry-test-{int(time.time() * 1000)}"
    bucket = storage_client.bucket(bucket_name)
    bucket = storage_client.create_bucket(bucket)
    bucket.add_lifecycle_delete_rule(
        age=14)  # delete buckets automatically after 14 days
    bucket.patch()
    bucket.blob("registry.db")
    registry_config = RegistryConfig(path=f"gs://{bucket_name}/registry.db",
                                     cache_ttl_seconds=600)
    return Registry(registry_config, None)