def get_feature_table_wu(self, ingest_table):
    """
    Build the MLFeatureTable workunit describing one Feast feature table.

    Parameters
    ----------
        ingest_table:
            ingested Feast table; its "name", "features" and "entities"
            entries are read here

    Returns
    -------
        A MetadataWorkUnit, identified by the table name, that wraps the
        MetadataChangeEvent for the table snapshot.
    """

    table_name = ingest_table["name"]

    featuretable_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("feast", table_name),
        aspects=[],
    )

    feature_urns = [
        builder.make_ml_feature_urn(table_name, feature["name"])
        for feature in ingest_table["features"]
    ]

    # every entity is emitted as a primary key; when a table has several of
    # them they jointly form a composite key
    primary_key_urns = [
        builder.make_ml_primary_key_urn(table_name, entity["name"])
        for entity in ingest_table["entities"]
    ]

    table_properties = MLFeatureTablePropertiesClass(
        mlFeatures=feature_urns,
        mlPrimaryKeys=primary_key_urns,
    )
    featuretable_snapshot.aspects.append(table_properties)

    # wrap the snapshot in an MCE and hand it back as a workunit
    change_event = MetadataChangeEvent(proposedSnapshot=featuretable_snapshot)

    return MetadataWorkUnit(id=table_name, mce=change_event)
def _get_feature_workunit(
    self,
    feature_view: Union[FeatureView, OnDemandFeatureView],
    feature: Feature,
) -> MetadataWorkUnit:
    """
    Build the MLFeature work unit for one feature of a Feast feature view.

    The feature URN is namespaced by "<project>.<feature view name>". The
    sources attached to the feature depend on the kind of view:

    * FeatureView: whatever self._get_data_sources() returns for the view.
    * OnDemandFeatureView: one dataset URN per input request data source,
      followed by the data sources of every feature view referenced through
      an input projection.
    """

    qualified_view_name = f"{self.feature_store.project}.{feature_view.name}"

    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(qualified_view_name, feature.name),
        aspects=[StatusClass(removed=False)],
    )

    source_urns = []

    if isinstance(feature_view, FeatureView):
        source_urns = self._get_data_sources(feature_view)
    elif isinstance(feature_view, OnDemandFeatureView):
        request_sources = feature_view.input_request_data_sources
        if request_sources is not None:
            for request_source in request_sources.values():
                platform, dataset_name = self._get_data_source_details(
                    request_source
                )
                dataset_urn = builder.make_dataset_urn(
                    platform,
                    dataset_name,
                    self.source_config.environment,
                )
                source_urns.append(dataset_urn)

        projections = feature_view.input_feature_view_projections
        if projections is not None:
            for projection in projections.values():
                # resolve the projection back to its feature view so that
                # view's data sources can be reused
                projected_view = self.feature_store.get_feature_view(
                    projection.name
                )
                source_urns.extend(self._get_data_sources(projected_view))

    feature_properties = MLFeaturePropertiesClass(
        description=feature.labels.get("description"),
        dataType=self._get_field_type(feature.dtype, feature.name),
        sources=source_urns,
    )
    feature_snapshot.aspects.append(feature_properties)

    change_event = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

    # NOTE(review): the id is the bare feature name, so same-named features in
    # different views share a workunit id; confirm that is acceptable.
    return MetadataWorkUnit(id=feature.name, mce=change_event)
def get_feature_wu(self, ingest_table, ingest_feature):
    """
    Build the MLFeature workunit describing one Feast feature.

    Parameters
    ----------
        ingest_table:
            ingested Feast table; only its "name" is read
        ingest_feature:
            ingested Feast feature; supplies "name", "type" and the
            batch/stream source entries

    Returns
    -------
        A MetadataWorkUnit, identified by the feature name, that wraps the
        MetadataChangeEvent for the feature snapshot.
    """

    table_name = ingest_table["name"]
    feature_name = ingest_feature["name"]

    feature_snapshot = MLFeatureSnapshot(
        urn=builder.make_ml_feature_urn(table_name, feature_name),
        aspects=[],
    )

    # (presence key, platform key, dataset-name key) for each kind of source;
    # batch is handled before stream
    source_key_sets = (
        ("batch_source", "batch_source_platform", "batch_source_name"),
        ("stream_source", "stream_source_platform", "stream_source_name"),
    )

    source_urns = []
    for presence_key, platform_key, name_key in source_key_sets:
        if ingest_feature[presence_key] is None:
            continue
        source_urns.append(
            builder.make_dataset_urn(
                ingest_feature[platform_key],
                ingest_feature[name_key],
                self.config.env,
            )
        )

    # record the feature's data type together with its upstream datasets
    feature_properties = MLFeaturePropertiesClass(
        dataType=self.get_field_type(ingest_feature["type"], feature_name),
        sources=source_urns,
    )
    feature_snapshot.aspects.append(feature_properties)

    # wrap the snapshot in an MCE and hand it back as a workunit
    change_event = MetadataChangeEvent(proposedSnapshot=feature_snapshot)

    return MetadataWorkUnit(id=feature_name, mce=change_event)
def get_feature_group_wu(
    self, feature_group_details: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Generate an MLFeatureTable workunit for a SageMaker feature group.

    Parameters
    ----------
        feature_group_details:
            ingested SageMaker feature group from get_feature_group_details().
            The keys "FeatureGroupName", "FeatureDefinitions",
            "RecordIdentifierFeatureName", "FeatureGroupArn", "CreationTime"
            and "FeatureGroupStatus" are required; "Description" is optional.

    Returns
    -------
        A MetadataWorkUnit, identified by the feature group name, that wraps
        the MetadataChangeEvent for the feature group snapshot.

    Raises
    ------
        KeyError
            if one of the required keys is missing from feature_group_details
    """

    feature_group_name = feature_group_details["FeatureGroupName"]

    feature_group_snapshot = MLFeatureTableSnapshot(
        urn=builder.make_ml_feature_table_urn("sagemaker", feature_group_name),
        aspects=[
            # browse paths are rooted with a leading slash, matching the
            # "/feast/..." paths emitted for Feast feature views
            BrowsePathsClass(paths=[f"/sagemaker/{feature_group_name}"]),
        ],
    )

    # looked up once rather than on every iteration of the filter below
    record_identifier_name = feature_group_details["RecordIdentifierFeatureName"]

    feature_group_snapshot.aspects.append(
        MLFeatureTablePropertiesClass(
            description=feature_group_details.get("Description"),
            # non-primary key features
            mlFeatures=[
                builder.make_ml_feature_urn(
                    feature_group_name,
                    feature["FeatureName"],
                )
                for feature in feature_group_details["FeatureDefinitions"]
                if feature["FeatureName"] != record_identifier_name
            ],
            # the record identifier is the group's only primary key
            mlPrimaryKeys=[
                builder.make_ml_primary_key_urn(
                    feature_group_name,
                    record_identifier_name,
                )
            ],
            # additional metadata
            customProperties={
                "arn": feature_group_details["FeatureGroupArn"],
                "creation_time": str(feature_group_details["CreationTime"]),
                "status": feature_group_details["FeatureGroupStatus"],
            },
        )
    )

    # make the MCE and workunit
    mce = MetadataChangeEvent(proposedSnapshot=feature_group_snapshot)

    return MetadataWorkUnit(id=feature_group_name, mce=mce)
def _get_on_demand_feature_view_workunit(
    self, on_demand_feature_view: OnDemandFeatureView
) -> MetadataWorkUnit:
    """
    Build the MLFeatureTable work unit for a Feast on-demand feature view.

    The view is named "<project>.<view name>". That qualified name is used
    for the table URN, the browse path, the feature URNs and the work unit
    id. No primary keys are emitted for an on-demand view.
    """

    project = self.feature_store.project
    on_demand_feature_view_name = f"{project}.{on_demand_feature_view.name}"

    table_urn = builder.make_ml_feature_table_urn(
        "feast", on_demand_feature_view_name
    )
    browse_paths = BrowsePathsClass(
        paths=[
            f"/feast/{self.feature_store.project}/{on_demand_feature_view_name}"
        ]
    )

    on_demand_feature_view_snapshot = MLFeatureTableSnapshot(
        urn=table_urn,
        aspects=[browse_paths, StatusClass(removed=False)],
    )

    feature_urns = [
        builder.make_ml_feature_urn(on_demand_feature_view_name, feature.name)
        for feature in on_demand_feature_view.features
    ]

    table_properties = MLFeatureTablePropertiesClass(
        mlFeatures=feature_urns,
        mlPrimaryKeys=[],
    )
    on_demand_feature_view_snapshot.aspects.append(table_properties)

    change_event = MetadataChangeEvent(
        proposedSnapshot=on_demand_feature_view_snapshot
    )

    return MetadataWorkUnit(id=on_demand_feature_view_name, mce=change_event)
def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkUnit:
    """
    Build the MLFeatureTable work unit for a Feast feature view.

    The view is named "<project>.<view name>". That qualified name is used
    for the table URN, the browse path, the feature and primary-key URNs and
    the work unit id. Each entry of feature_view.entities becomes one primary
    key.
    """

    project = self.feature_store.project
    feature_view_name = f"{project}.{feature_view.name}"

    table_urn = builder.make_ml_feature_table_urn("feast", feature_view_name)
    browse_paths = BrowsePathsClass(
        paths=[f"/feast/{self.feature_store.project}/{feature_view_name}"]
    )

    feature_view_snapshot = MLFeatureTableSnapshot(
        urn=table_urn,
        aspects=[browse_paths, StatusClass(removed=False)],
    )

    feature_urns = [
        builder.make_ml_feature_urn(feature_view_name, feature.name)
        for feature in feature_view.features
    ]
    primary_key_urns = [
        builder.make_ml_primary_key_urn(feature_view_name, entity_name)
        for entity_name in feature_view.entities
    ]

    table_properties = MLFeatureTablePropertiesClass(
        mlFeatures=feature_urns,
        mlPrimaryKeys=primary_key_urns,
    )
    feature_view_snapshot.aspects.append(table_properties)

    change_event = MetadataChangeEvent(proposedSnapshot=feature_view_snapshot)

    return MetadataWorkUnit(id=feature_view_name, mce=change_event)
def get_feature_wu(
    self, feature_group_details: Dict[str, Any], feature: Dict[str, Any]
) -> MetadataWorkUnit:
    """
    Build the workunit for a single SageMaker feature.

    The feature named by the group's RecordIdentifierFeatureName is emitted
    as an MLPrimaryKey; every other feature is emitted as an MLFeature. Both
    carry the same sources, derived from the group's offline store when it
    has one: the S3 location, plus the Glue table if a data catalog is
    configured.

    Parameters
    ----------
        feature_group_details:
            ingested SageMaker feature group from get_feature_group_details()
        feature:
            ingested SageMaker feature

    Returns
    -------
        A MetadataWorkUnit whose id is "<feature group name>-<feature name>".
    """

    # RecordIdentifierFeatureName is required when a feature group is created,
    # so it can be read without a presence check
    is_record_identifier = (
        feature["FeatureName"]
        == feature_group_details["RecordIdentifierFeatureName"]
    )

    feature_sources = []

    if "OfflineStoreConfig" in feature_group_details:
        offline_store = feature_group_details["OfflineStoreConfig"]

        # drop the leading "s3://" (5 characters) and one trailing slash, if any
        s3_name = offline_store["S3StorageConfig"]["S3Uri"][5:]
        s3_name = s3_name[:-1] if s3_name.endswith("/") else s3_name

        feature_sources.append(builder.make_dataset_urn("s3", s3_name, self.env))

        if "DataCatalogConfig" in offline_store:
            # a Glue catalog is associated with the offline store
            glue_database = offline_store["DataCatalogConfig"]["Database"]
            glue_table = offline_store["DataCatalogConfig"]["TableName"]

            full_table_name = f"{glue_database}.{glue_table}"

            self.report.report_warning(
                full_table_name,
                f"Note: table {full_table_name} is an AWS Glue object. \nTo view full table metadata, run Glue ingestion (see https://datahubproject.io/docs/metadata-ingestion/#aws-glue-glue)",
            )

            feature_sources.append(
                f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"
            )

    # OnlineStoreConfig is deliberately ignored: it only carries the security
    # config and an enabled flag, which is not enough to describe a dataset

    group_name = feature_group_details["FeatureGroupName"]
    feature_name = feature["FeatureName"]

    if not is_record_identifier:
        # ordinary feature
        feature_urn = builder.make_ml_feature_urn(group_name, feature_name)
        feature_properties = MLFeaturePropertiesClass(
            dataType=self.get_feature_type(feature["FeatureType"], feature_name),
            sources=feature_sources,
        )
        feature_snapshot: MLFeatureSnapshot = MLFeatureSnapshot(
            urn=feature_urn,
            aspects=[feature_properties],
        )
        mce = MetadataChangeEvent(proposedSnapshot=feature_snapshot)
    else:
        # the record identifier is ingested as a primary key
        primary_key_urn = builder.make_ml_primary_key_urn(group_name, feature_name)
        primary_key_properties = MLPrimaryKeyPropertiesClass(
            dataType=self.get_feature_type(feature["FeatureType"], feature_name),
            sources=feature_sources,
        )
        primary_key_snapshot: MLPrimaryKeySnapshot = MLPrimaryKeySnapshot(
            urn=primary_key_urn,
            aspects=[primary_key_properties],
        )
        mce = MetadataChangeEvent(proposedSnapshot=primary_key_snapshot)

    return MetadataWorkUnit(
        id=f'{feature_group_details["FeatureGroupName"]}-{feature["FeatureName"]}',
        mce=mce,
    )