def test_feature_store_create(
    sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri
):
    """Check the request that ``FeatureGroup.create`` hands to the session.

    The session is a mock, so the test only inspects the keyword arguments of
    the most recent ``create_feature_group`` call.
    """
    group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    group.feature_definitions = feature_group_dummy_definitions

    group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )

    # Full set of keyword arguments the session is expected to receive.
    expected_offline_config = {
        "DisableGlueTableCreation": False,
        "S3StorageConfig": {"S3Uri": s3_uri},
    }
    expected_request = {
        "feature_group_name": "MyFeatureGroup",
        "record_identifier_name": "feature1",
        "event_time_feature_name": "feature2",
        "feature_definitions": [fd.to_dict() for fd in feature_group_dummy_definitions],
        "role_arn": role_arn,
        "description": None,
        "tags": None,
        "online_store_config": {"EnableOnlineStore": True},
        "offline_store_config": expected_offline_config,
    }
    sagemaker_session_mock.create_feature_group.assert_called_with(**expected_request)
def save_to_feature_store():
    """Read the feature CSV from S3 and ingest it into a SageMaker feature group.

    The CSV at ``feature_s3_url`` is loaded into a DataFrame, every row is
    stamped with the current time in an ``EventTime`` column, and the rows are
    ingested into a feature group whose name carries a day/time suffix. The
    group is created first when ``feature_group_exist`` reports it is missing.
    The ``FeatureGroup`` object is stored in the module-level ``feature_group``
    global.
    """
    global feature_group

    logger.info("Save to FeatureStore started")
    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )

    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))

    # Cast object dtype to string. The SageMaker FeatureStore Python SDK will
    # then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    df_data[event_time_feature_name] = pd.Series(
        [current_time_sec] * len(df_data), dtype="float64"
    )

    feature_group_name = 'insurance-policy-feature-group-' + strftime(
        '%d-%H-%M-%S', gmtime())
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if feature group already exists. Create a feature group if doesn't exist.
    if not feature_group_exist(feature_group_name):
        # Log the name being looked up: the global ``feature_group`` is not
        # assigned yet at this point (or still refers to an older group).
        logger.info("Feature Group: %s doesn't exist. Create a new one.",
                    feature_group_name)
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

        # Load feature definitions to the feature group. SageMaker FeatureStore
        # Python SDK will auto-detect the data schema based on input data
        # (this includes the EventTime column appended above).
        feature_group.load_feature_definitions(data_frame=df_data)
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        logger.info("Feature Group: %s exists", feature_group_name)
        # Init feature group object if already exists
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
def create_or_load_feature_group(prefix, feature_group_name):
    """Return a ``FeatureGroup`` handle, attempting to create the group on the way.

    Both the initial wait and the creation attempt are best-effort: any
    exception is printed and swallowed, and the handle is returned regardless.
    """
    # (feature name, feature type) pairs for our records.
    # "review_body" is left out of the schema.
    schema = (
        ("input_ids", FeatureTypeEnum.STRING),
        ("input_mask", FeatureTypeEnum.STRING),
        ("segment_ids", FeatureTypeEnum.STRING),
        ("label_id", FeatureTypeEnum.INTEGRAL),
        ("review_id", FeatureTypeEnum.STRING),
        ("date", FeatureTypeEnum.STRING),
        ("label", FeatureTypeEnum.INTEGRAL),
        ("split_type", FeatureTypeEnum.STRING),
    )
    feature_definitions = [
        FeatureDefinition(feature_name=column, feature_type=column_type)
        for column, column_type in schema
    ]

    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print(f"Feature Group: {feature_group}")

    # Step 1: give a creation already in flight a chance to finish.
    try:
        print(
            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as wait_error:
        print(f"Before CREATE FG wait exeption: {wait_error}")

    # Step 2: try to create the group ourselves; failures are only reported.
    try:
        print(f"Creating Feature Group with role {role}...")
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name="review_id",
            event_time_feature_name="date",
            role_arn=role,
            enable_online_store=True,
        )
        print("Creating Feature Group. Completed.")

        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
        print("Feature Group available.")
        feature_group.describe()
    except Exception as create_error:
        print(f"Exception: {create_error}")

    return feature_group
def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    """Create a feature group from a frame without string columns and ingest it."""
    frame = pandas_data_frame_without_string
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=frame)

    with cleanup_feature_group(group):
        create_kwargs = dict(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        output = group.create(**create_kwargs)
        _wait_for_feature_group_create(group)

        # Start ingestion without blocking, then wait on the returned manager.
        manager = group.ingest(data_frame=frame, max_workers=3, wait=False)
        manager.wait()

    expected_suffix = f"feature-group/{feature_group_name}"
    assert output["FeatureGroupArn"].endswith(expected_suffix)
def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    """Create a feature group and ingest with two processes of three workers."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(group):
        create_kwargs = dict(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        output = group.create(**create_kwargs)
        _wait_for_feature_group_create(group)

        # Blocking ingest spread over multiple processes.
        group.ingest(
            data_frame=pandas_data_frame,
            max_workers=3,
            max_processes=2,
            wait=True,
        )

    expected_suffix = f"feature-group/{feature_group_name}"
    assert output["FeatureGroupArn"].endswith(expected_suffix)
def create_feature_group(
    feature_group_name,
    feature_group_description,
    df,
    id_name,
    event_time_name,
    offline_feature_group_bucket,
    sagemaker_session,
    role,
):
    """
    Create a new FeatureGroup, or fall back to the existing one with that name.

    Feature definitions are derived from ``df`` via ``get_feature_definitions``.
    A creation error whose service error code is ``ResourceInUse`` is treated
    as "the group already exists"; every other error is re-raised unchanged.

    :param feature_group_name: str, name of the feature group
    :param feature_group_description: str, description passed to ``create``
    :param df: pandas.DataFrame the feature definitions are derived from
    :param id_name: str, record identifier feature name
    :param event_time_name: str, event time feature name
    :param offline_feature_group_bucket: str, passed to ``create`` as ``s3_uri``
    :param sagemaker_session: sagemaker.Session()
    :param role: str, role ARN passed to ``create``
    :return: tuple(FeatureGroup, bool) -- the group handle and whether it
        already existed
    :raises Exception: any creation/wait error other than ``ResourceInUse``
    """
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)
    feature_group.feature_definitions = get_feature_definitions(df, feature_group)

    feature_group_already_exists = False
    try:
        print(f"Trying to create feature group {feature_group_description} \n")
        feature_group.create(
            description=feature_group_description,
            record_identifier_name=id_name,
            event_time_feature_name=event_time_name,
            role_arn=role,
            s3_uri=offline_feature_group_bucket,
            enable_online_store=True,
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        # Not every exception carries a ``response`` mapping; reading it
        # unconditionally would replace the real error with an AttributeError.
        response = getattr(e, "response", None)
        error_info = response.get("Error") if isinstance(response, dict) else None
        code = error_info.get("Code") if isinstance(error_info, dict) else None
        if code != "ResourceInUse":
            # Bare raise keeps the original exception and traceback intact.
            raise
        print(f"Using existing feature group: {feature_group_name}")
        feature_group_already_exists = True

    return feature_group, feature_group_already_exists
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    """End-to-end check: create a group, ingest, query via Athena, compare DDL.

    One record is written with ``put_record`` and the data frame is ingested
    asynchronously. The Athena table is then polled until it holds 11 rows
    (bounded by a 10 minute timeout), after which the row count, the missing
    ``feature4`` values and the generated Hive DDL are verified.
    """
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame, max_workers=3, wait=False
        )
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                # Only pause when another polling round is actually needed.
                if df.shape[0] < 11:
                    time.sleep(60)

        assert df.shape[0] == 11

        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        # Check the boolean values themselves: iterating ``nans.items()`` yields
        # (index, value) tuples, which are truthy even when the value is False.
        assert nans.all()

        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())

    assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
def test_feature_store_create(
    sagemaker_session_mock, role_arn, feature_group_dummy_definitions, s3_uri
):
    """Check the keyword arguments ``FeatureGroup.create`` passes to the session.

    Only the arguments listed in ``expected`` are verified; any additional
    keyword arguments in the recorded call are ignored.
    """
    feature_group = FeatureGroup(name="MyFeatureGroup", sagemaker_session=sagemaker_session_mock)
    feature_group.feature_definitions = feature_group_dummy_definitions
    feature_group.create(
        s3_uri=s3_uri,
        record_identifier_name="feature1",
        event_time_feature_name="feature2",
        role_arn=role_arn,
        enable_online_store=True,
    )

    expected = {
        "feature_group_name": "MyFeatureGroup",
        "record_identifier_name": "feature1",
        "event_time_feature_name": "feature2",
        "role_arn": role_arn,
        "online_store_config": {"EnableOnlineStore": True},
        "feature_definitions": [fd.to_dict() for fd in feature_group_dummy_definitions],
    }

    # ``mock.called_with(...)`` is not an assertion: it either returns a truthy
    # child mock or raises AttributeError, depending on the Python version.
    # Inspect the recorded call instead.
    create_call = sagemaker_session_mock.create_feature_group
    create_call.assert_called()
    actual_kwargs = create_call.call_args[1]
    for key, value in expected.items():
        assert key in actual_kwargs, f"missing keyword argument: {key}"
        assert actual_kwargs[key] == value, f"unexpected value for {key}"
def test_create_feature_store_online_only(
    feature_store_session,
    role,
    feature_group_name,
    pandas_data_frame,
):
    """Create a feature group with ``s3_uri=False`` and the online store enabled."""
    group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(group):
        create_kwargs = dict(
            s3_uri=False,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        output = group.create(**create_kwargs)
        _wait_for_feature_group_create(group)

    expected_suffix = f"feature-group/{feature_group_name}"
    assert output["FeatureGroupArn"].endswith(expected_suffix)
data['review_id'] = data['review_id'].astype('str').astype('string') data['product_id'] = data['product_id'].astype('str').astype('string') data['review_body'] = data['review_body'].astype('str').astype('string') data['label'] = data['label'].astype('str').astype('string') data['star_rating'] = data['star_rating'].astype('int64') data['event_time'] = data['event_time'].astype('float64') # Load feature definitions feature_group.load_feature_definitions(data_frame=data) # Create feature group feature_group.create( s3_uri='s3://{}/{}'.format(bucket, prefix), record_identifier_name=record_identifier_feature_name, event_time_feature_name=event_time_feature_name, role_arn=role, enable_online_store=True, description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset", tags=[ { 'Key': 'Dataset', 'Value': 'amazon customer reviews' }, { 'Key': 'Subset', 'Value': 'cameras' }, { 'Key': 'Owner', 'Value': 'Julien Simon' } ] ) # Wait for feature group to be ready while feature_group.describe().get("FeatureGroupStatus") != 'Created': sleep(1) print('Feature group created') # Ingest data print('Ingesting data...') try: feature_group.ingest(data_frame=data, max_workers=max_workers, wait=True)
def create_or_load_feature_group(prefix, feature_group_name):
    """Build a ``FeatureGroup`` handle and make a best-effort attempt to create it.

    Errors from the preliminary wait and from the creation attempt are printed
    and otherwise ignored; the handle is returned in every case.
    """
    # Feature name -> feature type for our records ("review_body" is not included).
    feature_types = {
        'input_ids': FeatureTypeEnum.STRING,
        'input_mask': FeatureTypeEnum.STRING,
        'segment_ids': FeatureTypeEnum.STRING,
        'label_id': FeatureTypeEnum.INTEGRAL,
        'review_id': FeatureTypeEnum.STRING,
        'date': FeatureTypeEnum.STRING,
        'label': FeatureTypeEnum.INTEGRAL,
        'split_type': FeatureTypeEnum.STRING,
    }
    feature_definitions = []
    for feature_name, feature_type in feature_types.items():
        feature_definitions.append(
            FeatureDefinition(feature_name=feature_name, feature_type=feature_type)
        )

    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print('Feature Group: {}'.format(feature_group))

    # First wait for a creation that may already be under way.
    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as wait_failure:
        print('Before CREATE FG wait exeption: {}'.format(wait_failure))

    # Then attempt the creation; a failure here is only reported.
    try:
        print('Creating Feature Group with role {}...'.format(role))
        offline_store_uri = f"s3://{bucket}/{prefix}"
        feature_group.create(
            s3_uri=offline_store_uri,
            record_identifier_name="review_id",
            event_time_feature_name="date",
            role_arn=role,
            enable_online_store=True,
        )
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        feature_group.describe()
    except Exception as create_failure:
        print('Exception: {}'.format(create_failure))

    return feature_group
def create_or_load_feature_group(prefix, feature_group_name):
    """Return a ``FeatureGroup`` handle after a best-effort creation attempt.

    Every feature is registered as a string, and the group is created with the
    online store disabled. Exceptions from waiting or creating are printed and
    swallowed.
    """
    # Feature Definitions for the records: all columns are string-typed.
    string_columns = (
        'review_id',
        'date',
        'sentiment',
        'label_id',
        'input_ids',
        'review_body',
        'split_type',
    )
    feature_definitions = [
        FeatureDefinition(feature_name=column, feature_type=FeatureTypeEnum.STRING)
        for column in string_columns
    ]

    # setup the Feature Group
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=sagemaker_session,
    )
    print(f'Feature Group: {feature_group}')

    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as wait_error:
        print(f'Before CREATE FG wait exeption: {wait_error}')

    try:
        print(f'Creating Feature Group with role {role}...')
        # create Feature Group
        creation_settings = {
            's3_uri': f"s3://{bucket}/{prefix}",
            'record_identifier_name': "review_id",
            'event_time_feature_name': "date",
            'role_arn': role,
            'enable_online_store': False,
        }
        feature_group.create(**creation_settings)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')

        # the information about the Feature Group
        feature_group.describe()
    except Exception as create_error:
        print(f'Exception: {create_error}')

    return feature_group
def test_one_step_ingestion_pipeline(
    sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name
):
    """Run a one-step Data Wrangler pipeline that ingests a CSV into a feature group.

    The CSV is uploaded to the session's default bucket, a pipeline with a
    single processing step is created and started, and the offline store is
    then queried through Athena to confirm that the row count matches the
    input file. The pipeline and the temporary flow file are removed in the
    ``finally`` block.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join(
        "s3://",
        sagemaker_session.default_bucket(),
        "py-sdk-ingestion-test-input/features.csv",
    )

    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body, desired_s3_uri=input_data_uri, sagemaker_session=sagemaker_session
        )

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
        )
    ]

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        # Close the handle explicitly so the flow file is fully written before
        # the processor is pointed at it.
        with open(temp_flow_path, "w") as flow_file:
            json.dump(ingestion_only_flow, flow_file)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(
            name="ingestion-step",
            processor=data_wrangler_processor,
            inputs=inputs,
            outputs=outputs,
        )

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(), feature_group_name
            )
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                # A waiter failure is tolerated here; the step status
                # assertions below decide whether the run succeeded.
                pass

            execution_steps = execution.list_steps()
            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
class FeatureGroupDataSet(AbstractDataSet):
    """Dataset that saves to and loads from a SageMaker Feature Store feature group.

    Saving creates the feature group (tolerating an "already exists" error) and
    ingests the first ten rows; loading runs the configured Athena query and
    returns the result as a DataFrame.
    """

    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):
        """Set up the AWS clients and the ``FeatureGroup`` handle.

        ``query`` is a format string with a ``table_name`` placeholder; it is
        filled in with the Athena table name when loading. The execution role
        ARN is looked up from the IAM role "AmazonSageMaker-ExecutionRole".
        """
        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sm_client = boto_session.client(service_name="sagemaker", region_name=region)
        runtime_client = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )
        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sm_client,
            sagemaker_featurestore_runtime_client=runtime_client,
        )

        role_info = boto3.client("iam").get_role(RoleName="AmazonSageMaker-ExecutionRole")
        role = role_info["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        """Poll every five seconds while the status is "Creating".

        Raises ``RuntimeError`` if the final status is anything but "Created".
        """

        def current_status():
            return self._feature_group.describe().get("FeatureGroupStatus")

        status = current_status()
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = current_status()

        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        """Return a dict holding the underlying feature group object."""
        return {"feature_group": self._feature_group}

    def _save(self, data):
        """Create the feature group if possible, then ingest the first ten rows."""
        group = self._feature_group
        group.load_feature_definitions(data)

        try:
            group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )
            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            # An existing group is fine; anything else propagates.
            already_exists = (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
            )
            if already_exists not in str(exc):
                raise

        group.ingest(data[:10])  # just for demo purpose

    def _load(self) -> pd.DataFrame:
        """Run the configured query through Athena and return its result frame."""
        athena_query = self._feature_group.athena_query()
        sql = self._query.format(table_name=athena_query.table_name)
        print(sql)
        athena_query.run(sql, output_location=f"{self._s3_uri}/query_results/")
        athena_query.wait()
        return athena_query.as_dataframe()