def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        feature_group.ingest(data_frame=pandas_data_frame,
                             max_workers=3,
                             max_processes=2,
                             wait=True)

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")


def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(
        data_frame=pandas_data_frame_without_string)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame_without_string,
            max_workers=3,
            wait=False)
        ingestion_manager.wait()

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")


def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8,
                                                   9])]["feature4"])
        # Series.items() yields (index, value) pairs; unpack to test the value,
        # otherwise the assert checks a tuple and always passes
        for _, is_na in nans.items():
            assert is_na
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example #4
def save_to_feature_store():
    logger.info("Save to FeatureStore started")
    global feature_group

    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime)
    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))
    # cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    df_data[event_time_feature_name] = pd.Series([current_time_sec] *
                                                 len(df_data),
                                                 dtype="float64")

    feature_group_name = 'insurance-policy-feature-group-' + strftime(
        '%d-%H-%M-%S', gmtime())
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if the feature group already exists; create it if it doesn't.
    if not feature_group_exist(feature_group_name):
        logger.info("Feature Group: %s doesn't exist. Creating a new one.",
                    feature_group_name)

        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

        # append EventTime feature
        # load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
        feature_group.load_feature_definitions(data_frame=df_data)
        # output is suppressed
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True)

        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        logger.info("Feature Group: %s exists", feature_group_name)
        # Init feature group object if already exists
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
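
This snippet leans on two helpers that aren't shown, cast_object_to_string and feature_group_exist. Minimal sketches of what they plausibly look like (the names come from the snippet; the bodies are assumptions):

def cast_object_to_string(data_frame):
    # Cast object columns to the pandas "string" dtype so the SDK maps them to String features
    for label in data_frame.columns:
        if data_frame.dtypes[label] == object:
            data_frame[label] = data_frame[label].astype("str").astype("string")

def feature_group_exist(feature_group_name):
    # True if DescribeFeatureGroup succeeds, False if the group is missing
    try:
        sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        return True
    except sagemaker_client.exceptions.ResourceNotFound:
        return False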
Example #5
def test_load_feature_definition_unsupported_types(sagemaker_session_mock):
    feature_group = FeatureGroup(name="FailedGroup",
                                 sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame({
        "float": pd.Series([2.0], dtype="float64"),
        "int": pd.Series([2], dtype="int64"),
        "object": pd.Series(["f1"], dtype="object"),
    })
    with pytest.raises(ValueError) as error:
        feature_group.load_feature_definitions(data_frame=df)
    assert "Failed to infer Feature type based on dtype object for column object." in str(
        error)
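
As the message says, object-dtype columns can't be mapped to a feature type automatically. The usual fix, shown as cast_object_to_string in Example #4, is to cast such columns to the pandas "string" dtype before calling load_feature_definitions, e.g. df["object"] = df["object"].astype("string").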
Example #6
def test_create_feature_store_online_only(
    feature_store_session,
    role,
    feature_group_name,
    pandas_data_frame,
):
    feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=False,  # disables the offline store: this creates an online-only group
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

    assert output["FeatureGroupArn"].endswith(f"feature-group/{feature_group_name}")
Example #7
def test_load_feature_definition(sagemaker_session_mock):
    feature_group = FeatureGroup(name="SomeGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(
        {
            "float": pd.Series([2.0], dtype="float64"),
            "int": pd.Series([2], dtype="int64"),
            "string": pd.Series(["f1"], dtype="string"),
        }
    )
    feature_definitions = feature_group.load_feature_definitions(data_frame=df)
    names = [fd.feature_name for fd in feature_definitions]
    types = [fd.feature_type for fd in feature_definitions]
    assert names == ["float", "int", "string"]
    assert types == [FeatureTypeEnum.FRACTIONAL, FeatureTypeEnum.INTEGRAL, FeatureTypeEnum.STRING]
Example #8
feature_group = FeatureGroup(name=fg_name, sagemaker_session=session)
# Define the name of the column storing a unique record id (e.g. primary key)
record_identifier_feature_name = 'review_id'
# Add a column to store feature timestamps
event_time_feature_name = 'event_time'
current_time_sec = int(round(time()))
data = data.assign(event_time=current_time_sec)
# Set the correct type for each column
data['review_id']     = data['review_id'].astype('str').astype('string')
data['product_id']    = data['product_id'].astype('str').astype('string')
data['review_body']   = data['review_body'].astype('str').astype('string')
data['label']         = data['label'].astype('str').astype('string')
data['star_rating']   = data['star_rating'].astype('int64')
data['event_time']    = data['event_time'].astype('float64')
# Load feature definitions
feature_group.load_feature_definitions(data_frame=data)
# Create feature group
feature_group.create(
    s3_uri='s3://{}/{}'.format(bucket, prefix),
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
    description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset",
    tags=[
        { 'Key': 'Dataset', 'Value': 'amazon customer reviews' },
        { 'Key': 'Subset', 'Value': 'cameras' },
        { 'Key': 'Owner', 'Value': 'Julien Simon' }
    ]
)
# Wait for feature group to be ready
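
The excerpt breaks off at the wait step. A minimal continuation in the same style, assuming sleep is imported from the time module alongside time():

from time import sleep

while feature_group.describe().get('FeatureGroupStatus') == 'Creating':
    sleep(5)
# Once the group is ready, load the prepared records
feature_group.ingest(data_frame=data, max_workers=3, wait=True)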
Example #9
class FeatureGroupDataSet(AbstractDataSet):
    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):

        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )

        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )

        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )

            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise

        self._feature_group.ingest(data[:10])  # only the first 10 rows, for demo purposes

    def _load(self) -> pd.DataFrame:
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()
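
A usage sketch with illustrative names; Kedro's AbstractDataSet exposes public save()/load() wrappers that delegate to the _save/_load hooks above:

dataset = FeatureGroupDataSet(
    name="reviews-feature-group",
    s3_uri="s3://my-bucket/feature-store",
    record_identifier_name="review_id",
    event_time_name="event_time",
    query='SELECT * FROM "{table_name}" LIMIT 10',
)
dataset.save(df)            # creates the group if needed, then ingests a sample
df_sample = dataset.load()  # runs the Athena query, returns a DataFrame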