Code Example #1
def test_ingest_multi_process(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        feature_group.ingest(data_frame=pandas_data_frame,
                             max_workers=3,
                             max_processes=2,
                             wait=True)

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Code Example #2
def test_ingest_with_profile_name(ingestion_manager_init,
                                  sagemaker_session_mock,
                                  fs_runtime_client_config_mock):
    sagemaker_session_mock.sagemaker_featurestore_runtime_client.meta.config = (
        fs_runtime_client_config_mock)

    feature_group = FeatureGroup(name="MyGroup",
                                 sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(
        dict((f"float{i}", pd.Series([2.0], dtype="float64"))
             for i in range(300)))

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df,
                         max_workers=10,
                         profile_name="profile_name")

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_fs_runtime_client_config=fs_runtime_client_config_mock,
        max_workers=10,
        max_processes=1,
        profile_name="profile_name",
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(data_frame=df,
                                                                wait=True,
                                                                timeout=None)
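The fixtures used here (`ingestion_manager_init`, `sagemaker_session_mock`, `fs_runtime_client_config_mock`) are not shown in this listing. A plausible minimal sketch follows; the patch target `sagemaker.feature_store.feature_group.IngestionManagerPandas` is an assumption about where the manager class lives, so adjust it to your SDK version:

import pytest
from unittest.mock import Mock, patch

@pytest.fixture
def sagemaker_session_mock():
    return Mock()

@pytest.fixture
def fs_runtime_client_config_mock():
    return Mock()

@pytest.fixture
def ingestion_manager_init():
    # Patch the ingestion manager class so ingest() never spawns real workers.
    with patch("sagemaker.feature_store.feature_group.IngestionManagerPandas") as init:
        yield init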
Code Example #3
def test_ingest_zero_workers(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = Mock()
    with pytest.raises(RuntimeError) as error:
        feature_group.ingest(data_frame=df, max_workers=0, max_processes=1)

    assert "max_workers must be greater than 0." in str(error)
Code Example #4
def save_to_feature_store():
    logger.info("Save to FeatureStore started")
    global feature_group

    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime)
    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))
    # cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    df_data[event_time_feature_name] = pd.Series([current_time_sec] *
                                                 len(df_data),
                                                 dtype="float64")

    feature_group_name = 'insurance-policy-feature-group-' + strftime(
        '%d-%H-%M-%S', gmtime())
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if feature group already exists. Create a feature group if doesn't exist.
    if not feature_group_exist(feature_group_name):
        logger.info("Feature Group: %s doesn't exist. Create a new one.",
                    feature_group_name)

        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

        # append EventTime feature
        # load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
        feature_group.load_feature_definitions(data_frame=df_data)
        # output is suppressed
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True)

        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        logger.info("Feature Group: %s exits", feature_group)
        # Init feature group object if already exists
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
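Two helpers above, `cast_object_to_string` and `feature_group_exist`, are assumed to be defined elsewhere in the script. A minimal sketch of what they might look like (the ListFeatureGroups-based existence check is an assumption):

def cast_object_to_string(data_frame):
    # FeatureStore maps the pandas "string" dtype to the String feature type,
    # so convert object-dtype columns before loading feature definitions.
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")

def feature_group_exist(feature_group_name):
    # Exact-match lookup via the ListFeatureGroups API.
    response = sagemaker_client.list_feature_groups(NameContains=feature_group_name)
    names = [fg["FeatureGroupName"] for fg in response["FeatureGroupSummaries"]]
    return feature_group_name in names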
Code Example #5
def test_ingest_default_max_workers(ingestion_manager_init, sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame({"float": pd.Series([2.0], dtype="float64")})

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=df,
        max_workers=1,
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(wait=True, timeout=None)
Code Example #6
def test_ingest(ingestion_manager_init, sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyGroup", sagemaker_session=sagemaker_session_mock)
    df = pd.DataFrame(dict((f"float{i}", pd.Series([2.0], dtype="float64")) for i in range(300)))

    mock_ingestion_manager_instance = Mock()
    ingestion_manager_init.return_value = mock_ingestion_manager_instance
    feature_group.ingest(data_frame=df, max_workers=10)

    ingestion_manager_init.assert_called_once_with(
        feature_group_name="MyGroup",
        sagemaker_session=sagemaker_session_mock,
        data_frame=df,
        max_workers=10,
    )
    mock_ingestion_manager_instance.run.assert_called_once_with(wait=True, timeout=None)
Code Example #7
def test_ingest_without_string_feature(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame_without_string,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(
        data_frame=pandas_data_frame_without_string)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature2",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        ingestion_manager = feature_group.ingest(
            data_frame=pandas_data_frame_without_string,
            max_workers=3,
            wait=False)
        ingestion_manager.wait()

    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Code Example #8
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        # feature4 should be NaN for the records with feature1 in 5..9.
        # Note: iterating .items() yields (index, value) pairs; asserting the
        # pair itself would always pass, so unpack the value.
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8, 9])]["feature4"])
        for _, is_na in nans.items():
            assert is_na
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Code Example #9
        role_arn=role,
        enable_online_store=True,
        description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset",
        tags=[
            { 'Key': 'Dataset', 'Value': 'amazon customer reviews' },
            { 'Key': 'Subset', 'Value': 'cameras' },
            { 'Key': 'Owner', 'Value': 'Julien Simon' }
        ]
    )
    # Wait for feature group to be ready
    while feature_group.describe().get("FeatureGroupStatus") != 'Created':
        sleep(1)
    print('Feature group created')
    
    # Ingest data
    print('Ingesting data...')
    try:
        feature_group.ingest(data_frame=data, max_workers=max_workers, wait=True)
    except Exception:
        # Ingestion errors are deliberately ignored here; the job still waits
        # for the offline store sync below.
        pass
    
    print('Waiting...')
    # Wait for 10 minutes to make sure data has flowed to the offline store
    # https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-offline.html
    sleep(600)

    # Save feature group name
    with open('/opt/ml/processing/output/feature_group_name.txt', 'w') as f:
        f.write(fg_name)
    
    print('Job complete')
Code Example #10
class FeatureGroupDataSet(AbstractDataSet):
    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):
        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )

        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )

        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )

            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise

        self._feature_group.ingest(data_frame=data[:10])  # only the first 10 rows, for demo purposes

    def _load(self) -> pd.DataFrame:
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()
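For context, a hypothetical way to wire this dataset up (all names, URIs, and the query template below are illustrative, not from the source):

dataset = FeatureGroupDataSet(
    name="my-feature-group",
    s3_uri="s3://my-bucket/feature-store",
    record_identifier_name="record_id",
    event_time_name="event_time",
    query='SELECT * FROM "{table_name}" LIMIT 100',
)
dataset._save(df)         # create the group if needed, then ingest a sample
df_out = dataset._load()  # run the Athena query and return a DataFrame

In Kedro, `_save` and `_load` would normally be invoked through the public `save()`/`load()` wrappers of `AbstractDataSet` rather than called directly.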