Example #1
def save_to_feature_store():
    logger.info("Save to FeatureStore started")
    global feature_group

    df_data = pd.read_csv(feature_s3_url)
    logger.info("Read data from S3: %s", df_data.head())

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime)
    # You can modify the following to use a bucket of your choosing
    logger.info("Default bucket: %s", default_bucket)

    # record identifier and event time feature names
    record_identifier_feature_name = "IDpol"
    event_time_feature_name = "EventTime"
    current_time_sec = int(round(time.time()))
    # cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
    cast_object_to_string(df_data)
    df_data[event_time_feature_name] = pd.Series([current_time_sec] *
                                                 len(df_data),
                                                 dtype="float64")

    feature_group_name = 'insurance-policy-feature-group-' + strftime(
        '%d-%H-%M-%S', gmtime())
    logger.info("Feature Group Name: %s", feature_group_name)

    # Check if the feature group already exists; create one if it doesn't.
    if not feature_group_exist(feature_group_name):
        logger.info("Feature Group: %s doesn't exist. Create a new one.",
                    feature_group_name)

        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

        # append EventTime feature
        # load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
        feature_group.load_feature_definitions(data_frame=df_data)
        # output is suppressed
        feature_group.create(
            s3_uri=f"s3://{default_bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=get_execution_role(),
            enable_online_store=True)

        wait_for_feature_group_creation_complete(feature_group=feature_group)
        feature_group.describe()
    else:
        logger.info("Feature Group: %s exits", feature_group)
        # Init feature group object if already exists
        feature_group = FeatureGroup(name=feature_group_name,
                                     sagemaker_session=feature_store_session)

    # ingest data into feature store
    feature_group.ingest(data_frame=df_data, max_workers=5, wait=True)
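
Example #1 calls two helpers that are not shown here: cast_object_to_string and feature_group_exist. The following is a minimal sketch of what they might look like, assuming the boto3 sagemaker_client already created in the same script; the bodies are illustrative guesses, not the original author's implementations.

def cast_object_to_string(data_frame):
    # Cast pandas object columns to string so the SageMaker FeatureStore SDK
    # maps them to the String feature type (illustrative implementation).
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")


def feature_group_exist(feature_group_name):
    # List feature groups whose name contains the candidate name and check
    # for an exact match (illustrative implementation).
    response = sagemaker_client.list_feature_groups(NameContains=feature_group_name)
    names = [fg["FeatureGroupName"] for fg in response["FeatureGroupSummaries"]]
    return feature_group_name in names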
Example #2
def _wait_for_feature_group_create(feature_group: FeatureGroup):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        print(feature_group.describe())
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")
Example #3
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name="input_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="input_mask", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="segment_ids", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label_id", feature_type=FeatureTypeEnum.INTEGRAL),
        FeatureDefinition(feature_name="review_id", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="date", feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="label", feature_type=FeatureTypeEnum.INTEGRAL),
        #        FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name="split_type", feature_type=FeatureTypeEnum.STRING),
    ]

    feature_group = FeatureGroup(
        name=feature_group_name, feature_definitions=feature_definitions, sagemaker_session=sagemaker_session
    )

    print("Feature Group: {}".format(feature_group))

    try:
        print(
            "Waiting for existing Feature Group to become available if it is being created by another instance in our cluster..."
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print("Before CREATE FG wait exeption: {}".format(e))
    #        pass

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print("Creating Feature Group with role {}...".format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True,
        )
        print("Creating Feature Group. Completed.")

        print("Waiting for new Feature Group to become available...")
        wait_for_feature_group_creation_complete(feature_group)
        print("Feature Group available.")
        feature_group.describe()

    except Exception as e:
        print("Exception: {}".format(e))

    return feature_group
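
Several of these examples call wait_for_feature_group_creation_complete without defining it. A minimal sketch, modeled on the _wait_for_feature_group_create helper from Example #2; treat it as an assumption about the missing helper, not the original code.

import time

def wait_for_feature_group_creation_complete(feature_group):
    # Poll DescribeFeatureGroup until creation finishes (illustrative sketch).
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")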
Example #4
def test_feature_store_describe(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.describe()
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup", next_token=None)
Example #5
def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
        _wait_for_feature_group_create(feature_group)

        resolved_output_s3_uri = (
            feature_group.describe().get("OfflineStoreConfig").get(
                "S3StorageConfig").get("ResolvedOutputS3Uri"))
        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8,
                                                   9])]["feature4"])
        for _, is_na in nans.items():
            assert is_na
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
            resolved_output_s3_uri=resolved_output_s3_uri,
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example #6
    # Create feature group
    feature_group.create(
        s3_uri='s3://{}/{}'.format(bucket, prefix),
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=role,
        enable_online_store=True,
        description="1.8M+ tokenized camera reviews from the Amazon Customer Reviews dataset",
        tags=[
            { 'Key': 'Dataset', 'Value': 'amazon customer reviews' },
            { 'Key': 'Subset', 'Value': 'cameras' },
            { 'Key': 'Owner', 'Value': 'Julien Simon' }
        ]
    )
    # Wait for feature group to be ready
    while feature_group.describe().get("FeatureGroupStatus") != 'Created':
        sleep(1)
    print('Feature group created')
    
    # Ingest data
    print('Ingesting data...')
    try:
        feature_group.ingest(data_frame=data, max_workers=max_workers, wait=True)
    except Exception:
        pass
    
    print('Waiting...')
    # Wait for 10 minutes to make sure data has flowed to the offline store
    # https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-offline.html
    sleep(600)
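
Once records are in the online store, they can be read back with the sagemaker-featurestore-runtime client. A hedged sketch; feature_group_name and the record identifier value below are placeholders, not values taken from the example above.

import boto3

featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")
response = featurestore_runtime.get_record(
    FeatureGroupName=feature_group_name,             # assumed to match the group created above
    RecordIdentifierValueAsString="R1KKOXHNI8MSXU",  # hypothetical review_id
)
print(response.get("Record"))  # list of {"FeatureName": ..., "ValueAsString": ...}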
Example #7
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime)

feature_group_names = [
    args.feature_group_name_ratings, args.feature_group_name_tracks,
    args.feature_group_name_user_preferences
]
feature_groups = []
for name in feature_group_names:
    feature_group = FeatureGroup(name=name,
                                 sagemaker_session=feature_store_session)
    feature_groups.append(feature_group)

feature_group_s3_prefixes = []
for feature_group in feature_groups:
    feature_group_table_name = feature_group.describe().get(
        "OfflineStoreConfig").get("DataCatalogConfig").get("TableName")
    feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'
    feature_group_s3_prefixes.append(feature_group_s3_prefix)


# wait for data to be added to offline feature store
def wait_for_offline_store(feature_group_s3_prefix):
    print(feature_group_s3_prefix)
    offline_store_contents = None
    while offline_store_contents is None:
        objects_in_bucket = s3_client.list_objects(
            Bucket=bucket, Prefix=feature_group_s3_prefix)
        if ('Contents' in objects_in_bucket
                and len(objects_in_bucket['Contents']) > 1):
            offline_store_contents = objects_in_bucket['Contents']
        else:
            # Data has not landed in the offline store yet; poll again
            # (assumes the surrounding script imports `time`).
            print('Waiting for data in offline store...')
            time.sleep(60)
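
A likely call site for the helper above, looping over the prefixes computed earlier; this is a sketch of the intended usage, not code from the original script.

for feature_group_s3_prefix in feature_group_s3_prefixes:
    wait_for_offline_store(feature_group_s3_prefix)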
Example #8
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for our records
    feature_definitions = [
        FeatureDefinition(feature_name='input_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='input_mask',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='segment_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label_id',
                          feature_type=FeatureTypeEnum.INTEGRAL),
        FeatureDefinition(feature_name='review_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='date',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label',
                          feature_type=FeatureTypeEnum.INTEGRAL),
        #        FeatureDefinition(feature_name='review_body', feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='split_type',
                          feature_type=FeatureTypeEnum.STRING)
    ]

    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)

    print('Feature Group: {}'.format(feature_group))

    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exception: {}'.format(e))
#        pass

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print('Creating Feature Group with role {}...'.format(role))
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=True)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')
        feature_group.describe()

    except Exception as e:
        print('Exception: {}'.format(e))


#        pass

#         print('FAILED - NOW Creating Feature Group with service-role {}...'.format('arn:aws:iam::231218423789:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole'))
#         feature_group.create(
#             s3_uri=f"s3://{bucket}/{prefix}",
#             record_identifier_name=record_identifier_feature_name,
#             event_time_feature_name=event_time_feature_name,
#             role_arn='arn:aws:iam::231218423789:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole',
#             enable_online_store=True
#         )
#         print('Creating Feature Group. Completed.')

#    feature_group.describe()

    return feature_group
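
create_or_load_feature_group relies on module-level sagemaker_session, role, and bucket variables that the snippet does not define. A minimal setup sketch using standard SageMaker SDK helpers; the original scripts may configure these differently.

import sagemaker

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()        # execution role of the notebook or job
bucket = sagemaker_session.default_bucket()  # default SageMaker bucket for the account/region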
Example #9
def create_or_load_feature_group(prefix, feature_group_name):

    # Feature Definitions for the records
    feature_definitions = [
        FeatureDefinition(feature_name='review_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='date',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='sentiment',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='label_id',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='input_ids',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='review_body',
                          feature_type=FeatureTypeEnum.STRING),
        FeatureDefinition(feature_name='split_type',
                          feature_type=FeatureTypeEnum.STRING)
    ]

    # setup the Feature Group
    feature_group = FeatureGroup(name=feature_group_name,
                                 feature_definitions=feature_definitions,
                                 sagemaker_session=sagemaker_session)

    print('Feature Group: {}'.format(feature_group))

    try:
        print(
            'Waiting for existing Feature Group to become available if it is being created by another instance in our cluster...'
        )
        wait_for_feature_group_creation_complete(feature_group)
    except Exception as e:
        print('Before CREATE FG wait exception: {}'.format(e))

    try:
        record_identifier_feature_name = "review_id"
        event_time_feature_name = "date"

        print('Creating Feature Group with role {}...'.format(role))

        # create Feature Group
        feature_group.create(
            s3_uri=f"s3://{bucket}/{prefix}",
            record_identifier_name=record_identifier_feature_name,
            event_time_feature_name=event_time_feature_name,
            role_arn=role,
            enable_online_store=False)
        print('Creating Feature Group. Completed.')

        print('Waiting for new Feature Group to become available...')
        wait_for_feature_group_creation_complete(feature_group)
        print('Feature Group available.')

        # the information about the Feature Group
        feature_group.describe()

    except Exception as e:
        print('Exception: {}'.format(e))

    return feature_group
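
A typical follow-up once the feature group is returned, shown as a sketch; df_records is a placeholder DataFrame whose columns match the feature definitions above.

feature_group = create_or_load_feature_group(prefix, feature_group_name)
# df_records: pandas DataFrame with review_id, date, sentiment, ... columns (placeholder)
feature_group.ingest(data_frame=df_records, max_workers=4, wait=True)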
Example #10
def test_feature_store_describe(sagemaker_session_mock):
    feature_group = FeatureGroup(name="MyFeatureGroup",
                                 sagemaker_session=sagemaker_session_mock)
    feature_group.describe()
    sagemaker_session_mock.describe_feature_group.assert_called_with(
        feature_group_name="MyFeatureGroup")
Example #11
class FeatureGroupDataSet(AbstractDataSet):
    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):

        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )

        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )

        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )

            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise

        self._feature_group.ingest(data[:10])  # just for demo purposes

    def _load(self) -> pd.DataFrame:
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()
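
A hedged usage sketch for the dataset class above, assuming Kedro's AbstractDataSet dispatches save() and load() to the _save() and _load() methods; the name, S3 URI, and query string are placeholders.

dataset = FeatureGroupDataSet(
    name="my-feature-group",                    # hypothetical group name
    s3_uri="s3://my-bucket/feature-store",      # hypothetical offline store location
    record_identifier_name="record_id",
    event_time_name="event_time",
    query='SELECT * FROM "{table_name}" LIMIT 100',
)
# df: pandas DataFrame containing the features to register (placeholder)
dataset.save(df)            # Kedro calls _save(): create the group if needed, then ingest a sample
df_loaded = dataset.load()  # Kedro calls _load(): run the Athena query against the offline store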
args = parser.parse_args()

region = args.region
boto3.setup_default_session(region_name=region)
s3_client = boto3.client("s3")
account_id = boto3.client("sts").get_caller_identity()["Account"]
now = pd.to_datetime("now")

feature_store_session = sagemaker.Session()
claims_feature_group = FeatureGroup(name=args.claims_feature_group_name, sagemaker_session=feature_store_session)
customers_feature_group = FeatureGroup(
    name=args.customers_feature_group_name, sagemaker_session=feature_store_session
)

claims_table_name = (
    claims_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"]["TableName"]
)
customers_table_name = (
    customers_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"]["TableName"]
)
athena_database_name = customers_feature_group.describe()["OfflineStoreConfig"]["DataCatalogConfig"]["Database"]

print(f'claims_table_name: {claims_table_name}')
print(f'customers_table_name: {customers_table_name}')

claims_feature_group_s3_prefix = f'{args.bucket_prefix}/{account_id}/sagemaker/{region}/offline-store/{claims_table_name}/data/year={now.year}/month={now.strftime("%m")}/day={now.strftime("%d")}'
customers_feature_group_s3_prefix = f'{args.bucket_prefix}/{account_id}/sagemaker/{region}/offline-store/{customers_table_name}/data/year={now.year}/month={now.strftime("%m")}/day={now.strftime("%d")}'

print(f'claims_feature_group_s3_prefix: {claims_feature_group_s3_prefix}')
print(f'customers_feature_group_s3_prefix: {customers_feature_group_s3_prefix}')
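
With the table and database names resolved, a plausible next step is to query the two offline stores through Athena via the SDK, as the integration test above does. The join key and the Athena output bucket below are hypothetical.

athena_query = customers_feature_group.athena_query()
query_string = (
    f'SELECT * FROM "{customers_table_name}" c '
    f'JOIN "{claims_table_name}" cl ON c.policy_id = cl.policy_id'  # hypothetical join key
)
athena_query.run(
    query_string=query_string,
    output_location="s3://my-athena-results-bucket/query_results",  # hypothetical results bucket
)
athena_query.wait()
joined_df = athena_query.as_dataframe()
print(joined_df.shape)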