def test_create_feature_store(
    feature_store_session,
    role,
    feature_group_name,
    offline_store_s3_uri,
    pandas_data_frame,
    record,
    create_table_ddl,
):
    feature_group = FeatureGroup(name=feature_group_name,
                                 sagemaker_session=feature_store_session)
    feature_group.load_feature_definitions(data_frame=pandas_data_frame)

    with cleanup_feature_group(feature_group):
        output = feature_group.create(
            s3_uri=offline_store_s3_uri,
            record_identifier_name="feature1",
            event_time_feature_name="feature3",
            role_arn=role,
            enable_online_store=True,
        )
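        # create() only starts creation; poll until the group reaches "Created".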
        _wait_for_feature_group_create(feature_group)

        # Ingest data
        feature_group.put_record(record=record)
        ingestion_manager = feature_group.ingest(data_frame=pandas_data_frame,
                                                 max_workers=3,
                                                 wait=False)
        ingestion_manager.wait()
        assert 0 == len(ingestion_manager.failed_rows)

        # Query the integrated Glue table.
        athena_query = feature_group.athena_query()
        df = DataFrame()
        with timeout(minutes=10):
            while df.shape[0] < 11:
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")
                df = athena_query.as_dataframe()
                print(f"Found {df.shape[0]} records.")
                time.sleep(60)

        assert df.shape[0] == 11
        nans = pd.isna(df.loc[df["feature1"].isin([5, 6, 7, 8,
                                                   9])]["feature4"])
        # Series.items() yields (index, value) pairs; unpack so the assert
        # checks the value rather than an always-truthy tuple.
        for _, is_na in nans.items():
            assert is_na
        assert (create_table_ddl.format(
            feature_group_name=feature_group_name,
            region=feature_store_session.boto_session.region_name,
            account=feature_store_session.account_id(),
        ) == feature_group.as_hive_ddl())
    assert output["FeatureGroupArn"].endswith(
        f"feature-group/{feature_group_name}")
Example 2

import boto3
import pandas as pd
import sagemaker

from sagemaker.feature_store.feature_group import FeatureGroup

# The original snippet assumes `boto_session` and `bucket` are defined earlier
# in the processing script; sensible defaults are filled in here.
boto_session = boto3.Session()
sagemaker_client = boto_session.client(service_name='sagemaker')
featurestore_client = boto_session.client(
    service_name='sagemaker-featurestore-runtime')
session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_client)
bucket = session.default_bucket()

# Read the feature group name passed to the processing job as an input file.
with open('/opt/ml/processing/input/feature_group_name.txt') as f:
    feature_group_name = f.read()

feature_group = FeatureGroup(name=feature_group_name,
                             sagemaker_session=session)

feature_group_query = feature_group.athena_query()
feature_group_table = feature_group_query.table_name
print(feature_group_table)

# Keep only reviews of products that have more than 1000 reviews.
query_string = (
    f'SELECT label, review_body FROM "{feature_group_table}" '
    'INNER JOIN (SELECT product_id FROM '
    '(SELECT product_id, avg(star_rating) AS avg_rating, count(*) AS review_count '
    f'FROM "{feature_group_table}" GROUP BY product_id) '
    'WHERE review_count > 1000) tmp '
    f'ON "{feature_group_table}".product_id = tmp.product_id;'
)
print(query_string)

dataset = pd.DataFrame()
feature_group_query.run(query_string=query_string,
                        output_location='s3://' + bucket + '/query_results/')
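
The excerpt above stops right after starting the Athena query. A continuation sketch, assuming the same feature_group_query object, to actually collect the results:

# Not part of the original excerpt: block until the query finishes,
# then load the result set into a pandas DataFrame.
feature_group_query.wait()
dataset = feature_group_query.as_dataframe()
print(dataset.shape)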

Example 3

def test_one_step_ingestion_pipeline(sagemaker_session, feature_store_session,
                                     feature_definitions, role, pipeline_name):
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType",
                                    default_value="ml.m5.4xlarge")

    input_name = "features.csv"
    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
    input_data_uri = os.path.join("s3://", sagemaker_session.default_bucket(),
                                  "py-sdk-ingestion-test-input/features.csv")
    with open(input_file_path, "r") as data:
        body = data.read()
        S3Uploader.upload_string_as_file_body(
            body=body,
            desired_s3_uri=input_data_uri,
            sagemaker_session=sagemaker_session)

    inputs = [
        ProcessingInput(
            input_name=input_name,
            source=input_data_uri,
            destination="/opt/ml/processing/features.csv",
        )
    ]

    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
    feature_group = FeatureGroup(
        name=feature_group_name,
        feature_definitions=feature_definitions,
        sagemaker_session=feature_store_session,
    )

    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
        input_name,
        input_data_uri,
        s3_content_type="csv",
        s3_has_header=True,
    )

    outputs = [
        ProcessingOutput(
            output_name=output_name,
            app_managed=True,
            feature_store_output=FeatureStoreOutput(
                feature_group_name=feature_group_name),
        )
    ]
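    # Note: app_managed=True means the Data Wrangler job writes its output
    # directly to the feature group named above instead of SageMaker uploading
    # files from the container.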

    temp_flow_path = "./ingestion.flow"
    with cleanup_feature_group(feature_group):
        with open(temp_flow_path, "w") as f:
            json.dump(ingestion_only_flow, f)

        data_wrangler_processor = DataWranglerProcessor(
            role=role,
            data_wrangler_flow_source=temp_flow_path,
            instance_count=instance_count,
            instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            max_runtime_in_seconds=86400,
        )

        data_wrangler_step = ProcessingStep(name="ingestion-step",
                                            processor=data_wrangler_processor,
                                            inputs=inputs,
                                            outputs=outputs)

        pipeline = Pipeline(
            name=pipeline_name,
            parameters=[instance_count, instance_type],
            steps=[data_wrangler_step],
            sagemaker_session=sagemaker_session,
        )

        try:
            response = pipeline.create(role)
            create_arn = response["PipelineArn"]

            offline_store_s3_uri = os.path.join(
                "s3://", sagemaker_session.default_bucket(),
                feature_group_name)
            feature_group.create(
                s3_uri=offline_store_s3_uri,
                record_identifier_name="f11",
                event_time_feature_name="f10",
                role_arn=role,
                enable_online_store=False,
            )
            _wait_for_feature_group_create(feature_group)

            execution = pipeline.start()
            response = execution.describe()
            assert response["PipelineArn"] == create_arn

            try:
                execution.wait(delay=60, max_attempts=10)
            except WaiterError:
                pass

            execution_steps = execution.list_steps()

            assert len(execution_steps) == 1
            assert execution_steps[0]["StepName"] == "ingestion-step"
            assert execution_steps[0]["StepStatus"] == "Succeeded"

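            # Validate the offline store: the Athena query below should return
            # as many rows as the input CSV.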
            athena_query = feature_group.athena_query()
            with timeout(minutes=10):
                athena_query.run(
                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
                    output_location=f"{offline_store_s3_uri}/query_results",
                )
                athena_query.wait()
                assert "SUCCEEDED" == athena_query.get_query_execution().get(
                    "QueryExecution").get("Status").get("State")

                df = athena_query.as_dataframe()
                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
        finally:
            try:
                pipeline.delete()
            except Exception as e:
                print(f"Delete pipeline failed with error: {e}")
            os.remove(temp_flow_path)
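
The parameters declared at the top (InstanceCount, InstanceType) only run with their defaults above. As a sketch, Pipeline.start also accepts per-execution overrides (the values here are purely illustrative):

execution = pipeline.start(
    parameters={"InstanceCount": 2, "InstanceType": "ml.m5.2xlarge"}
)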

Example 4

# Imports assumed for this excerpt (AbstractDataSet here is Kedro's dataset base class).
import logging
import time

import boto3
import pandas as pd

from kedro.io import AbstractDataSet
from sagemaker import Session
from sagemaker.feature_store.feature_group import FeatureGroup

logger = logging.getLogger(__name__)

class FeatureGroupDataSet(AbstractDataSet):
    def __init__(
        self,
        name: str,
        s3_uri: str,
        record_identifier_name: str,
        event_time_name: str,
        query: str,
        description: str = None,
    ):

        region = boto3.Session().region_name
        boto_session = boto3.Session(region_name=region)

        sagemaker_client = boto_session.client(
            service_name="sagemaker", region_name=region
        )
        featurestore_runtime = boto_session.client(
            service_name="sagemaker-featurestore-runtime", region_name=region
        )

        feature_store_session = Session(
            boto_session=boto_session,
            sagemaker_client=sagemaker_client,
            sagemaker_featurestore_runtime_client=featurestore_runtime,
        )

        iam = boto3.client("iam")
        role = iam.get_role(RoleName="AmazonSageMaker-ExecutionRole")["Role"]["Arn"]

        # you can also suffix the feature group name with pipeline git version
        self._feature_group = FeatureGroup(
            name=name, sagemaker_session=feature_store_session
        )
        self._description = description
        self._s3_uri = s3_uri
        self._role = role
        self._record_identifier_name = record_identifier_name
        self._event_time_name = event_time_name
        self._query = query

    def _wait_for_feature_group_creation_complete(self):
        status = self._feature_group.describe().get("FeatureGroupStatus")
        while status == "Creating":
            logger.info("Waiting for Feature Group Creation")
            time.sleep(5)
            status = self._feature_group.describe().get("FeatureGroupStatus")
        if status != "Created":
            raise RuntimeError(
                f"Failed to create feature group {self._feature_group.name}"
            )
        logger.info("FeatureGroup %s successfully created.", self._feature_group.name)

    def _describe(self):
        return dict(feature_group=self._feature_group)

    def _save(self, data):
        self._feature_group.load_feature_definitions(data)
        try:
            self._feature_group.create(
                description=self._description,
                s3_uri=self._s3_uri,
                record_identifier_name=self._record_identifier_name,
                event_time_feature_name=self._event_time_name,
                role_arn=self._role,
                enable_online_store=True,
            )

            self._wait_for_feature_group_creation_complete()
        except Exception as exc:
            if (
                f"Resource Already Exists: FeatureGroup with name {self._feature_group.name} already exists"
                in str(exc)
            ):
                pass
            else:
                raise

        self._feature_group.ingest(data[:10])  # just for demo purpose

    def _load(self) -> pd.DataFrame:
        query = self._feature_group.athena_query()
        print(self._query.format(table_name=query.table_name))
        query.run(
            self._query.format(table_name=query.table_name),
            output_location=f"{self._s3_uri}/query_results/",
        )
        query.wait()
        return query.as_dataframe()
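
A quick usage sketch for the dataset above. The feature group name, S3 location, column names and query are made up for illustration; in a real Kedro project the dataset would normally be declared in the data catalog rather than constructed by hand:

dataset = FeatureGroupDataSet(
    name="customers-feature-group",             # hypothetical feature group
    s3_uri="s3://my-bucket/feature-store",      # hypothetical offline store URI
    record_identifier_name="customer_id",
    event_time_name="event_time",
    query='SELECT * FROM "{table_name}" LIMIT 100',
)
df_in = pd.read_csv("customers.csv")            # must contain customer_id and event_time
dataset.save(df_in)                             # creates the group if needed, then ingests
df_out = dataset.load()                         # runs the Athena query, returns a DataFrame
print(df_out.head())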