Beispiel #1
0
    def test_from_csv_staging_location_not_specified(self):
        """Check that ``Importer.from_csv`` rejects a missing or non-GCS staging location.

        Two scenarios are exercised against the same local CSV file:

        1. ``staging_location`` is omitted -- a ``ValueError`` matching
           "Specify staging_location for importing local file/dataframe"
           is expected.
        2. ``staging_location`` is the local path ``/home`` -- a
           ``ValueError`` matching "Staging location must be in GCS" is
           expected.
        """
        # Arguments shared by both scenarios. Built outside the
        # ``pytest.raises`` blocks so that only the call under test can
        # satisfy the expected exception.
        common_kwargs = {
            "path": "tests/data/driver_features.csv",
            "entity": "driver",
            "granularity": Granularity.DAY,
            "owner": "*****@*****.**",
            "feature_columns": ["avg_distance_completed",
                                "avg_customer_distance_completed"],
            "timestamp_column": "ts",
        }

        # Scenario 1: no staging location given for a local file.
        with pytest.raises(ValueError,
                           match="Specify staging_location for importing local file/dataframe"):
            Importer.from_csv(**common_kwargs)

        # Scenario 2: staging location given, but it is not a GCS path.
        with pytest.raises(ValueError,
                           match="Staging location must be in GCS"):
            Importer.from_csv(staging_location="/home", **common_kwargs)
Beispiel #2
0
 def test_from_csv_id_column_not_specified(self):
     """Expect a ``ValueError`` when ``from_csv`` is called without ``id_column``.

     The expected message must match "Column with name driver is not found".
     NOTE(review): presumably the importer falls back to a column named
     after the entity when no id column is given -- confirm against
     ``Importer.from_csv``.
     """
     with pytest.raises(ValueError,
                        match="Column with name driver is not found"):
         source_csv = "tests/data/driver_features.csv"
         columns = [
             "avg_distance_completed",
             "avg_customer_distance_completed",
         ]
         Importer.from_csv(
             path=source_csv,
             entity="driver",
             granularity=Granularity.DAY,
             owner="*****@*****.**",
             staging_location="gs://test-bucket",
             feature_columns=columns,
             timestamp_column="ts",
         )
Beispiel #3
0
 def test_from_csv_staging_location_not_valid(self):
     """Expect a ``ValueError`` when the staging location is not a GCS path.

     ``staging_location`` is set to the local path ``/home``; the expected
     error message must match "Staging location must be in GCS".
     """
     # Setup lives outside the ``pytest.raises`` block so that only the
     # ``from_csv`` call itself can satisfy the expected exception.
     feature_columns = [
         "avg_distance_completed", "avg_customer_distance_completed"
     ]
     csv_path = "tests/data/driver_features.csv"

     # The exception info object is not inspected, so it is not bound.
     with pytest.raises(ValueError,
                        match="Staging location must be in GCS"):
         Importer.from_csv(path=csv_path,
                           entity="driver",
                           owner="*****@*****.**",
                           staging_location="/home",
                           feature_columns=feature_columns,
                           timestamp_column="ts")
Beispiel #4
0
    def test_from_csv_timestamp_column_not_specified(self):
        """Build an importer from CSV without a timestamp column and validate it.

        The importer is created with an explicit id column and feature
        columns but no ``timestamp_column``; the result is then checked by
        ``_validate_csv_importer`` with the same arguments.
        """
        csv_path = "tests/data/driver_features.csv"
        entity_name = "driver"
        granularity = Granularity.DAY
        owner = "*****@*****.**"

        # Keyword arguments passed unchanged to both the factory and the
        # validation helper.
        shared_kwargs = {
            "staging_location": "gs://test-bucket",
            "id_column": "driver_id",
            "feature_columns": [
                "avg_distance_completed",
                "avg_customer_distance_completed",
                "avg_distance_cancelled",
            ],
        }

        importer = Importer.from_csv(
            path=csv_path,
            entity=entity_name,
            granularity=granularity,
            owner=owner,
            **shared_kwargs
        )

        self._validate_csv_importer(
            importer, csv_path, entity_name, granularity, owner,
            **shared_kwargs
        )
Beispiel #5
0
    def test_run_job_no_staging(self, client, mocker):
        """Run an importer that needs no staging and check the returned job id.

        The job service stub's ``SubmitJob`` is patched to return a canned
        response, so ``client.run`` is expected to hand back that
        response's job id.
        """
        expected_job_id = "myjob12312"

        # Stub whose SubmitJob is replaced by a mock returning a fixed response.
        stub = jobs.JobServiceStub(grpc.insecure_channel(""))
        canned_response = JobServiceTypes.SubmitImportJobResponse(
            jobId=expected_job_id)
        mocker.patch.object(stub, 'SubmitJob', return_value=canned_response)
        client._job_service_stub = stub

        # Importer explicitly flagged as not requiring staging.
        importer = Importer(
            {"import": ImportSpec()},
            None,
            {"require_staging": False},
        )

        assert client.run(importer) == expected_job_id
Beispiel #6
0
 def test_stage_df_without_timestamp(self, mocker):
     """Stage an importer built from CSV without a timestamp column.

     ``feast.sdk.importer.df_to_gcs`` is patched to return ``True``.
     The test contains no explicit assertion: it passes as long as
     building the importer and calling ``stage`` raise nothing.
     """
     mocker.patch("feast.sdk.importer.df_to_gcs", return_value=True)

     columns = [
         "avg_distance_completed",
         "avg_customer_distance_completed",
         "avg_distance_cancelled",
     ]
     importer = Importer.from_csv(
         path="tests/data/driver_features.csv",
         entity="driver",
         owner="*****@*****.**",
         staging_location="gs://test-bucket",
         id_column="driver_id",
         feature_columns=columns,
     )

     importer.stage(None)
Beispiel #7
0
    def test_from_csv_feature_columns_not_specified(self):
        """Build an importer from CSV without feature columns and validate it.

        Only the id and timestamp columns are named; ``feature_columns`` is
        omitted from both the factory call and the validation helper.
        """
        csv_path = "tests/data/driver_features.csv"
        entity_name = "driver"
        granularity = Granularity.DAY
        owner = "*****@*****.**"

        # Keyword arguments passed unchanged to both the factory and the
        # validation helper.
        shared_kwargs = {
            "staging_location": "gs://test-bucket",
            "id_column": "driver_id",
            "timestamp_column": "ts",
        }

        importer = Importer.from_csv(
            path=csv_path,
            entity=entity_name,
            granularity=granularity,
            owner=owner,
            **shared_kwargs
        )

        self._validate_csv_importer(
            importer, csv_path, entity_name, granularity, owner,
            **shared_kwargs
        )
Beispiel #8
0
    def test_from_df(self):
        """Build an importer from a DataFrame and check its properties and spec.

        The DataFrame is read from the driver features CSV. The test then
        checks that the importer requires staging, that its remote path
        contains ``<staging_location>/tmp_<entity>``, that every feature is
        named after a DataFrame column with id ``driver.day.<name>``, and
        that the import spec (type, options, entities, schema) matches.
        """
        csv_path = "tests/data/driver_features.csv"
        df = pd.read_csv(csv_path)
        staging_location = "gs://test-bucket"
        entity = "driver"

        importer = Importer.from_df(df=df,
                                    entity=entity,
                                    granularity=Granularity.DAY,
                                    owner="*****@*****.**",
                                    staging_location=staging_location,
                                    id_column="driver_id",
                                    timestamp_column="ts")

        # Truthiness check rather than ``== True`` (PEP 8).
        assert importer.require_staging
        assert ("{}/tmp_{}".format(staging_location, entity)
                in importer.remote_path)
        for feature in importer.features.values():
            assert feature.name in df.columns
            assert feature.id == "driver.day." + feature.name

        import_spec = importer.spec
        assert import_spec.type == "file"
        assert import_spec.options == {
            "format": "csv",
            "path": importer.remote_path
        }
        assert import_spec.entities == ["driver"]

        schema = import_spec.schema
        assert schema.entityIdColumn == "driver_id"
        # NOTE(review): if ``schema`` is a protobuf message, this field is
        # presumably never None and the check is vacuous -- confirm whether
        # ``timestampColumn == "ts"`` was the intended assertion.
        assert schema.timestampValue is not None
        feature_columns = [
            "completed", "avg_distance_completed",
            "avg_customer_distance_completed", "avg_distance_cancelled"
        ]
        # Schema fields are compared pairwise with the DataFrame columns;
        # only the columns listed above are expected to carry a feature id.
        for col, field in zip(df.columns.values, schema.fields):
            assert col == field.name
            if col in feature_columns:
                assert field.featureId == "driver.day." + col
Beispiel #9
0
    def test_from_csv(self):
        """Build an importer from CSV with all columns named and validate it.

        The id column, feature columns and timestamp column are all given
        explicitly; the resulting importer is passed to
        ``_validate_csv_importer`` together with the same values.
        """
        csv_path = "tests/data/driver_features.csv"
        entity_name = "driver"
        owner = "*****@*****.**"
        staging_location = "gs://test-bucket"
        id_column = "driver_id"
        timestamp_column = "ts"
        feature_columns = [
            "avg_distance_completed",
            "avg_customer_distance_completed",
        ]

        importer = Importer.from_csv(
            path=csv_path,
            entity=entity_name,
            owner=owner,
            staging_location=staging_location,
            id_column=id_column,
            feature_columns=feature_columns,
            timestamp_column=timestamp_column,
        )

        # Arguments are passed positionally, in the same order as before.
        self._validate_csv_importer(
            importer,
            csv_path,
            entity_name,
            owner,
            staging_location,
            id_column,
            feature_columns,
            timestamp_column,
        )