def test_from_csv_staging_location_not_specified(self):
    """Check that ``Importer.from_csv`` rejects a missing or non-GCS staging location.

    Two calls are made with the same CSV fixture and feature columns:

    * without ``staging_location`` -- expects a ``ValueError`` matching
      "Specify staging_location for importing local file/dataframe";
    * with ``staging_location="/home"`` -- expects a ``ValueError`` matching
      "Staging location must be in GCS".
    """
    # Shared arguments are built outside the ``pytest.raises`` blocks so that
    # only the call under test can satisfy the expected exception.
    common_kwargs = {
        "path": "tests/data/driver_features.csv",
        "entity": "driver",
        "granularity": Granularity.DAY,
        "owner": "*****@*****.**",
        "feature_columns": [
            "avg_distance_completed",
            "avg_customer_distance_completed",
        ],
        "timestamp_column": "ts",
    }

    with pytest.raises(
            ValueError,
            match="Specify staging_location for importing local file/dataframe"):
        Importer.from_csv(**common_kwargs)

    with pytest.raises(ValueError, match="Staging location must be in GCS"):
        Importer.from_csv(staging_location="/home", **common_kwargs)
def test_from_csv_id_column_not_specified(self):
    """Expect ``Importer.from_csv`` to raise when no ``id_column`` is passed.

    The call supplies a GCS staging location, feature columns and a
    timestamp column but no ``id_column``; the expected ``ValueError``
    message matches "Column with name driver is not found".
    """
    source_csv = "tests/data/driver_features.csv"
    wanted_features = [
        "avg_distance_completed",
        "avg_customer_distance_completed",
    ]

    with pytest.raises(ValueError,
                       match="Column with name driver is not found"):
        Importer.from_csv(
            path=source_csv,
            entity="driver",
            granularity=Granularity.DAY,
            owner="*****@*****.**",
            staging_location="gs://test-bucket",
            feature_columns=wanted_features,
            timestamp_column="ts",
        )
def test_from_csv_staging_location_not_valid(self):
    """Check that ``Importer.from_csv`` rejects a non-GCS staging location.

    ``staging_location`` is set to the local path "/home"; the call is
    expected to raise a ``ValueError`` matching
    "Staging location must be in GCS".
    """
    # Inputs are prepared before entering ``pytest.raises`` so the block
    # contains only the call that is expected to fail.
    csv_path = "tests/data/driver_features.csv"
    feature_columns = [
        "avg_distance_completed",
        "avg_customer_distance_completed",
    ]

    with pytest.raises(ValueError, match="Staging location must be in GCS"):
        Importer.from_csv(
            path=csv_path,
            entity="driver",
            owner="*****@*****.**",
            staging_location="/home",
            feature_columns=feature_columns,
            timestamp_column="ts",
        )
def test_from_csv_timestamp_column_not_specified(self):
    """Build a CSV importer without ``timestamp_column`` and validate it.

    The importer is created from the driver features fixture with an
    explicit id column and three feature columns, then handed to
    ``self._validate_csv_importer`` together with the same inputs.
    """
    csv_path = "tests/data/driver_features.csv"
    entity_name = "driver"
    granularity = Granularity.DAY
    owner = "*****@*****.**"

    # Keyword arguments passed unchanged to both the factory and the validator.
    shared_kwargs = {
        "staging_location": "gs://test-bucket",
        "id_column": "driver_id",
        "feature_columns": [
            "avg_distance_completed",
            "avg_customer_distance_completed",
            "avg_distance_cancelled",
        ],
    }

    importer = Importer.from_csv(
        path=csv_path,
        entity=entity_name,
        granularity=granularity,
        owner=owner,
        **shared_kwargs
    )

    self._validate_csv_importer(
        importer, csv_path, entity_name, granularity, owner, **shared_kwargs
    )
def test_run_job_no_staging(self, client, mocker):
    """``client.run`` should return the job id reported by the stubbed ``SubmitJob``.

    The job service stub's ``SubmitJob`` is patched to return a canned
    response, and the importer is constructed with
    ``require_staging`` set to ``False``.
    """
    expected_job_id = "myjob12312"

    stub = jobs.JobServiceStub(grpc.insecure_channel(""))
    canned_response = JobServiceTypes.SubmitImportJobResponse(
        jobId=expected_job_id)
    mocker.patch.object(stub, 'SubmitJob', return_value=canned_response)
    client._job_service_stub = stub

    importer = Importer(
        {"import": ImportSpec()},
        None,
        {"require_staging": False},
    )

    assert client.run(importer) == expected_job_id
def test_stage_df_without_timestamp(self, mocker):
    """Stage a CSV importer that was created without a timestamp column.

    ``feast.sdk.importer.df_to_gcs`` is replaced with a mock returning
    ``True`` before the importer is built. The test contains no explicit
    assertions: it passes as long as ``importer.stage(None)`` does not raise.
    """
    # Patch first, exactly as the original did, so the mock is in place for
    # both importer construction and staging.
    mocker.patch("feast.sdk.importer.df_to_gcs", return_value=True)

    importer = Importer.from_csv(
        path="tests/data/driver_features.csv",
        entity="driver",
        owner="*****@*****.**",
        staging_location="gs://test-bucket",
        id_column="driver_id",
        feature_columns=[
            "avg_distance_completed",
            "avg_customer_distance_completed",
            "avg_distance_cancelled",
        ],
    )

    importer.stage(None)
def test_from_csv_feature_columns_not_specified(self):
    """Build a CSV importer without ``feature_columns`` and validate it.

    The importer is created with an id column and a timestamp column only,
    then checked through ``self._validate_csv_importer`` using the same
    inputs.
    """
    csv_path = "tests/data/driver_features.csv"
    entity_name = "driver"
    granularity = Granularity.DAY
    owner = "*****@*****.**"

    # Keyword arguments passed unchanged to both the factory and the validator.
    shared_kwargs = {
        "staging_location": "gs://test-bucket",
        "id_column": "driver_id",
        "timestamp_column": "ts",
    }

    importer = Importer.from_csv(
        path=csv_path,
        entity=entity_name,
        granularity=granularity,
        owner=owner,
        **shared_kwargs
    )

    self._validate_csv_importer(
        importer, csv_path, entity_name, granularity, owner, **shared_kwargs
    )
def test_from_df(self):
    """Check the importer produced by ``Importer.from_df`` for the driver fixture.

    Verifies, in order:

    * the importer requires staging and its remote path contains
      ``<staging_location>/tmp_<entity>``;
    * every registered feature is named after a dataframe column and has
      the id ``"driver.day." + name``;
    * the import spec is a CSV ``file`` import pointing at the remote path
      for the ``driver`` entity;
    * the schema uses ``driver_id`` as the entity id column and its fields
      line up with the dataframe columns, with feature ids set on the
      known feature columns.
    """
    csv_path = "tests/data/driver_features.csv"
    df = pd.read_csv(csv_path)
    staging_location = "gs://test-bucket"
    entity = "driver"
    # Feature ids asserted below are "<entity>.<granularity>.<name>".
    feature_id_prefix = "driver.day."

    importer = Importer.from_df(
        df=df,
        entity=entity,
        granularity=Granularity.DAY,
        owner="*****@*****.**",
        staging_location=staging_location,
        id_column="driver_id",
        timestamp_column="ts",
    )

    assert importer.require_staging
    assert "{}/tmp_{}".format(staging_location, entity) in importer.remote_path

    for feature in importer.features.values():
        assert feature.name in df.columns
        assert feature.id == feature_id_prefix + feature.name

    import_spec = importer.spec
    assert import_spec.type == "file"
    assert import_spec.options == {
        "format": "csv",
        "path": importer.remote_path,
    }
    assert import_spec.entities == ["driver"]

    schema = import_spec.schema
    assert schema.entityIdColumn == "driver_id"
    assert schema.timestampValue is not None

    # Columns expected to carry a feature id; other columns are only
    # checked for name and position.
    feature_columns = {
        "completed",
        "avg_distance_completed",
        "avg_customer_distance_completed",
        "avg_distance_cancelled",
    }
    for col, field in zip(df.columns.values, schema.fields):
        assert col == field.name
        if col in feature_columns:
            assert field.featureId == feature_id_prefix + col
def test_from_csv(self):
    """Build a CSV importer with all columns specified and validate it.

    The importer is created with an explicit id column, two feature
    columns and a timestamp column, then passed to
    ``self._validate_csv_importer`` along with the inputs used to build it.
    """
    csv_path = "tests/data/driver_features.csv"
    entity_name = "driver"
    owner = "*****@*****.**"
    staging_location = "gs://test-bucket"
    id_column = "driver_id"
    timestamp_column = "ts"
    feature_columns = [
        "avg_distance_completed",
        "avg_customer_distance_completed",
    ]

    importer = Importer.from_csv(
        path=csv_path,
        entity=entity_name,
        owner=owner,
        staging_location=staging_location,
        id_column=id_column,
        feature_columns=feature_columns,
        timestamp_column=timestamp_column,
    )

    # Arguments stay positional, in the same order as the original call.
    self._validate_csv_importer(
        importer,
        csv_path,
        entity_name,
        owner,
        staging_location,
        id_column,
        feature_columns,
        timestamp_column,
    )