@pytest.fixture(scope="session")
def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str:
    # Run the Beam pipeline in Dataflow, making sure GPUs are used.
    yield from utils.cloud_build_submit(
        config="run.yaml",
        substitutions={
            "_JOB_NAME": utils.hyphen_name(NAME),
            "_IMAGE": f"{NAME}:{utils.uuid}",
            "_TEMP_LOCATION": f"gs://{bucket_name}/temp",
            "_REGION": utils.region,
        },
        source="--no-source",
    )

def test_tensorflow_landsat(
    utils: Utils, bucket_name: str, run_dataflow_job: str
) -> None:
    # Wait until the job finishes.
    timeout = 30 * 60  # 30 minutes
    status = utils.dataflow_jobs_wait(
        job_name=utils.hyphen_name(NAME), timeout_sec=timeout
    )
    assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status"

    # Check that output files were created and are not empty.
    storage_client = storage.Client()
    print(f">> Checking for output files in: gs://{bucket_name}/outputs/")
    output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/"))
    assert len(output_files) > 0, f"No files found in gs://{bucket_name}/outputs/"
    for output_file in output_files:
        assert output_file.size > 0, f"Output file is empty: {output_file.name}"

@pytest.fixture(scope="session")
def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
    # Publish alternating positive/negative review messages to the topic.
    yield from utils.pubsub_publisher(
        pubsub_topic,
        new_msg=lambda i: json.dumps(
            {
                "url": "https://beam.apache.org/",
                "review": "positive" if i % 2 == 0 else "negative",
            }
        ),
    )

def test_flex_template_streaming_beam(
    utils: Utils,
    bucket_name: str,
    pubsub_publisher: str,
    pubsub_subscription: str,
    flex_template_path: str,
    bigquery_dataset: str,
) -> None:
    bigquery_table = "output_table"
    job_id = utils.dataflow_flex_template_run(
        job_name=NAME,
        template_path=flex_template_path,
        bucket_name=bucket_name,
        parameters={
            "input_subscription": pubsub_subscription,
            "output_table": f"{bigquery_dataset}.{bigquery_table}",
        },
    )

    # Since this is a streaming job, it never finishes on its own.
    # First, wait until the job is running.
    utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING")

    # Then give it 3 minutes for data to arrive and get processed, and drain the job.
    time.sleep(3 * 60)
    utils.dataflow_jobs_cancel(job_id, drain=True)

    # Check for the output data in BigQuery.
    query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"
    rows = list(utils.bigquery_query(query))
    assert len(rows) > 0
    for row in rows:
        assert "score" in row

@pytest.fixture(scope="session")
def dataflow_job_id(
    utils: Utils,
    bucket_name: str,
    flex_template_path: str,
    bigquery_dataset: str,
    pubsub_subscription: str,
) -> str:
    yield from utils.dataflow_flex_template_run(
        job_name=NAME,
        template_path=flex_template_path,
        bucket_name=bucket_name,
        parameters={
            "input_subscription": pubsub_subscription,
            "output_table": f"{bigquery_dataset}.{BIGQUERY_TABLE}",
        },
    )

@pytest.fixture(scope="session")
def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
    yield from utils.dataflow_flex_template_build(bucket_name, flex_template_image)

@pytest.fixture(scope="session")
def flex_template_image(utils: Utils) -> str:
    yield from utils.cloud_build_submit(NAME)

@pytest.fixture(scope="session")
def bigquery_dataset(utils: Utils) -> str:
    yield from utils.bigquery_dataset(NAME)

@pytest.fixture(scope="session")
def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
    yield from utils.pubsub_subscription(pubsub_topic, NAME)

@pytest.fixture(scope="session")
def pubsub_topic(utils: Utils) -> str:
    yield from utils.pubsub_topic(NAME)

@pytest.fixture(scope="session")
def bucket_name(utils: Utils) -> str:
    yield from utils.storage_bucket(NAME)

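# For context, a minimal sketch (an assumption, not the actual conftest code) of
# the pattern the one-line resource fixtures above rely on: the Utils helper
# creates the cloud resource, yields its name to the tests, and deletes it on
# teardown. The function and bucket names below are illustrative only.
import uuid

from google.cloud import storage


def storage_bucket_sketch(name: str):
    # Hypothetical stand-in for utils.storage_bucket.
    client = storage.Client()
    bucket = client.create_bucket(f"{name}-{uuid.uuid4().hex[:8]}")
    try:
        yield bucket.name
    finally:
        # force=True deletes any objects still left in the bucket.
        bucket.delete(force=True)
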
def test_flex_template_streaming_beam(utils: Utils, dataflow_job_id: str) -> None:
    # Wait until the Dataflow job starts running successfully.
    # The job is cancelled as part of the fixture teardown to avoid leaking resources.
    utils.dataflow_jobs_wait(dataflow_job_id, target_states={"JOB_STATE_RUNNING"})

def test_tensorflow_minimal(utils: Utils, run_dataflow_job: str) -> None:
    # Wait until the job finishes.
    status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME))
    assert status == "JOB_STATE_DONE", f"Dataflow pipeline finished in {status} status"

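# For reference, a hedged sketch of how a wait helper like utils.dataflow_jobs_wait
# could poll job state through the Dataflow REST API. This is illustrative only:
# the real helper lives in the shared conftest, and the function name, project,
# region, and polling interval below are assumptions.
import time

from googleapiclient.discovery import build


def wait_for_job_state_sketch(
    project: str, region: str, job_name: str, target_state: str, timeout_sec: int = 1800
) -> str:
    dataflow = build("dataflow", "v1b3")
    deadline = time.time() + timeout_sec
    state = "JOB_STATE_UNKNOWN"
    while time.time() < deadline:
        # List jobs in the region and look up the one we launched by name.
        response = (
            dataflow.projects()
            .locations()
            .jobs()
            .list(projectId=project, location=region)
            .execute()
        )
        for job in response.get("jobs", []):
            if job["name"] == job_name:
                state = job["currentState"]
        # Stop when the target state or a terminal state is reached.
        if state == target_state or state in (
            "JOB_STATE_DONE",
            "JOB_STATE_FAILED",
            "JOB_STATE_CANCELLED",
        ):
            return state
        time.sleep(30)  # poll every 30 seconds
    return state
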
@pytest.fixture(scope="session")
def build_image(utils: Utils) -> str:
    # Build the container image used by the Dataflow job via Cloud Build.
    yield from utils.cloud_build_submit(
        image_name=NAME,
        config="build.yaml",
        substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"},
    )

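# The excerpts above assume module-level context that is not shown here: the
# Utils helper is provided by the samples' shared conftest, and NAME and
# BIGQUERY_TABLE are module-level constants identifying the sample under test
# and its output table. The imports the shown code relies on are:
import json
import time

import pytest
from google.cloud import storage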