def run_job(
    job_class: Type[base_jobs.JobBase],
    sync: bool,
    namespace: Optional[str] = None,
    pipeline: Optional[beam.Pipeline] = None
) -> beam_job_models.BeamJobRunModel:
    """Runs the specified job. When sync is True, the function waits for the
    job to finish running before returning a value; otherwise the job is
    deployed to the Dataflow service and the function returns immediately.

    Args:
        job_class: type(base_jobs.JobBase). The type of job to run.
        sync: bool. Whether to run the job synchronously.
        namespace: str. The namespace in which models should be created.
        pipeline: Pipeline. The pipeline to run the job upon. If omitted, then
            a new pipeline will be used instead.

    Returns:
        BeamJobRunModel. Contains metadata related to the execution status of
        the job.

    Raises:
        RuntimeError. Failed to deploy given job to the Dataflow service.
    """
    if pipeline is None:
        pipeline = beam.Pipeline(
            runner=runners.DirectRunner() if sync else runners.DataflowRunner(),
            options=job_options.JobOptions(namespace=namespace))

    job = job_class(pipeline)
    job_name = job_class.__name__

    # NOTE: Exceptions raised within this context are logged and suppressed.
    with _job_bookkeeping_context(job_name) as run_model:
        _ = job.run() | job_io.PutResults(run_model.id)

        run_result = pipeline.run()

        if sync:
            run_result.wait_until_finish()
            run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value
        elif run_result.has_job:
            run_model.dataflow_job_id = run_result.job_id()
            run_model.latest_job_state = run_result.state
        else:
            raise RuntimeError(
                'Failed to deploy %s to the Dataflow service. Please try '
                'again after a few minutes.' % job_name)

    # NDB operations in Beam do not properly update the context cache (this
    # cache is separate for every application thread), so we clear it
    # ourselves.
    with datastore_services.get_ndb_context() as ndb_context:
        ndb_context.clear_cache()

    return run_model
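# A minimal usage sketch for run_job. CountModelsJob is a hypothetical
# subclass of base_jobs.JobBase, not defined above; only the call pattern
# is taken from the function itself. With sync=True the DirectRunner
# blocks until the job finishes:
#
#     run_model = run_job(CountModelsJob, sync=True, namespace='dev')
#     assert (
#         run_model.latest_job_state ==
#         beam_job_models.BeamJobState.DONE.value)
#
# With sync=False the job is instead deployed to Dataflow, and the returned
# model carries the remote job's id and state for later polling.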
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    pipeline = create_pipeline(
        FLAGS.task, FLAGS.dataset_paths, FLAGS.csv_file_path,
        FLAGS.word_vocab_path, FLAGS.max_token_per_label,
        FLAGS.max_label_per_node, FLAGS.output_vocab_path,
        FLAGS.output_tfexample_path)
    runners.DataflowRunner().run_pipeline(pipeline)
def run_type(pipeline, runner_type):
    """Executes the pipeline with the given runner type."""
    if runner_type == RunnerType.DIRECT:
        print("Running pipeline with the direct runner; this might take a "
              "long time!")
        return direct_runner.DirectRunner().run(pipeline)
    if runner_type == RunnerType.DATAFLOW:
        options = pipeline_options.PipelineOptions()
        gc_options = options.view_as(pipeline_options.GoogleCloudOptions)
        gc_options.project = FLAGS.gc_project
        gc_options.region = FLAGS.gc_region
        gc_options.job_name = FLAGS.gc_job_name
        gc_options.staging_location = FLAGS.gc_staging_location
        gc_options.temp_location = FLAGS.gc_temp_location
        setup = options.view_as(pipeline_options.SetupOptions)
        setup.extra_packages = FLAGS.extra_packages
        return runners.DataflowRunner().run(pipeline, options=options)
    raise ValueError(f"Unsupported runner type: {runner_type}")
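# RunnerType is referenced above but not defined in this snippet. A minimal
# sketch consistent with the comparisons in run_type (an assumption, not the
# source's actual definition) would be:
import enum


class RunnerType(enum.Enum):
    """Execution backends accepted by run_type()."""
    DIRECT = "direct"
    DATAFLOW = "dataflow"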
def main(_):
    pipeline = create_pipeline(FLAGS.screen_id_file, FLAGS.input_dir,
                               FLAGS.output_path, FLAGS.clean_tf_example,
                               FLAGS.csv_label_file)
    runners.DataflowRunner().run_pipeline(pipeline)
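# Both main() functions above follow the absl.app convention (FLAGS,
# app.UsageError). The usual module footer wiring them up is assumed here,
# not shown in the snippet:
from absl import app

if __name__ == "__main__":
    app.run(main)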