Example #1
from typing import Optional, Type

import apache_beam as beam
from apache_beam import runners

# base_jobs, beam_job_models, datastore_services, job_io and job_options are
# project-internal modules assumed to be importable from the surrounding
# codebase.


def run_job(
    job_class: Type[base_jobs.JobBase],
    sync: bool,
    namespace: Optional[str] = None,
    pipeline: Optional[beam.Pipeline] = None
) -> beam_job_models.BeamJobRunModel:
    """Runs the specified job synchronously.

    In other words, the function will wait for the job to finish running before
    returning a value.

    Args:
        job_class: type(base_jobs.JobBase). The type of job to run.
        sync: bool. Whether to run the job synchronously.
        namespace: str|None. The namespace in which models should be created.
        pipeline: Pipeline|None. The pipeline to run the job on. If omitted, a
            new pipeline will be created instead.

    Returns:
        BeamJobRunModel. Contains metadata related to the execution status
        of the job.

    Raises:
        RuntimeError. Failed to deploy the given job to the Dataflow service.
    """
    if pipeline is None:
        pipeline = beam.Pipeline(
            runner=runners.DirectRunner()
            if sync else runners.DataflowRunner(),
            options=job_options.JobOptions(namespace=namespace))

    job = job_class(pipeline)
    job_name = job_class.__name__

    # NOTE: Exceptions raised within this context are logged and suppressed.
    with _job_bookkeeping_context(job_name) as run_model:
        _ = job.run() | job_io.PutResults(run_model.id)

        run_result = pipeline.run()

        if sync:
            run_result.wait_until_finish()
            run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value

        elif run_result.has_job:
            run_model.dataflow_job_id = run_result.job_id()
            run_model.latest_job_state = run_result.state

        else:
            raise RuntimeError(
                'Failed to deploy %s to the Dataflow service. Please try again '
                'after a few minutes.' % job_name)

    # NDB operations in Beam do not properly update the context cache
    # (this cache is separate for every application thread), thus we clear
    # it ourselves.
    with datastore_services.get_ndb_context() as ndb_context:
        ndb_context.clear_cache()

    return run_model
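
The _job_bookkeeping_context helper used above is not part of the example. A
minimal sketch of what such a context manager might look like, reusing the
example's project-internal beam_job_models module and assuming a
BeamJobRunModel with a put() method plus RUNNING and FAILED states (all of
these names are assumptions, not the actual implementation):

import contextlib
import logging


@contextlib.contextmanager
def _job_bookkeeping_context(job_name):
    """Tracks the state of a job run (hypothetical reconstruction)."""
    # Record the new run; the field names are assumptions.
    run_model = beam_job_models.BeamJobRunModel(
        job_name=job_name,
        latest_job_state=beam_job_models.BeamJobState.RUNNING.value)
    try:
        yield run_model
    except Exception:
        # Matches the NOTE above: exceptions are logged and suppressed.
        logging.exception('%s failed.', job_name)
        run_model.latest_job_state = (
            beam_job_models.BeamJobState.FAILED.value)
    finally:
        run_model.put()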
Example #2
from absl import app
from absl import flags

from apache_beam import runners

FLAGS = flags.FLAGS

# create_pipeline is a project-internal helper assumed to be defined
# elsewhere in this module.


def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  pipeline = create_pipeline(FLAGS.task, FLAGS.dataset_paths,
                             FLAGS.csv_file_path, FLAGS.word_vocab_path,
                             FLAGS.max_token_per_label,
                             FLAGS.max_label_per_node, FLAGS.output_vocab_path,
                             FLAGS.output_tfexample_path)
  runners.DataflowRunner().run_pipeline(pipeline)
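
The flag definitions behind FLAGS are not shown. Continuing the module
above, a minimal sketch of what the flags and the usual abseil entry point
might look like (the flag names match the call site, but their types and
defaults are assumptions). Note that in newer Beam releases,
DataflowRunner.run_pipeline also requires an options argument.

# Assumed flag definitions; the real ones live elsewhere in this module.
flags.DEFINE_string('task', None, 'Name of the preprocessing task.')
flags.DEFINE_list('dataset_paths', [], 'Paths to the input datasets.')
flags.DEFINE_string('csv_file_path', None, 'Path to the label CSV file.')
flags.DEFINE_string('word_vocab_path', None, 'Path to the word vocabulary.')
flags.DEFINE_integer('max_token_per_label', 10, 'Max tokens per label.')
flags.DEFINE_integer('max_label_per_node', 3, 'Max labels per node.')
flags.DEFINE_string('output_vocab_path', None, 'Output vocabulary path.')
flags.DEFINE_string('output_tfexample_path', None, 'Output tf.Example path.')

if __name__ == '__main__':
  app.run(main)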
Example #3
from absl import flags

from apache_beam import runners
from apache_beam.options import pipeline_options
from apache_beam.runners.direct import direct_runner

FLAGS = flags.FLAGS

# RunnerType and create_pipeline are defined elsewhere in this module; see
# the sketch after this example for an assumed RunnerType definition.


def run_type(pipeline, runner_type):
    """Executes pipeline with certain runner type."""
    if runner_type == RunnerType.DIRECT:
        print(
            "Running pipeline with the direct runner; "
            "this might take a long time!")
        return direct_runner.DirectRunner().run(pipeline)
    if runner_type == RunnerType.DATAFLOW:
        options = pipeline_options.PipelineOptions()
        gc_options = options.view_as(pipeline_options.GoogleCloudOptions)
        gc_options.project = FLAGS.gc_project
        gc_options.region = FLAGS.gc_region
        gc_options.job_name = FLAGS.gc_job_name
        gc_options.staging_location = FLAGS.gc_staging_location
        gc_options.temp_location = FLAGS.gc_temp_location
        setup = options.view_as(pipeline_options.SetupOptions)
        setup.extra_packages = FLAGS.extra_packages
        return runners.DataflowRunner().run(pipeline, options=options)
    raise ValueError(f"Unsupported runner type: {runner_type}")


def main(_):
  pipeline = create_pipeline(FLAGS.screen_id_file, FLAGS.input_dir,
                             FLAGS.output_path, FLAGS.clean_tf_example,
                             FLAGS.csv_label_file)
  runners.DataflowRunner().run_pipeline(pipeline)
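
The RunnerType enum that run_type dispatches on is defined outside this
excerpt. A minimal sketch of what it might look like, along with a typical
invocation (the member values are assumptions):

import enum


class RunnerType(enum.Enum):
    """Supported execution backends (assumed definition)."""
    DIRECT = 'direct'
    DATAFLOW = 'dataflow'


# Typical usage: validate the pipeline locally on the direct runner before
# launching a (billed) Dataflow run. Both code paths return a PipelineResult.
result = run_type(pipeline, RunnerType.DIRECT)
result.wait_until_finish()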