def __init__(self, *args, **kwargs):
    super(PipelinedTestBase, self).__init__(*args, **kwargs)
    self.pipeline = test_pipeline.TestPipeline(
        runner=runners.DirectRunner(),
        options=test_pipeline.PipelineOptions(
            runtime_type_check=self.RUNTIME_TYPE_CHECK))
    self._pipeline_context_stack = None
def test_beam_pipeline_sequence_example(self):
    with InMemoryTFRecord([
        self._create_first_tf_example(),
        self._create_second_tf_example()
    ]) as input_tfrecord:
        runner = runners.DirectRunner()
        temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
        output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
        sequence_key = six.ensure_binary('image/seq_id')
        max_num_elements = 10
        num_shards = 1
        pipeline = add_context_to_examples.construct_pipeline(
            input_tfrecord,
            output_tfrecord,
            sequence_key,
            max_num_elements_in_context_features=max_num_elements,
            num_shards=num_shards,
            output_type='tf_sequence_example')
        runner.run(pipeline)
        filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
        actual_output = []
        record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
        for record in record_iterator:
            actual_output.append(record)
        self.assertEqual(len(actual_output), 1)
        self.assert_expected_sequence_example([
            tf.train.SequenceExample.FromString(tf_example)
            for tf_example in actual_output
        ])
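# Side-note sketch (an addition, not original code): tf.python_io.tf_record_iterator
# used above is the TF1 API; on TF2 the equivalent read of serialized records
# goes through tf.data, roughly as follows.
import tensorflow as tf

def read_tfrecord(path):
    # Returns the serialized records in a TFRecord file as a list of bytes.
    return [raw.numpy() for raw in tf.data.TFRecordDataset(path)]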
def run_job(
    job_class: Type[base_jobs.JobBase],
    sync: bool,
    namespace: Optional[str] = None,
    pipeline: Optional[beam.Pipeline] = None
) -> beam_job_models.BeamJobRunModel:
    """Runs the specified job.

    When sync is True, the function waits for the job to finish running
    before returning a value; otherwise the job is deployed to the Dataflow
    service and runs asynchronously.

    Args:
        job_class: type(base_jobs.JobBase). The type of job to run.
        sync: bool. Whether to run the job synchronously.
        namespace: str. The namespace in which models should be created.
        pipeline: Pipeline. The pipeline to run the job upon. If omitted, then
            a new pipeline will be used instead.

    Returns:
        BeamJobRunModel. Contains metadata related to the execution status of
        the job.

    Raises:
        RuntimeError. Failed to deploy the given job to the Dataflow service.
    """
    if pipeline is None:
        pipeline = beam.Pipeline(
            runner=runners.DirectRunner() if sync else runners.DataflowRunner(),
            options=job_options.JobOptions(namespace=namespace))

    job = job_class(pipeline)
    job_name = job_class.__name__

    # NOTE: Exceptions raised within this context are logged and suppressed.
    with _job_bookkeeping_context(job_name) as run_model:
        _ = job.run() | job_io.PutResults(run_model.id)

        run_result = pipeline.run()

        if sync:
            run_result.wait_until_finish()
            run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value
        elif run_result.has_job:
            run_model.dataflow_job_id = run_result.job_id()
            run_model.latest_job_state = run_result.state
        else:
            raise RuntimeError(
                'Failed to deploy %s to the Dataflow service. Please try again '
                'after a few minutes.' % job_name)

    # NDB operations in Beam do not properly update the context cache (this
    # cache is separate for every application thread), thus we clear it
    # ourselves.
    with datastore_services.get_ndb_context() as ndb_context:
        ndb_context.clear_cache()

    return run_model
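# Hedged usage sketch (an addition, not original code): WorkingJob is a
# placeholder job class, borrowed from the tests elsewhere in this section.
run_model = run_job(WorkingJob, sync=True)
# With sync=True the DirectRunner blocks until completion, so the bookkeeping
# model should have reached the DONE state.
assert run_model.latest_job_state == beam_job_models.BeamJobState.DONE.value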
def main(_, runner=None):
    # must create before flags are used
    if runner is None:
        runner = runners.DirectRunner()

    tasks = []
    for problem in problems.PROBLEMS_BY_NAME.values():
        if (FLAGS.problem_filter and
                not re.search(FLAGS.problem_filter, problem.name)):
            continue
        if FLAGS.quick_run and problem.width * problem.height > 64**2:
            continue

        for seed in range(-1, FLAGS.num_seeds):
            if seed >= 0:
                tasks.append((problem.name, seed, 'cnn', 'lbfgs'))
                tasks.append((problem.name, seed, 'pixels', 'lbfgs'))
                tasks.append((problem.name, seed, 'pixels', 'oc'))
                tasks.append((problem.name, seed, 'pixels', 'mma'))

    if not tasks:
        raise RuntimeError('no tasks to run')

    pipeline = (
        beam.Create(tasks)
        | beam.Map(run_optimization)
        | beam.Reshuffle()  # don't fuse optimizations together
        | 'group seeds' >> beam.GroupByKey()
        | beam.Map(groupby_seeds)
        | 'group methods' >> beam.GroupByKey()
        | beam.Map(groupby_methods)
        | beam.combiners.ToList()
        | beam.Map(save_all_losses))

    runner.run(pipeline)
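# A minimal, self-contained sketch (an addition, not original code) of the
# Reshuffle trick used above: beam.Reshuffle() acts as a fusion break, so the
# expensive Map is not fused with the downstream stages and can be
# parallelized independently.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(range(8))
        | beam.Map(lambda x: x * x)  # stand-in for an expensive computation
        | beam.Reshuffle()           # fusion break between stages
        | beam.combiners.ToList()
        | beam.Map(print))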
def test_optimize_multiple_combine_globally(self):
    class MultipleCombines(beam.PTransform):
        def annotations(self):
            return {python_urns.APPLY_COMBINER_PACKING: b''}

        def expand(self, pcoll):
            _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
            _ = pcoll | 'count-globally' >> combiners.Count.Globally()
            _ = pcoll | 'largest-globally' >> core.CombineGlobally(
                combiners.Largest(1))

    pipeline = beam.Pipeline()
    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    _ = pipeline | Create(vals) | MultipleCombines()
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
def test_optimize_empty_pipeline(self):
    pipeline = beam.Pipeline()
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [], known_runner_urns=frozenset(), partial=True)
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
def test_run_with_empty_model_getter(self):
    pipeline = test_pipeline.TestPipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(model_getter=None))

    self.assertRaisesRegexp(
        ValueError, 'JobOptions.model_getter must not be None',
        audit_jobs.AuditAllStorageModelsJob(pipeline).run)
def test_optimize_single_combine_globally(self):
    pipeline = beam.Pipeline()
    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    _ = pipeline | Create(vals) | combiners.Count.Globally()
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
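# Hedged sketch (an addition, not original code) of the proto round trip
# these tests exercise: any pipeline can be serialized with to_runner_api()
# and rebuilt with Pipeline.from_runner_api() before being run.
import apache_beam as beam
from apache_beam import runners
from apache_beam.options import pipeline_options

p = beam.Pipeline()
_ = p | beam.Create([1, 2, 3]) | beam.combiners.Count.Globally()
proto = p.to_runner_api()
rebuilt = beam.Pipeline.from_runner_api(
    proto, runners.DirectRunner(), pipeline_options.PipelineOptions())
rebuilt.run().wait_until_finish()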
def test_async_job(self) -> None:
    mock_run_result = mock.Mock()
    mock_run_result.has_job = True
    mock_run_result.job_id.return_value = '123'
    mock_run_result.state = 'PENDING'

    pipeline = beam.Pipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(namespace=self.namespace))
    with self.swap_to_always_return(pipeline, 'run', value=mock_run_result):
        run = jobs_manager.run_job(WorkingJob, False, pipeline=pipeline)

    self.assertEqual(run.dataflow_job_id, '123')
    self.assertEqual(run.latest_job_state, 'PENDING')
def main(_): """Runs the Beam pipeline that performs inference. Args: _: unused """ # must create before flags are used runner = runners.DirectRunner() dirname = os.path.dirname(FLAGS.detection_output_tfrecord) tf.io.gfile.makedirs(dirname) runner.run( construct_pipeline(FLAGS.detection_input_tfrecord, FLAGS.detection_output_tfrecord, FLAGS.detection_model_dir, FLAGS.confidence_threshold, FLAGS.num_shards))
def main(_): """Runs the Beam pipeline that performs inference. Args: _: unused """ # must create before flags are used runner = runners.DirectRunner() dirname = os.path.dirname(FLAGS.embedding_output_tfrecord) tf.io.gfile.makedirs(dirname) runner.run( construct_pipeline(FLAGS.embedding_input_tfrecord, FLAGS.embedding_output_tfrecord, FLAGS.embedding_model_dir, FLAGS.top_k_embedding_count, FLAGS.bottom_k_embedding_count, FLAGS.num_shards))
def test_async_job_that_does_not_start(self) -> None:
    mock_run_result = mock.Mock()
    mock_run_result.has_job = False
    mock_run_result.job_id.return_value = None
    mock_run_result.state = 'UNKNOWN'

    pipeline = beam.Pipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(namespace=self.namespace))
    with self.swap_to_always_return(pipeline, 'run', value=mock_run_result):
        run = jobs_manager.run_job(WorkingJob, False, pipeline=pipeline)

    self.assertIsNone(run.dataflow_job_id)
    self.assertEqual(run.latest_job_state, 'FAILED')

    result = beam_job_services.get_beam_job_run_result(run.id)
    self.assertIn('Failed to deploy WorkingJob', result.stderr)
def test_beam_pipeline(self):
    runner = runners.DirectRunner()
    num_frames = 1
    temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
    json_path = self._create_json_file(temp_dir, num_frames)
    output_tfrecord = temp_dir + '/output'
    self._write_random_images_to_directory(temp_dir, num_frames)
    pipeline = create_cococameratraps_tfexample_main.create_pipeline(
        temp_dir, json_path, output_tfrecord_prefix=output_tfrecord)
    runner.run(pipeline)
    filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
    actual_output = []
    record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
    for record in record_iterator:
        actual_output.append(record)
    self.assertEqual(len(actual_output), num_frames)
    self.assert_expected_example(
        tf.train.Example.FromString(actual_output[0]))
def test_optimize_multiple_combine_globally(self):
    pipeline = beam.Pipeline()
    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | Create(vals)
    _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
    _ = pcoll | 'count-globally' >> combiners.Count.Globally()
    _ = pcoll | 'largest-globally' >> core.CombineGlobally(
        combiners.Largest(1))
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
def main(_): """Runs the Beam pipeline that performs inference. Args: _: unused """ # must create before flags are used runner = runners.DirectRunner() dirname = os.path.dirname(FLAGS.output_tfrecord_prefix) tf.io.gfile.makedirs(dirname) runner.run( create_pipeline(image_directory=FLAGS.image_directory, input_annotations_file=FLAGS.input_annotations_file, output_tfrecord_prefix=FLAGS.output_tfrecord_prefix, num_images_per_shard=FLAGS.num_images_per_shard))
def run_job_sync(
    job_name: str,
    job_args: List[str],
    namespace: Optional[str] = None) -> beam_job_domain.BeamJobRun:
    """Runs the specified job synchronously.

    In other words, the function will wait for the job to finish running
    before returning a value.

    Args:
        job_name: str. The name of the job to run.
        job_args: list(str). The arguments to the job's run() method.
        namespace: str. The namespace in which models should be created.

    Returns:
        BeamJobRun. Contains metadata related to the execution status of the
        job.
    """
    job_pipeline = beam.Pipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(namespace=namespace))
    job_class = registry.get_job_class_by_name(job_name)

    job = job_class(job_pipeline)
    run_model = beam_job_services.create_beam_job_run_model(job_name, job_args)

    try:
        with job_pipeline:
            unused_pdone = job.run(*job_args) | job_io.PutResults(run_model.id)
    except Exception as exception:
        run_model.latest_job_state = beam_job_models.BeamJobState.FAILED.value
        # If the pipeline fails to put the results into storage, then we'll
        # explicitly write them to storage by using the caught exception.
        result_model = beam_job_services.create_beam_job_run_result_model(
            run_model.id, '', python_utils.UNICODE(exception))
        result_model.put()
    else:
        run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value
    finally:
        run_model.put()

    return beam_job_services.get_beam_job_run_from_model(run_model)
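# Hedged usage sketch (an addition, not original code): the job name must
# match a class known to the registry; AuditAllStorageModelsJob is borrowed
# from the test elsewhere in this section as an illustrative example.
run = run_job_sync('AuditAllStorageModelsJob', [])  # returns a BeamJobRun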
def test_beam_pipeline(self):
    with InMemoryTFRecord([self._create_tf_example()]) as input_tfrecord:
        runner = runners.DirectRunner()
        temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
        output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
        saved_model_path = self._export_saved_model()
        confidence_threshold = 0.8
        num_shards = 1
        pipeline = generate_detection_data.construct_pipeline(
            input_tfrecord, output_tfrecord, saved_model_path,
            confidence_threshold, num_shards)
        runner.run(pipeline)
        filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
        actual_output = []
        record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
        for record in record_iterator:
            actual_output.append(record)
        self.assertEqual(len(actual_output), 1)
        self.assert_expected_example(
            tf.train.Example.FromString(actual_output[0]))
def main(_): """Runs the Beam pipeline that builds context features. Args: _: unused """ # must create before flags are used runner = runners.DirectRunner() dirname = os.path.dirname(FLAGS.output_tfrecord) tf.io.gfile.makedirs(dirname) runner.run( construct_pipeline( FLAGS.input_tfrecord, FLAGS.output_tfrecord, FLAGS.sequence_key, FLAGS.time_horizon, FLAGS.subsample_context_features_rate, FLAGS.reduce_image_size, FLAGS.max_image_dimension, FLAGS.add_context_features, FLAGS.sorted_image_ids, FLAGS.image_ids_to_keep, FLAGS.keep_context_features_image_id_list, FLAGS.keep_only_positives, FLAGS.context_features_score_threshold, FLAGS.keep_only_positives_gt, FLAGS.max_num_elements_in_context_features, FLAGS.num_shards, FLAGS.output_type, FLAGS.max_clip_length))
def __init__(self, *args, **kwargs):
    super(PipelinedTestBase, self).__init__(*args, **kwargs)
    self.pipeline = test_pipeline.TestPipeline(
        runner=runners.DirectRunner(),
        options=test_pipeline.PipelineOptions(runtime_type_check=True))
    self._close_stack = None
def main(_, runner=None):
    if runner is None:
        # must create before flags are used
        runner = runners.DirectRunner()

    # files
    dataset_path = FLAGS.dataset_path
    tf.io.gfile.makedirs(dataset_path)
    dataset_name = FLAGS.dataset_name
    metadata_path = os.path.join(dataset_path, dataset_name + '.metadata.json')
    records_path = os.path.join(dataset_path, dataset_name + '.tfrecord')
    num_shards = FLAGS.num_shards
    initialization_seed_offset = FLAGS.initialization_seed_offset

    # instantiate components of the system.
    equation_class = equations.matching_equation_type(FLAGS.equation_name,
                                                      FLAGS.discretization)
    equation = equation_class(**ast.literal_eval(FLAGS.equation_kwargs))
    simulation_grid = grids.Grid.from_period(FLAGS.simulation_grid_size,
                                             FLAGS.grid_length)
    output_grid = grids.Grid.from_period(FLAGS.output_grid_size,
                                         FLAGS.grid_length)
    initial_condition_steps = np.arange(0, FLAGS.total_time_steps,
                                        FLAGS.time_step_interval)
    builder_type = builders.DATASET_TYPES[FLAGS.dataset_type]
    builder = builder_type(
        equation,
        simulation_grid,
        output_grid,
        initial_condition_steps,
        example_num_time_steps=FLAGS.example_num_time_steps,
    )
    flags_dict = flags_as_dict()

    seeds = [i + initialization_seed_offset for i in range(FLAGS.num_seeds)]
    rs_params = ast.literal_eval(FLAGS.random_state_params)

    def random_state(seed):
        return equation.random_state(
            simulation_grid, params=rs_params, seed=seed)

    def build_pipeline(root):
        """Builds a pipeline that generates and saves tfrecords and metadata."""
        # NOTE(shoyer): we use Reshuffle transforms to ensure that Beam doesn't
        # consolidate expensive computations into fused tasks that cannot be
        # parallelized.
        generate_pipeline = (
            root
            | beam.Create(seeds)
            | 'random_state' >> beam.Map(random_state)
            | 'integrate_initial_conditions' >> beam.FlatMap(
                builder.integrate_for_initial_conditions)
            | 'split_integrate_tasks' >> beam.Reshuffle()
            | 'integrate_each_example' >> beam.Map(
                builder.integrate_each_example)
            | 'postprocess' >> beam.Map(builder.postprocess))
        save_pipeline = (  # pylint: disable=unused-variable
            generate_pipeline
            | 'split_simulation_and_saving' >> beam.Reshuffle()
            | beam.Map(builder.convert_to_tf_example)
            | beam.io.tfrecordio.WriteToTFRecord(
                records_path, num_shards=num_shards))
        statistics_pipeline = (  # pylint: disable=unused-variable
            generate_pipeline
            | 'items' >> beam.FlatMap(lambda state: state.items())
            | 'calculate_statistics' >> beam.CombinePerKey(
                beamlib.MeanVarianceCombineFn())
            | 'combine_statistics' >> beam.combiners.ToDict()
            | 'save_metadata' >> beam.Map(
                builder.save_metadata,
                records_path,
                metadata_path,
                num_shards=num_shards,
                flags=flags_dict,
            ))

    runner.run(build_pipeline)
def __init__(self, *args: Any, **kwargs: Any) -> None:
    super(PipelinedTestBase, self).__init__(*args, **kwargs)
    self.pipeline = test_pipeline.TestPipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(namespace=self.namespace))
    self._pipeline_context_stack: Optional[contextlib.ExitStack] = None
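# Hedged sketch (an addition, not original code) of how a TestPipeline like
# the one built above is typically exercised: assertions are attached as
# transforms and verified when the pipeline runs on exiting the context
# manager.
import apache_beam as beam
from apache_beam import runners
from apache_beam.testing import test_pipeline
from apache_beam.testing.util import assert_that, equal_to

with test_pipeline.TestPipeline(runner=runners.DirectRunner()) as p:
    squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)
    assert_that(squares, equal_to([1, 4, 9]))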