Code example #1
File: job_test_utils.py Project: ReshuKumari/oppia
 def __init__(self, *args, **kwargs):
     super(PipelinedTestBase, self).__init__(*args, **kwargs)
     self.pipeline = test_pipeline.TestPipeline(
         runner=runners.DirectRunner(),
         options=test_pipeline.PipelineOptions(
             runtime_type_check=self.RUNTIME_TYPE_CHECK))
     self._pipeline_context_stack = None
 def test_beam_pipeline_sequence_example(self):
     with InMemoryTFRecord([
             self._create_first_tf_example(),
             self._create_second_tf_example()
     ]) as input_tfrecord:
         runner = runners.DirectRunner()
         temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
         output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
         sequence_key = six.ensure_binary('image/seq_id')
         max_num_elements = 10
         num_shards = 1
         pipeline = add_context_to_examples.construct_pipeline(
             input_tfrecord,
             output_tfrecord,
             sequence_key,
             max_num_elements_in_context_features=max_num_elements,
             num_shards=num_shards,
             output_type='tf_sequence_example')
         runner.run(pipeline)
         filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
         actual_output = []
         record_iterator = tf.python_io.tf_record_iterator(
             path=filenames[0])
         for record in record_iterator:
             actual_output.append(record)
         self.assertEqual(len(actual_output), 1)
         self.assert_expected_sequence_example([
             tf.train.SequenceExample.FromString(tf_example)
             for tf_example in actual_output
         ])
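This test reads its sharded output back with the TF1-era tf.python_io.tf_record_iterator API. A minimal TF2-style sketch of the same read-back, assuming TensorFlow 2.x and the same output_tfrecord prefix as above:

import tensorflow as tf

# Match every shard written as <prefix>-00000-of-00001, etc.
filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
# TFRecordDataset yields the raw serialized protos, one per record.
actual_output = [record.numpy()
                 for record in tf.data.TFRecordDataset(filenames)]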
Code example #3
File: jobs_manager.py Project: vojtechjelinek/oppia
def run_job(
    job_class: Type[base_jobs.JobBase],
    sync: bool,
    namespace: Optional[str] = None,
    pipeline: Optional[beam.Pipeline] = None
) -> beam_job_models.BeamJobRunModel:
    """Runs the specified job synchronously.

    In other words, the function will wait for the job to finish running before
    returning a value.

    Args:
        job_class: type(base_jobs.JobBase). The type of job to run.
        sync: bool. Whether to run the job synchronously.
        namespace: str. The namespace in which models should be created.
        pipeline: Pipeline. The pipeline to run the job upon. If omitted, a new
            pipeline is created instead.

    Returns:
        BeamJobRun. Contains metadata related to the execution status of the
        job.

    Raises:
        RuntimeError. Failed to deploy given job to the Dataflow service.
    """
    if pipeline is None:
        pipeline = beam.Pipeline(
            runner=runners.DirectRunner()
            if sync else runners.DataflowRunner(),
            options=job_options.JobOptions(namespace=namespace))

    job = job_class(pipeline)
    job_name = job_class.__name__

    # NOTE: Exceptions raised within this context are logged and suppressed.
    with _job_bookkeeping_context(job_name) as run_model:
        _ = job.run() | job_io.PutResults(run_model.id)

        run_result = pipeline.run()

        if sync:
            run_result.wait_until_finish()
            run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value

        elif run_result.has_job:
            run_model.dataflow_job_id = run_result.job_id()
            run_model.latest_job_state = run_result.state

        else:
            raise RuntimeError(
                'Failed to deploy %s to the Dataflow service. Please try again '
                'after a few minutes.' % job_name)

    # NDB operations in Beam do not properly update the context cache
    # (this cache is separate for every application thread), thus we clear
    # it ourselves.
    with datastore_services.get_ndb_context() as ndb_context:
        ndb_context.clear_cache()

    return run_model
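A minimal usage sketch for run_job. WorkingJob is borrowed from the tests further down this page; treating it as importable here is an assumption:

# Synchronous: runs on the DirectRunner and blocks until the job is DONE.
run = run_job(WorkingJob, sync=True)

# Asynchronous: deploys to the Dataflow service and returns immediately.
run = run_job(WorkingJob, sync=False)
print(run.latest_job_state)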
Code example #4
def main(_, runner=None):
    # must create before flags are used
    if runner is None:
        runner = runners.DirectRunner()

    tasks = []
    for problem in problems.PROBLEMS_BY_NAME.values():
        if (FLAGS.problem_filter
                and not re.search(FLAGS.problem_filter, problem.name)):
            continue

        if FLAGS.quick_run and problem.width * problem.height > 64**2:
            continue

        for seed in range(-1, FLAGS.num_seeds):
            if seed >= 0:
                tasks.append((problem.name, seed, 'cnn', 'lbfgs'))
            tasks.append((problem.name, seed, 'pixels', 'lbfgs'))
            tasks.append((problem.name, seed, 'pixels', 'oc'))
            tasks.append((problem.name, seed, 'pixels', 'mma'))

    if not tasks:
        raise RuntimeError('no tasks to run')

    pipeline = (
        beam.Create(tasks)
        | beam.Map(run_optimization)
        | beam.Reshuffle()  # don't fuse optimizations together
        | 'group seeds' >> beam.GroupByKey()
        | beam.Map(groupby_seeds)
        | 'group methods' >> beam.GroupByKey()
        | beam.Map(groupby_methods)
        | beam.combiners.ToList()
        | beam.Map(save_all_losses))
    runner.run(pipeline)
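Note that this main() never builds a beam.Pipeline object: PipelineRunner.run accepts a bare PTransform (or a callable taking the pipeline root) and wraps it in a fresh pipeline internally, which is why the chained transforms can be passed straight to runner.run.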
Code example #5
  def test_optimize_multiple_combine_globally(self):
    class MultipleCombines(beam.PTransform):
      def annotations(self):
        return {python_urns.APPLY_COMBINER_PACKING: b''}

      def expand(self, pcoll):
        _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
        _ = pcoll | 'count-globally' >> combiners.Count.Globally()
        _ = pcoll | 'largest-globally' >> core.CombineGlobally(
            combiners.Largest(1))

    pipeline = beam.Pipeline()
    vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    _ = pipeline | Create(vals) | MultipleCombines()
    pipeline_proto = pipeline.to_runner_api()
    optimized_pipeline_proto = translations.optimize_pipeline(
        pipeline_proto, [
            translations.pack_combiners,
        ],
        known_runner_urns=frozenset(),
        partial=True)
    # Tests that Pipeline.from_runner_api() does not throw an exception.
    runner = runners.DirectRunner()
    beam.Pipeline.from_runner_api(
        optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
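This test exercises combiner packing: translations.pack_combiners rewrites sibling combine transforms that consume the same input PCollection (here Mean, Count and Largest) into a single packed combiner, so the input is traversed once rather than once per aggregation. The empty-valued APPLY_COMBINER_PACKING annotation is how a composite PTransform opts its subtransforms into that optimization.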
Code example #6
File: translations_test.py Project: zakazai/beam
 def test_optimize_empty_pipeline(self):
     pipeline = beam.Pipeline()
     pipeline_proto = pipeline.to_runner_api()
     optimized_pipeline_proto = translations.optimize_pipeline(
         pipeline_proto, [], known_runner_urns=frozenset(), partial=True)
     runner = runners.DirectRunner()
     beam.Pipeline.from_runner_api(optimized_pipeline_proto, runner,
                                   pipeline_options.PipelineOptions())
Code example #7
    def test_run_with_empty_model_getter(self):
        pipeline = test_pipeline.TestPipeline(
            runner=runners.DirectRunner(),
            options=job_options.JobOptions(model_getter=None))

        self.assertRaisesRegexp(
            ValueError, 'JobOptions.model_getter must not be None',
            audit_jobs.AuditAllStorageModelsJob(pipeline).run)
Code example #8
 def test_optimize_single_combine_globally(self):
   pipeline = beam.Pipeline()
   vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
   _ = pipeline | Create(vals) | combiners.Count.Globally()
   pipeline_proto = pipeline.to_runner_api()
   optimized_pipeline_proto = translations.optimize_pipeline(
       pipeline_proto, [
           translations.pack_combiners,
       ],
       known_runner_urns=frozenset(),
       partial=True)
   # Tests that Pipeline.from_runner_api() does not throw an exception.
   runner = runners.DirectRunner()
   beam.Pipeline.from_runner_api(
       optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
Code example #9
File: jobs_manager_test.py Project: nlok5923/oppia
    def test_async_job(self) -> None:
        mock_run_result = mock.Mock()
        mock_run_result.has_job = True
        mock_run_result.job_id.return_value = '123'
        mock_run_result.state = 'PENDING'

        pipeline = beam.Pipeline(
            runner=runners.DirectRunner(),
            options=job_options.JobOptions(namespace=self.namespace))

        with self.swap_to_always_return(pipeline, 'run', value=mock_run_result):
            run = jobs_manager.run_job(WorkingJob, False, pipeline=pipeline)

        self.assertEqual(run.dataflow_job_id, '123')
        self.assertEqual(run.latest_job_state, 'PENDING')
Code example #10
def main(_):
    """Runs the Beam pipeline that performs inference.

  Args:
    _: unused
  """
    # must create before flags are used
    runner = runners.DirectRunner()

    dirname = os.path.dirname(FLAGS.detection_output_tfrecord)
    tf.io.gfile.makedirs(dirname)
    runner.run(
        construct_pipeline(FLAGS.detection_input_tfrecord,
                           FLAGS.detection_output_tfrecord,
                           FLAGS.detection_model_dir,
                           FLAGS.confidence_threshold, FLAGS.num_shards))
Code example #11
def main(_):
    """Runs the Beam pipeline that performs inference.

  Args:
    _: unused
  """
    # must create before flags are used
    runner = runners.DirectRunner()

    dirname = os.path.dirname(FLAGS.embedding_output_tfrecord)
    tf.io.gfile.makedirs(dirname)
    runner.run(
        construct_pipeline(FLAGS.embedding_input_tfrecord,
                           FLAGS.embedding_output_tfrecord,
                           FLAGS.embedding_model_dir,
                           FLAGS.top_k_embedding_count,
                           FLAGS.bottom_k_embedding_count, FLAGS.num_shards))
Code example #12
File: jobs_manager_test.py Project: nlok5923/oppia
    def test_async_job_that_does_not_start(self) -> None:
        mock_run_result = mock.Mock()
        mock_run_result.has_job = False
        mock_run_result.job_id.return_value = None
        mock_run_result.state = 'UNKNOWN'

        pipeline = beam.Pipeline(
            runner=runners.DirectRunner(),
            options=job_options.JobOptions(namespace=self.namespace))

        with self.swap_to_always_return(pipeline, 'run', value=mock_run_result):
            run = jobs_manager.run_job(WorkingJob, False, pipeline=pipeline)

        self.assertIsNone(run.dataflow_job_id)
        self.assertEqual(run.latest_job_state, 'FAILED')
        result = beam_job_services.get_beam_job_run_result(run.id)
        self.assertIn('Failed to deploy WorkingJob', result.stderr)
Code example #13
 def test_beam_pipeline(self):
     runner = runners.DirectRunner()
     num_frames = 1
     temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
     json_path = self._create_json_file(temp_dir, num_frames)
     output_tfrecord = temp_dir + '/output'
     self._write_random_images_to_directory(temp_dir, num_frames)
     pipeline = create_cococameratraps_tfexample_main.create_pipeline(
         temp_dir, json_path, output_tfrecord_prefix=output_tfrecord)
     runner.run(pipeline)
     filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
     actual_output = []
     record_iterator = tf.python_io.tf_record_iterator(path=filenames[0])
     for record in record_iterator:
         actual_output.append(record)
     self.assertEqual(len(actual_output), num_frames)
     self.assert_expected_example(
         tf.train.Example.FromString(actual_output[0]))
Code example #14
 def test_optimize_multiple_combine_globally(self):
   pipeline = beam.Pipeline()
   vals = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
   pcoll = pipeline | Create(vals)
   _ = pcoll | 'mean-globally' >> combiners.Mean.Globally()
   _ = pcoll | 'count-globally' >> combiners.Count.Globally()
   _ = pcoll | 'largest-globally' >> core.CombineGlobally(combiners.Largest(1))
   pipeline_proto = pipeline.to_runner_api()
   optimized_pipeline_proto = translations.optimize_pipeline(
       pipeline_proto, [
           translations.pack_combiners,
       ],
       known_runner_urns=frozenset(),
       partial=True)
   # Tests that Pipeline.from_runner_api() does not throw an exception.
   runner = runners.DirectRunner()
   beam.Pipeline.from_runner_api(
       optimized_pipeline_proto, runner, pipeline_options.PipelineOptions())
Code example #15
def main(_):
    """Runs the Beam pipeline that performs inference.

  Args:
    _: unused
  """

    # must create before flags are used
    runner = runners.DirectRunner()

    dirname = os.path.dirname(FLAGS.output_tfrecord_prefix)
    tf.io.gfile.makedirs(dirname)

    runner.run(
        create_pipeline(image_directory=FLAGS.image_directory,
                        input_annotations_file=FLAGS.input_annotations_file,
                        output_tfrecord_prefix=FLAGS.output_tfrecord_prefix,
                        num_images_per_shard=FLAGS.num_images_per_shard))
Code example #16
File: jobs_manager.py Project: sajalasati/oppia
def run_job_sync(
        job_name: str,
        job_args: List[str],
        namespace: Optional[str] = None) -> beam_job_domain.BeamJobRun:
    """Runs the specified job synchronously.

    In other words, the function will wait for the job to finish running before
    returning a value.

    Args:
        job_name: str. The name of the job to run.
        job_args: list(str). The arguments to the job's run() method.
        namespace: str. The namespace in which models should be created.

    Returns:
        BeamJobRun. Contains metadata related to the execution status of the
        job.
    """
    job_pipeline = beam.Pipeline(
        runner=runners.DirectRunner(),
        options=job_options.JobOptions(namespace=namespace))
    job_class = registry.get_job_class_by_name(job_name)

    job = job_class(job_pipeline)
    run_model = beam_job_services.create_beam_job_run_model(job_name, job_args)

    try:
        with job_pipeline:
            unused_pdone = job.run(*job_args) | job_io.PutResults(run_model.id)
    except Exception as exception:
        run_model.latest_job_state = beam_job_models.BeamJobState.FAILED.value
        # If the pipeline fails to put the results into storage, then we'll
        # explicitly write them to storage by using the caught exception.
        result_model = beam_job_services.create_beam_job_run_result_model(
            run_model.id, '', python_utils.UNICODE(exception))
        result_model.put()
    else:
        run_model.latest_job_state = beam_job_models.BeamJobState.DONE.value
    finally:
        run_model.put()

    return beam_job_services.get_beam_job_run_from_model(run_model)
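A hedged invocation sketch for run_job_sync; the job name must already be registered with the jobs registry (an assumption implied by the registry.get_job_class_by_name lookup above), and AuditAllStorageModelsJob is borrowed from the test in code example #7:

# Runs the audit job on the DirectRunner and blocks until it finishes.
run = run_job_sync('AuditAllStorageModelsJob', [])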
Code example #17
 def test_beam_pipeline(self):
     with InMemoryTFRecord([self._create_tf_example()]) as input_tfrecord:
         runner = runners.DirectRunner()
         temp_dir = tempfile.mkdtemp(dir=os.environ.get('TEST_TMPDIR'))
         output_tfrecord = os.path.join(temp_dir, 'output_tfrecord')
         saved_model_path = self._export_saved_model()
         confidence_threshold = 0.8
         num_shards = 1
         pipeline = generate_detection_data.construct_pipeline(
             input_tfrecord, output_tfrecord, saved_model_path,
             confidence_threshold, num_shards)
         runner.run(pipeline)
         filenames = tf.io.gfile.glob(output_tfrecord + '-?????-of-?????')
         actual_output = []
         record_iterator = tf.python_io.tf_record_iterator(
             path=filenames[0])
         for record in record_iterator:
             actual_output.append(record)
         self.assertEqual(len(actual_output), 1)
         self.assert_expected_example(
             tf.train.Example.FromString(actual_output[0]))
Code example #18
def main(_):
    """Runs the Beam pipeline that builds context features.

  Args:
    _: unused
  """
    # must create before flags are used
    runner = runners.DirectRunner()

    dirname = os.path.dirname(FLAGS.output_tfrecord)
    tf.io.gfile.makedirs(dirname)
    runner.run(
        construct_pipeline(
            FLAGS.input_tfrecord, FLAGS.output_tfrecord, FLAGS.sequence_key,
            FLAGS.time_horizon, FLAGS.subsample_context_features_rate,
            FLAGS.reduce_image_size, FLAGS.max_image_dimension,
            FLAGS.add_context_features, FLAGS.sorted_image_ids,
            FLAGS.image_ids_to_keep, FLAGS.keep_context_features_image_id_list,
            FLAGS.keep_only_positives, FLAGS.context_features_score_threshold,
            FLAGS.keep_only_positives_gt,
            FLAGS.max_num_elements_in_context_features, FLAGS.num_shards,
            FLAGS.output_type, FLAGS.max_clip_length))
Code example #19
 def __init__(self, *args, **kwargs):
     super(PipelinedTestBase, self).__init__(*args, **kwargs)
     self.pipeline = test_pipeline.TestPipeline(
         runner=runners.DirectRunner(),
         options=test_pipeline.PipelineOptions(runtime_type_check=True))
     self._close_stack = None
Code example #20
def main(_, runner=None):
    if runner is None:
        # must create before flags are used
        runner = runners.DirectRunner()

    # files
    dataset_path = FLAGS.dataset_path
    tf.io.gfile.makedirs(dataset_path)
    dataset_name = FLAGS.dataset_name
    metadata_path = os.path.join(dataset_path, dataset_name + '.metadata.json')
    records_path = os.path.join(dataset_path, dataset_name + '.tfrecord')
    num_shards = FLAGS.num_shards
    initialization_seed_offset = FLAGS.initialization_seed_offset

    # instantiate components of the system.
    equation_class = equations.matching_equation_type(FLAGS.equation_name,
                                                      FLAGS.discretization)
    equation = equation_class(**ast.literal_eval(FLAGS.equation_kwargs))

    simulation_grid = grids.Grid.from_period(FLAGS.simulation_grid_size,
                                             FLAGS.grid_length)
    output_grid = grids.Grid.from_period(FLAGS.output_grid_size,
                                         FLAGS.grid_length)
    initial_condition_steps = np.arange(0, FLAGS.total_time_steps,
                                        FLAGS.time_step_interval)

    builder_type = builders.DATASET_TYPES[FLAGS.dataset_type]
    builder = builder_type(
        equation,
        simulation_grid,
        output_grid,
        initial_condition_steps,
        example_num_time_steps=FLAGS.example_num_time_steps,
    )

    flags_dict = flags_as_dict()

    seeds = [i + initialization_seed_offset for i in range(FLAGS.num_seeds)]
    rs_params = ast.literal_eval(FLAGS.random_state_params)

    def random_state(seed):
        return equation.random_state(simulation_grid,
                                     params=rs_params,
                                     seed=seed)

    def build_pipeline(root):
        """Builds a pipeline that generates and saves tfrecords and metadata."""

        # NOTE(shoyer): we use Reshuffle transforms to ensure that Beam doesn't
        # consolidate expensive computations into fused tasks that cannot be
        # parallelized.
        generate_pipeline = (root
                             | beam.Create(seeds)
                             | 'random_state' >> beam.Map(random_state)
                             | 'integrate_initial_conditions' >> beam.FlatMap(
                                 builder.integrate_for_initial_conditions)
                             | 'split_integrate_tasks' >> beam.Reshuffle()
                             | 'integrate_each_example' >> beam.Map(
                                 builder.integrate_each_example)
                             | 'postprocess' >> beam.Map(builder.postprocess))

        save_pipeline = (  # pylint: disable=unused-variable
            generate_pipeline
            | 'split_simulation_and_saving' >> beam.Reshuffle()
            | beam.Map(builder.convert_to_tf_example)
            | beam.io.tfrecordio.WriteToTFRecord(records_path,
                                                 num_shards=num_shards))

        statistics_pipeline = (  # pylint: disable=unused-variable
            generate_pipeline
            | 'items' >> beam.FlatMap(lambda state: state.items())
            | 'calculate_statistics' >> beam.CombinePerKey(
                beamlib.MeanVarianceCombineFn())
            | 'combine_statistics' >> beam.combiners.ToDict()
            | 'save_metadata' >> beam.Map(
                builder.save_metadata,
                records_path,
                metadata_path,
                num_shards=num_shards,
                flags=flags_dict,
            ))

    runner.run(build_pipeline)
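The Reshuffle steps above are the standard fusion-break pattern: without them a runner may fuse consecutive stages into one serial task. A self-contained sketch of the pattern, with expensive_step as a placeholder computation:

import apache_beam as beam

def expensive_step(x):
    # Stand-in for an expensive computation.
    return x

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(100))
         | 'first pass' >> beam.Map(expensive_step)
         | beam.Reshuffle()  # materialization point; lets the runner rebalance
         | 'second pass' >> beam.Map(expensive_step))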
Code example #21
File: job_test_utils.py Project: jameesjohn/oppia
 def __init__(self, *args: Any, **kwargs: Any) -> None:
     super(PipelinedTestBase, self).__init__(*args, **kwargs)
     self.pipeline = test_pipeline.TestPipeline(
         runner=runners.DirectRunner(),
         options=job_options.JobOptions(namespace=self.namespace))
     self._pipeline_context_stack: Optional[contextlib.ExitStack] = None