Example 1
def _BatchElements(pcoll):  # pylint: disable=invalid-name
  """Batches elements either automatically or to the given batch_size."""
  desired_batch_size = Context.get_desired_batch_size()
  kwargs = dict(
      min_batch_size=desired_batch_size, max_batch_size=desired_batch_size
  ) if desired_batch_size is not None else {}
  return pcoll | 'BatchElements' >> util.BatchElements(**kwargs)
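A minimal standalone sketch of the same pattern, with a plain desired_batch_size variable standing in for Context.get_desired_batch_size() (which is defined elsewhere in the original code base); when no size is configured, BatchElements is left to choose batch sizes adaptively:

import apache_beam as beam
from apache_beam.transforms import util

desired_batch_size = 32  # stand-in for Context.get_desired_batch_size()
kwargs = (
    dict(min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)
    if desired_batch_size is not None else {})

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(100))
      | 'BatchElements' >> util.BatchElements(**kwargs)
      | beam.Map(len)    # each element is now a batch (list); report its size
      | beam.Map(print))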
Example 2
def run(args):
  """Runs the embedding generation Beam pipeline."""

  if tf.io.gfile.exists(args.embed_output_dir):
    print('Removing embedding output directory...')
    tf.io.gfile.rmtree(args.embed_output_dir)
  print('Creating empty output directory...')
  tf.io.gfile.makedirs(args.embed_output_dir)

  options = beam.options.pipeline_options.PipelineOptions(**vars(args))

  original_dim = hub.load(args.module_url)(['']).shape[1]

  random_projection_matrix = generate_random_projection_weights(
      original_dim, args.projected_dim, args.embed_output_dir)

  print('Starting the Beam pipeline...')
  with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
    _ = (
        pipeline
        | 'Read sentences from files' >>
        beam.io.ReadFromText(file_pattern=args.data_file_pattern)
        | 'Batch elements' >> util.BatchElements(
            min_batch_size=_BATCH_SIZE // 2, max_batch_size=_BATCH_SIZE)
        | 'Generate embeddings' >> beam.Map(
            generate_embeddings, args.module_url, random_projection_matrix)
        | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
        | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
            file_path_prefix='{}/emb'.format(args.embed_output_dir),
            file_name_suffix='.tfrecords')
    )

  print('Beam pipeline completed.')
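A side note on the beam.Map call above: extra positional arguments after the function are forwarded to it following the element, so each batch is processed as generate_embeddings(batch, module_url, random_projection_matrix). A tiny self-contained illustration of that mechanism (the scale function is purely illustrative):

import apache_beam as beam

def scale(x, factor):
  return x * factor

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([1, 2, 3])
      | beam.Map(scale, 10)  # runs scale(element, 10) for each element
      | beam.Map(print))     # prints 10, 20, 30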
Example 3
def test_constant_batch(self):
    # Assumes a single bundle...
    with TestPipeline() as p:
        res = (p
               | beam.Create(range(35))
               | util.BatchElements(min_batch_size=10, max_batch_size=10)
               | beam.Map(len))
        assert_that(res, equal_to([10, 10, 10, 5]))
Example 4
def test_grows_to_max_batch(self):
    # Assumes a single bundle...
    with TestPipeline() as p:
        res = (p
               | beam.Create(range(164))
               | util.BatchElements(
                   min_batch_size=1, max_batch_size=50, clock=FakeClock())
               | beam.Map(len))
        assert_that(res, equal_to([1, 1, 2, 4, 8, 16, 32, 50, 50]))
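FakeClock is not shown in these snippets; a minimal stand-in is sketched below, assuming BatchElements accepts any zero-argument callable that returns the current time for its clock parameter. The advance method is illustrative, not part of any Beam API.

import time

class FakeClock(object):
  """A controllable clock so adaptive batch sizing is deterministic in tests."""

  def __init__(self, now=None):
    self._now = now if now is not None else time.time()

  def __call__(self):
    return self._now

  def advance(self, duration):
    self._now += duration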
Example 5
def expand(self, pvalue):
    return (pvalue
            | FlatMap(self._create_image_annotation_pairs)
            | util.BatchElements(min_batch_size=self.min_batch_size,
                                 max_batch_size=self.max_batch_size)
            | ParDo(
                _ImageAnnotateFn(features=self.features,
                                 retry=self.retry,
                                 timeout=self.timeout,
                                 client_options=self.client_options,
                                 metadata=self.metadata)))
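Example 5 batches elements before the ParDo so each remote annotation request covers a whole batch rather than a single element. A minimal, self-contained sketch of the same batching-before-ParDo pattern, with a generic ProcessBatchFn standing in for _ImageAnnotateFn (all names here are illustrative):

import apache_beam as beam
from apache_beam.transforms import util

class ProcessBatchFn(beam.DoFn):
  """Stand-in for a DoFn that makes one external call per batch."""

  def process(self, batch):
    # The real transform would issue a single annotate request for the batch.
    yield len(batch)

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(100))
      | util.BatchElements(min_batch_size=8, max_batch_size=32)
      | beam.ParDo(ProcessBatchFn())
      | beam.Map(print))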
Example 6
def test_windowed_batches(self):
  # Assumes a single bundle, in order...
  with TestPipeline() as p:
    res = (
        p
        | beam.Create(range(47))
        | beam.Map(lambda t: window.TimestampedValue(t, t))
        | beam.WindowInto(window.FixedWindows(30))
        | util.BatchElements(
            min_batch_size=5, max_batch_size=10, clock=FakeClock())
        | beam.Map(len))
    assert_that(res, equal_to([
        5, 5, 10, 10,  # elements in [0, 30)
        10, 7,         # elements in [30, 47)
    ]))
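For experimenting outside the test harness, a runnable standalone variant of the same windowed pipeline is sketched below, using only the public Beam APIs already shown. Without the fake clock the exact batch sizes may differ from the asserted sequence; what holds is that no batch spans a window boundary.

import apache_beam as beam
from apache_beam.transforms import util, window

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(range(47))
      | beam.Map(lambda t: window.TimestampedValue(t, t))
      | beam.WindowInto(window.FixedWindows(30))
      | util.BatchElements(min_batch_size=5, max_batch_size=10)
      | beam.Map(len)      # batch sizes, one per emitted batch
      | beam.Map(print))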