def _BatchElements(pcoll):  # pylint: disable=invalid-name
  """Batches elements either automatically or to the given batch_size.

  When the context supplies a desired batch size, both the minimum and the
  maximum batch size are pinned to it so every batch (except possibly the
  last) has exactly that many elements; otherwise BatchElements is left to
  tune the batch size on its own.
  """
  size = Context.get_desired_batch_size()
  if size is None:
    batch_kwargs = {}
  else:
    batch_kwargs = {'min_batch_size': size, 'max_batch_size': size}
  return pcoll | 'BatchElements' >> util.BatchElements(**batch_kwargs)
def run(args):
  """Runs the embedding generation Beam pipeline.

  Reads sentences from text files, embeds them with a TF-Hub module (with an
  optional random projection), and writes the results as TFRecords.

  Args:
    args: Parsed command-line namespace. Uses ``embed_output_dir`` (output
      directory, recreated from scratch on every run), ``module_url``
      (TF-Hub embedding module), ``projected_dim`` (random-projection
      target dimensionality) and ``data_file_pattern`` (input text files).
      All attributes are also forwarded to Beam as pipeline options.
  """
  # Start from an empty output directory so stale shards from a previous
  # run never mix with freshly written ones.
  if tf.io.gfile.exists(args.embed_output_dir):
    print('Removing embedding output directory...')
    tf.io.gfile.rmtree(args.embed_output_dir)
  print('Creating empty output directory...')
  tf.io.gfile.makedirs(args.embed_output_dir)

  options = beam.options.pipeline_options.PipelineOptions(**vars(args))

  # Probe the module with an empty string to discover its embedding width.
  original_dim = hub.load(args.module_url)(['']).shape[1]
  random_projection_matrix = generate_random_projection_weights(
      original_dim, args.projected_dim, args.embed_output_dir)

  print('Starting the Beam pipeline...')
  with beam.Pipeline(runner=_RUNNER, options=options) as pipeline:
    _ = (
        pipeline
        | 'Read sentences from files' >> beam.io.ReadFromText(
            file_pattern=args.data_file_pattern)
        # BatchElements takes integer bounds; use floor division so the
        # minimum batch size stays an int (true division yields a float).
        | 'Batch elements' >> util.BatchElements(
            min_batch_size=_BATCH_SIZE // 2, max_batch_size=_BATCH_SIZE)
        | 'Generate embeddings' >> beam.Map(
            generate_embeddings, args.module_url, random_projection_matrix)
        | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
        | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
            file_path_prefix='{}/emb'.format(args.embed_output_dir),
            file_name_suffix='.tfrecords')
    )
  print('Beam pipeline completed.')
def test_constant_batch(self):
  # Assumes a single bundle...
  with TestPipeline() as p:
    lengths = (
        p
        | beam.Create(range(35))
        | util.BatchElements(min_batch_size=10, max_batch_size=10)
        | beam.Map(len))
    # 35 elements in fixed batches of 10 -> three full batches plus a tail.
    assert_that(lengths, equal_to([10, 10, 10, 5]))
def test_grows_to_max_batch(self):
  # Assumes a single bundle...
  with TestPipeline() as p:
    lengths = (
        p
        | beam.Create(range(164))
        | util.BatchElements(
            min_batch_size=1, max_batch_size=50, clock=FakeClock())
        | beam.Map(len))
    # Batch size doubles from the minimum until it is capped at the maximum.
    assert_that(lengths, equal_to([1, 1, 2, 4, 8, 16, 32, 50, 50]))
def expand(self, pvalue):
  """Expands the transform: pair, batch, then annotate the images."""
  pairs = pvalue | FlatMap(self._create_image_annotation_pairs)
  batched = pairs | util.BatchElements(
      min_batch_size=self.min_batch_size,
      max_batch_size=self.max_batch_size)
  annotate_fn = _ImageAnnotateFn(
      features=self.features,
      retry=self.retry,
      timeout=self.timeout,
      client_options=self.client_options,
      metadata=self.metadata)
  return batched | ParDo(annotate_fn)
def test_windowed_batches(self):
  # Assumes a single bundle, in order...
  with TestPipeline() as p:
    lengths = (
        p
        | beam.Create(range(47))
        | beam.Map(lambda e: window.TimestampedValue(e, e))
        | beam.WindowInto(window.FixedWindows(30))
        | util.BatchElements(
            min_batch_size=5, max_batch_size=10, clock=FakeClock())
        | beam.Map(len))
    # Batches never span window boundaries, so each window flushes its own
    # (possibly short) final batch.
    assert_that(
        lengths,
        equal_to([
            5, 5, 10, 10,  # elements in [0, 30)
            10, 7,  # elements in [30, 47)
        ]))