def _run_group_into_batches_and_get_step_properties(
    self, with_sharded_key, additional_properties):
  self.default_properties.append('--streaming')
  self.default_properties.append(
      '--experiment=enable_streaming_auto_sharding')
  for property in additional_properties:
    self.default_properties.append(property)

  runner = DataflowRunner()
  with beam.Pipeline(runner=runner,
                     options=PipelineOptions(self.default_properties)) as p:
    # pylint: disable=expression-not-assigned
    input = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
    if with_sharded_key:
      (
          input
          | beam.GroupIntoBatches.WithShardedKey(2)
          | beam.Map(lambda key_values: (key_values[0].key, key_values[1])))
      step_name = (
          u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)')
    else:
      input | beam.GroupIntoBatches(2)
      step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'

  return self._find_step(runner.job, step_name)['properties']
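For reference, the same transform shape can be exercised outside the Dataflow test harness. A minimal sketch, assuming Beam 2.26+ (where GroupIntoBatches.WithShardedKey is available) and a runner with stateful-processing support such as the default DirectRunner; the sample data is made up:

import apache_beam as beam

with beam.Pipeline() as p:  # DirectRunner by default
  (
      p
      | beam.Create([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
      # Each output element is (ShardedKey, batch); the runner may split a
      # hot key across shards, so recover the original key via .key.
      | beam.GroupIntoBatches.WithShardedKey(2)
      | beam.Map(lambda kv: (kv[0].key, kv[1]))
      | beam.Map(print))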
def expand(
    self, results: beam.PCollection[job_run_result.JobRunResult]
) -> beam.pvalue.PDone:
    """Writes the given job results to the NDB datastore.

    This overrides expand from parent class.

    Args:
        results: PCollection. The job run results to be written; may
            contain just a single result.

    Returns:
        PCollection. An empty PCollection.
    """
    return (
        results
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()  # pylint: disable=no-value-for-parameter
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(
            self.create_beam_job_run_result_model,
            results.pipeline.options.namespace)
        | ndb_io.PutModels())
def expand(self, results):
    """Writes the given job results to the NDB datastore."""
    return (
        results
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(self.create_beam_job_run_result_model)
        | ndb_io.PutModels(self.datastoreio_stub))
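Both expand() variants rely on the same trick: GroupIntoBatches() only accepts (key, value) pairs, so every element is keyed with None and the dummy keys are discarded right after batching. A minimal sketch of just that pattern, with BATCH_SIZE standing in for _MAX_RESULT_INSTANCES_PER_MODEL and plain strings standing in for job results:

import apache_beam as beam

BATCH_SIZE = 2  # stand-in for _MAX_RESULT_INSTANCES_PER_MODEL

with beam.Pipeline() as p:
    (
        p
        | beam.Create(['r1', 'r2', 'r3', 'r4', 'r5'])
        # Key everything with None so GroupIntoBatches gets keyed input...
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        | beam.GroupIntoBatches(BATCH_SIZE)
        # ...then drop the dummy keys, leaving batches of up to BATCH_SIZE.
        | beam.Values()
        | beam.Map(print))  # prints ['r1', 'r2'], ['r3', 'r4'], ['r5']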
def groupintobatches(test=None):
  # [START groupintobatches]
  import apache_beam as beam

  with beam.Pipeline() as pipeline:
    batches_with_keys = (
        pipeline
        | 'Create produce' >> beam.Create([
            ('spring', '🍓'),
            ('spring', '🥕'),
            ('spring', '🍆'),
            ('spring', '🍅'),
            ('summer', '🥕'),
            ('summer', '🍅'),
            ('summer', '🌽'),
            ('fall', '🥕'),
            ('fall', '🍅'),
            ('winter', '🍆'),
        ])
        | 'Group into batches' >> beam.GroupIntoBatches(3)
        | beam.Map(print))
    # [END groupintobatches]
    if test:
      test(batches_with_keys)
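Run directly (e.g. groupintobatches()), the snippet prints one (key, batch) tuple per emitted batch, with at most three values per batch. Sample output; how the four 'spring' elements split into batches depends on arrival order, so the exact grouping can vary between runs:

# ('spring', ['🍓', '🥕', '🍆'])
# ('spring', ['🍅'])
# ('summer', ['🥕', '🍅', '🌽'])
# ('fall', ['🥕', '🍅'])
# ('winter', ['🍆'])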