def expand(self, blog_model_pcoll):
    return (
        blog_model_pcoll
        | 'Discard models with empty property value' >> (
            beam.Filter(lambda model: self.get_property_value(model) != ''))
        | 'Generate (%s, model) key value pairs' % self._property_name >> (
            beam.WithKeys(self.get_property_value))  # pylint: disable=no-value-for-parameter
        | 'Group pairs by their %s' % self._property_name >> (
            beam.GroupByKey())
        | 'Discard %s key' % self._property_name >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Discard models with unique %s' % self._property_name >> (
            beam.Filter(lambda models: len(models) > 1)))
def expand(self, results):
    """Writes the given job results to the NDB datastore."""
    return (
        results
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(self.create_beam_job_run_result_model)
        | ndb_io.PutModels(self.datastoreio_stub))
def run(options: pipeline_options.PipelineOptions):
    p = beam.Pipeline(options=options)

    # Read in the CSV file.
    input_data = read_data(
        p, options.view_as(CovidTrackingPipelineOptions).input_file)

    # Analyze the data: find columns that are present in every row, and
    # columns that aren't.
    column_information = beam.pvalue.AsSingleton(
        input_data | covidpipe.datasource.FindEmptyAndNonEmptyColumns())

    # Get columns from our rows, and also ensure we get the main columns of
    # interest.
    full_data = select_wanted_columns(
        input_data, column_information, ['positive', 'negative'])

    # Filter out data points without a 'positive' field, because they are not
    # valuable for our analysis.
    filtered_data = full_data | 'FilterMissingPositive' >> beam.Filter(
        lambda x: 'positive' in x)

    # For each state, get an iterable of its data points.
    per_state_iterables = (
        filtered_data
        | beam.WithKeys(lambda x: x['state'])
        | beam.GroupByKey()
        | beam.Values())

    # Find 7-day spikes per state.
    state_spikes = (per_state_iterables | beam.ParDo(FindStateSpikesFn()))

    # Write spikes to an output.
    (state_spikes
     | beam.Map(json.dumps)
     | beam.io.WriteToText(
         options.view_as(CovidTrackingPipelineOptions).spikes_output_file))

    result = p.run()
def run(
    input_subscription: str,
    output_table: str,
    window_interval_sec: int = 60,
    beam_args: List[str] = None,
) -> None:
    """Build and run the pipeline."""
    options = PipelineOptions(beam_args, save_main_session=True, streaming=True)

    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | "Read from Pub/Sub" >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | "UTF-8 bytes to string" >> beam.Map(lambda msg: msg.decode("utf-8"))
            | "Parse JSON messages" >> beam.Map(parse_json_message)
            | "Fixed-size windows" >> beam.WindowInto(
                window.FixedWindows(window_interval_sec, 0))
            | "Add URL keys" >> beam.WithKeys(lambda msg: msg["url"])
            | "Group by URLs" >> beam.GroupByKey()
            | "Get statistics" >> beam.MapTuple(
                lambda url, messages: {
                    "url": url,
                    "num_reviews": len(messages),
                    "score": sum(msg["score"] for msg in messages) / len(messages),
                    "first_date": min(msg["processing_time"] for msg in messages),
                    "last_date": max(msg["processing_time"] for msg in messages),
                }))

        # Output the results into BigQuery table.
        _ = messages | "Write to Big Query" >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
def expand(self, entities):
    """Writes the given job results to the NDB datastore.

    This overrides expand from parent class.

    Args:
        entities: PCollection. Models, can also contain just one model.

    Returns:
        PCollection. An empty PCollection.
    """
    return (
        entities
        # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
        # different signature than the one it's defined with.
        | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
        # GroupIntoBatches() requires (key, value) pairs as input, so we
        # give everything None keys and then immediately discard them.
        | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
        | beam.Values()
        | beam.FlatMap(job_run_result.JobRunResult.accumulate)
        | beam.Map(
            self.create_beam_job_run_result_model,
            entities.pipeline.options.namespace)
        | ndb_io.PutModels())
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of results from the skill migration.

    Returns:
        PCollection. A PCollection of results from the skill migration.
    """
    unmigrated_skill_models = (
        self.pipeline
        | 'Get all non-deleted skill models' >> (
            ndb_io.GetModels(skill_models.SkillModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add skill model ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
            lambda skill_model: skill_model.id))
    skill_summary_models = (
        self.pipeline
        | 'Get all non-deleted skill summary models' >> (
            ndb_io.GetModels(skill_models.SkillSummaryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add skill summary ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
            lambda skill_summary_model: skill_summary_model.id))

    migrated_skill_results = (
        unmigrated_skill_models
        | 'Transform and migrate model' >> beam.MapTuple(self._migrate_skill))
    migrated_skills = (
        migrated_skill_results
        | 'Filter oks' >> beam.Filter(
            lambda result_item: result_item.is_ok())
        | 'Unwrap ok' >> beam.Map(
            lambda result_item: result_item.unwrap()))
    migrated_skill_job_run_results = (
        migrated_skill_results
        | 'Generate results for migration' >> (
            job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED')))

    skill_changes = (
        unmigrated_skill_models
        | 'Generate skill changes' >> beam.FlatMapTuple(
            self._generate_skill_changes))

    skill_objects_list = (
        {
            'skill_model': unmigrated_skill_models,
            'skill_summary_model': skill_summary_models,
            'skill': migrated_skills,
            'skill_changes': skill_changes
        }
        | 'Merge objects' >> beam.CoGroupByKey()
        | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Remove unmigrated skills' >> beam.Filter(
            lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0)
        | 'Reorganize the skill objects' >> beam.Map(lambda objects: {
            'skill_model': objects['skill_model'][0],
            'skill_summary_model': objects['skill_summary_model'][0],
            'skill': objects['skill'][0],
            'skill_changes': objects['skill_changes']
        }))

    skill_objects_list_job_run_results = (
        skill_objects_list
        | 'Transform skill objects into job run results' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'SKILL MIGRATED')))

    cache_deletion_job_run_results = (
        skill_objects_list
        | 'Delete skill from cache' >> beam.Map(
            lambda skill_object: self._delete_skill_from_cache(
                skill_object['skill']))
        | 'Generate results for cache deletion' >> (
            job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

    skill_models_to_put = (
        skill_objects_list
        | 'Generate skill models to put' >> beam.FlatMap(
            lambda skill_objects: self._update_skill(
                skill_objects['skill_model'],
                skill_objects['skill'],
                skill_objects['skill_changes'],
            )))
    skill_summary_models_to_put = (
        skill_objects_list
        | 'Generate skill summary models to put' >> beam.Map(
            lambda skill_objects: self._update_skill_summary(
                skill_objects['skill'],
                skill_objects['skill_summary_model'])))

    unused_put_results = (
        (skill_models_to_put, skill_summary_models_to_put)
        | 'Merge models' >> beam.Flatten()
        | 'Put models into the datastore' >> ndb_io.PutModels())

    return (
        (
            cache_deletion_job_run_results,
            migrated_skill_job_run_results,
            skill_objects_list_job_run_results
        )
        | beam.Flatten())
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: WithKeys
#   description: Task from katas to convert each fruit name into a KV of its
#     first letter and itself.
#   multifile: false
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

  (p | beam.Create(['apple', 'banana', 'cherry', 'durian', 'guava', 'melon'])
     | beam.WithKeys(lambda word: word[0:1])
     | LogElements())
def run(
    project: str,
    region: str,
    cloud_storage_path: str,
    bigquery_dataset: str,
    bigquery_table: str,
    ai_platform_name_prefix: str,
    min_images_per_class: int,
    max_images_per_class: int,
    budget_milli_node_hours: int,
    pipeline_options: Optional[PipelineOptions] = None,
) -> None:
    """Creates a balanced dataset and signals AI Platform to train a model.

    Args:
        project: Google Cloud Project ID.
        region: Location for AI Platform resources.
        cloud_storage_path: Cloud Storage path where the dataset CSV file is written.
        bigquery_dataset: Dataset ID for the images database, the dataset must exist.
        bigquery_table: Table ID for the images database, the table must exist.
        ai_platform_name_prefix: Name prefix for AI Platform resources.
        min_images_per_class: Minimum number of images required per class for training.
        max_images_per_class: Maximum number of images allowed per class for training.
        budget_milli_node_hours: Training budget.
        pipeline_options: PipelineOptions for Apache Beam.
    """
    with beam.Pipeline(options=pipeline_options) as pipeline:
        images = (
            pipeline
            | "Read images info" >> beam.io.ReadFromBigQuery(
                dataset=bigquery_dataset, table=bigquery_table)
            | "Key by category" >> beam.WithKeys(lambda x: x["category"])
            | "Random samples" >> beam.combiners.Sample.FixedSizePerKey(
                max_images_per_class)
            | "Remove key" >> beam.Values()
            | "Discard small samples" >> beam.Filter(
                lambda sample: len(sample) >= min_images_per_class)
            | "Flatten elements" >> beam.FlatMap(lambda sample: sample)
            | "Get image" >> beam.FlatMap(get_image, cloud_storage_path))

        dataset_csv_filename = f"{cloud_storage_path}/dataset.csv"
        dataset_csv_file = (
            pipeline
            | "Dataset filename" >> beam.Create([dataset_csv_filename])
            | "Write dataset file" >> beam.Map(
                write_dataset_csv_file, images=beam.pvalue.AsIter(images)))

        if ai_platform_name_prefix:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            (dataset_csv_file
             | "Create dataset" >> beam.Map(
                 create_dataset,
                 project=project,
                 region=region,
                 dataset_name=f"{ai_platform_name_prefix}_{timestamp}",
             )
             | "Import images" >> beam.MapTuple(import_images_to_dataset)
             | "Train model" >> beam.Map(
                 train_model,
                 project=project,
                 region=region,
                 model_name=f"{ai_platform_name_prefix}_{timestamp}",
                 budget_milli_node_hours=budget_milli_node_hours,
             ))
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')
  row_conflicts = Metrics.counter('main', 'row_conflicts')
  multiple_histograms_for_row = Metrics.counter(
      'main', 'multiple_histograms_for_row')
  orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   master STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING,
   test STRING NOT NULL,
   properties STRING,
   sample_values ARRAY<FLOAT64>)
  PARTITION BY DATE(`timestamp`)
  CLUSTER BY master, bot, measurement;
  """  # pylint: disable=pointless-string-statement

  bq_row_schema = {
      'fields': [
          {'name': 'revision', 'type': 'INT64', 'mode': 'REQUIRED'},
          {'name': 'value', 'type': 'FLOAT', 'mode': 'REQUIRED'},
          {'name': 'std_error', 'type': 'FLOAT', 'mode': 'NULLABLE'},
          {'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
          {'name': 'master', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'bot', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'measurement', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'properties', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'sample_values', 'type': 'FLOAT', 'mode': 'REPEATED'},
      ]
  }

  def RowEntityToRowDict(entity):
    entities_read.inc()
    try:
      d = {
          'revision': entity.key.id,
          'value': FloatHack(entity['value']),
          'std_error': FloatHack(entity.get('error')),
          'timestamp': entity['timestamp'].isoformat(),
          'test': entity.key.parent.name,
      }
      # Add the expando properties as a JSON-encoded dict.
      properties = {}
      for key, value in entity.items():
        if key in d or key in ['parent_test', 'error']:
          # skip properties with dedicated columns.
          continue
        if isinstance(value, float):
          value = FloatHack(value)
        properties[key] = value
      d['properties'] = json.dumps(properties) if properties else None
      # Add columns derived from test: master, bot.
      test_path_parts = d['test'].split('/', 2)
      if len(test_path_parts) >= 3:
        d['master'] = test_path_parts[0]
        d['bot'] = test_path_parts[1]
        d['measurement'] = '/'.join(test_path_parts[2:])
      return [d]
    except KeyError:
      logging.getLogger().exception('Failed to convert Row')
      failed_entity_transforms.inc()
      return []

  row_query_params = dict(project=project, kind='Row')
  row_entities = (
      p
      | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
          row_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  row_dicts = (
      row_entities | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

  # The sample_values are not found in the Row entity. So we have to fetch all
  # the corresponding Histogram entities and join them with our collection of
  # Rows (by using test + revision as the join key). We also need to unpack
  # the sample values arrays out of the zlib-compressed JSON stored in the
  # Histogram's "data" property.
  def HistogramEntityToDict(entity):
    """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
    entities_read.inc()
    try:
      data = entity['data']
    except KeyError:
      logging.getLogger().exception('Histogram missing "data" field')
      failed_entity_transforms.inc()
      return []
    try:
      json_str = zlib.decompress(data)
    except zlib.error:
      logging.getLogger().exception('Histogram data not valid zlib: %r', data)
      failed_entity_transforms.inc()
      return []
    try:
      data_dict = json.loads(json_str)
    except json.JSONDecodeError:
      logging.getLogger().exception('Histogram data not valid json.')
      failed_entity_transforms.inc()
      return []
    sample_values = data_dict.get('sampleValues', [])
    if not isinstance(sample_values, list):
      logging.getLogger().exception(
          'Histogram data.sampleValues not valid list.')
      failed_entity_transforms.inc()
      return []
    count = len(sample_values)
    sample_values = [v for v in sample_values if v is not None]
    if len(sample_values) != count:
      logging.getLogger().warn(
          'Histogram data.sampleValues contains null: %r', entity.key)
    for v in sample_values:
      if not isinstance(v, (int, float)):
        logging.getLogger().exception(
            'Histogram data.sampleValues contains non-numeric: %r', v)
        failed_entity_transforms.inc()
        return []
    try:
      return [{
          'test': entity['test'].name,
          'revision': entity['revision'],
          'sample_values': sample_values,
      }]
    except KeyError:
      logging.getLogger().exception(
          'Histogram missing test or revision field/s')
      failed_entity_transforms.inc()
      return []

  histogram_query_params = dict(project=project, kind='Histogram')
  histogram_entities = (
      p
      | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
          histogram_query_params,
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          step=datetime.timedelta(minutes=5)))

  histogram_dicts = (
      histogram_entities
      | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

  def TestRevision(element):
    return (element['test'], element['revision'])

  rows_with_key = (
      row_dicts | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
  histograms_with_key = (
      histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

  def MergeRowAndSampleValues(element):
    group_key, join_values = element
    rows, histograms = join_values
    if len(rows) == 0:
      orphaned_histogram.inc()
      logging.getLogger().error("No Row for Histogram(s) (%r)", group_key)
      return []
    elif len(rows) > 1:
      row_conflicts.inc()
      logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                group_key)
      return rows
    row = rows[0]
    if len(histograms) > 1:
      # We'll merge these, so this isn't an error.
      multiple_histograms_for_row.inc()
    elif len(histograms) == 0:
      # No sample values to annotate the row with. This is common.
      return [row]
    # Merge multiple histogram's values into a single row.
    row['sample_values'] = list(
        itertools.chain.from_iterable(h['sample_values'] for h in histograms))
    return [row]

  joined_and_annotated = (
      (rows_with_key, histograms_with_key)
      | beam.CoGroupByKey()
      | beam.FlatMap(MergeRowAndSampleValues))

  def TableNameFn(unused_element):
    return '{project}:{dataset}.rows{suffix}'.format(
        project=project,
        dataset=bq_export_options.dataset.get(),
        suffix=bq_export_options.table_suffix)

  _ = (joined_and_annotated
       | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
           TableNameFn,
           bq_row_schema,
           additional_bq_parameters={
               'clustering': {'fields': ['master', 'bot', 'measurement']}
           }))

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
    generating ExplorationOpportunitySummaryModel.

    Returns:
        PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.
    """
    topics = (
        self.pipeline
        | 'Get all non-deleted topic models' >> (
            ndb_io.GetModels(
                topic_models.TopicModel.get_all(include_deleted=False)))
        | 'Get topic from model' >> beam.Map(
            topic_fetchers.get_topic_from_model)
    )

    story_ids_to_story = (
        self.pipeline
        | 'Get all non-deleted story models' >> ndb_io.GetModels(
            story_models.StoryModel.get_all(include_deleted=False))
        | 'Get story from model' >> beam.Map(
            story_fetchers.get_story_from_model)
        | 'Combine stories and ids' >> beam.Map(
            lambda story: (story.id, story))
    )

    exp_ids_to_exp = (
        self.pipeline
        | 'Get all non-deleted exp models' >> ndb_io.GetModels(
            exp_models.ExplorationModel.get_all(include_deleted=False))
        | 'Get exploration from model' >> beam.Map(
            exp_fetchers.get_exploration_from_model)
        | 'Combine exploration and ids' >> beam.Map(
            lambda exp: (exp.id, exp))
    )

    stories_dict = beam.pvalue.AsDict(story_ids_to_story)
    exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

    opportunities_results = (
        topics
        | beam.Map(
            self._generate_opportunities_related_to_topic,
            stories_dict=stories_dict,
            exps_dict=exps_dict)
    )

    unused_put_result = (
        opportunities_results
        | 'Filter the results with SUCCESS status' >> beam.Filter(
            lambda result: result.is_ok())
        | 'Fetch the models to be put' >> beam.FlatMap(
            lambda result: result.unwrap())
        | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id)  # pylint: disable=no-value-for-parameter
        | 'Allow only one item per key' >> (
            beam.combiners.Sample.FixedSizePerKey(1))
        | 'Remove the IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Flatten the list of lists of models' >> beam.FlatMap(lambda x: x)
        | 'Put models into the datastore' >> ndb_io.PutModels()
    )

    return (
        opportunities_results
        | 'Count the output' >> (
            job_result_transforms.ResultsToJobRunResults())
    )
def main(argv=None):

    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (p
            | 'ReadData' >> beam.io.ReadFromPubSub(
                topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(
             early=tr.Repeatedly(
                 tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
             late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >> beam.Map(
         lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >> beam.Map(
         lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >> beam.Map(
         lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
    """Returns a PCollection of results from the story migration.

    Returns:
        PCollection. A PCollection of results from the story migration.
    """
    unmigrated_story_models = (
        self.pipeline
        | 'Get all non-deleted story models' >> (
            ndb_io.GetModels(story_models.StoryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add story keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
            lambda story_model: story_model.id)
    )
    story_summary_models = (
        self.pipeline
        | 'Get all non-deleted story summary models' >> (
            ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add story summary keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
            lambda story_summary_model: story_summary_model.id)
    )
    topics = (
        self.pipeline
        | 'Get all non-deleted topic models' >> (
            ndb_io.GetModels(topic_models.TopicModel.get_all()))
        | 'Transform model into domain object' >> beam.Map(
            topic_fetchers.get_topic_from_model)
        # Pylint disable is needed because pylint is not able to correctly
        # detect that the value is passed through the pipe.
        | 'Add topic keys' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
            lambda topic: topic.id)
    )
    topic_id_to_topic = beam.pvalue.AsDict(topics)

    migrated_story_results = (
        unmigrated_story_models
        | 'Transform and migrate model' >> beam.MapTuple(
            self._migrate_story, topic_id_to_topic=topic_id_to_topic)
    )
    migrated_stories = (
        migrated_story_results
        | 'Filter oks' >> beam.Filter(
            lambda result_item: result_item.is_ok())
        | 'Unwrap ok' >> beam.Map(
            lambda result_item: result_item.unwrap())
    )
    migrated_story_job_run_results = (
        migrated_story_results
        | 'Generate results for migration' >> (
            job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))
    )

    story_changes = (
        unmigrated_story_models
        | 'Generate story changes' >> beam.FlatMapTuple(
            self._generate_story_changes)
    )

    story_objects_list = (
        {
            'story_model': unmigrated_story_models,
            'story_summary_model': story_summary_models,
            'story': migrated_stories,
            'story_change': story_changes
        }
        | 'Merge objects' >> beam.CoGroupByKey()
        | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
        | 'Remove unmigrated stories' >> beam.Filter(
            lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
        | 'Reorganize the story objects' >> beam.Map(lambda objects: {
            'story_model': objects['story_model'][0],
            'story_summary_model': objects['story_summary_model'][0],
            'story': objects['story'][0],
            'story_change': objects['story_change'][0]
        })
    )

    story_objects_list_job_run_results = (
        story_objects_list
        | 'Transform story objects into job run results' >> (
            job_result_transforms.CountObjectsToJobRunResult(
                'STORY MIGRATED'))
    )

    cache_deletion_job_run_results = (
        story_objects_list
        | 'Delete story from cache' >> beam.Map(
            lambda story_objects: self._delete_story_from_cache(
                story_objects['story']))
        | 'Generate results for cache deletion' >> (
            job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))
    )

    story_models_to_put = (
        story_objects_list
        | 'Generate story models to put' >> beam.FlatMap(
            lambda story_objects: self._update_story(
                story_objects['story_model'],
                story_objects['story'],
                story_objects['story_change'],
            ))
    )
    story_summary_models_to_put = (
        story_objects_list
        | 'Generate story summary models to put' >> beam.Map(
            lambda story_objects: self._update_story_summary(
                story_objects['story'],
                story_objects['story_summary_model']))
    )

    unused_put_results = (
        (story_models_to_put, story_summary_models_to_put)
        | 'Merge models' >> beam.Flatten()
        | 'Put models into the datastore' >> ndb_io.PutModels()
    )

    return (
        (
            cache_deletion_job_run_results,
            migrated_story_job_run_results,
            story_objects_list_job_run_results
        )
        | beam.Flatten()
    )