Example #1
    def expand(self, blog_model_pcoll):
        return (
            blog_model_pcoll
            | 'Discard models with empty property value' >>
            (beam.Filter(lambda model: self.get_property_value(model) != ''))
            | 'Generate (%s, model) key value pairs' % self._property_name >>
            (beam.WithKeys(self.get_property_value))  # pylint: disable=no-value-for-parameter
            | 'Group pairs by their %s' % self._property_name >>
            (beam.GroupByKey())
            | 'Discard %s key' % self._property_name >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Discard models with unique %s' % self._property_name >>
            (beam.Filter(lambda models: len(models) > 1)))
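The expand() above is a generic duplicate-detection shape: key every element by a property value, group by that key, drop the keys, and keep only groups with more than one member. A minimal, self-contained sketch of the same shape on toy data (the words and pipeline below are illustrative, not taken from the job above):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(['apple', 'avocado', 'banana', 'cherry', 'coconut'])
        # Key every word by its first letter, e.g. ('a', 'apple').
        | beam.WithKeys(lambda word: word[0])
        # Collect the words that share a key: ('a', ['apple', 'avocado']), ...
        | beam.GroupByKey()
        # Drop the keys, keep the grouped values as lists.
        | beam.Values()
        | beam.Map(list)
        # Keep only the groups with more than one member, i.e. the duplicates.
        | beam.Filter(lambda words: len(words) > 1)
        | beam.Map(print))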
Example #2
    def expand(self, results):
        """Writes the given job results to the NDB datastore."""
        return (
            results
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model)
            | ndb_io.PutModels(self.datastoreio_stub))
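The WithKeys(None) / GroupIntoBatches() / Values() trio above is a common way to batch a PCollection that has no natural key. A stripped-down sketch of just that step, assuming nothing beyond the Beam SDK:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(range(10))
        # GroupIntoBatches() needs (key, value) pairs, so give every element
        # the same dummy key...
        | beam.WithKeys(None)
        # ...collect up to 4 values per key into a batch...
        | beam.GroupIntoBatches(4)
        # ...and immediately discard the dummy key, keeping only the batches.
        | beam.Values()
        | beam.Map(print))  # e.g. [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]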
Example #3
def run(options: pipeline_options.PipelineOptions):

    p = beam.Pipeline(options=options)

    # Read in the CSV file
    input_data = read_data(
        p,
        options.view_as(CovidTrackingPipelineOptions).input_file)

    # Analyze the data: find the columns that are present in every row, and the
    # columns that aren't.
    column_information = beam.pvalue.AsSingleton(
        input_data
        | covidpipe.datasource.FindEmptyAndNonEmptyColumns())

    # Get columns from our rows, and also ensure we get the main columns of
    # interest.
    full_data = select_wanted_columns(input_data, column_information,
                                      ['positive', 'negative'])

    # Filter out data points without 'positive' field. This is because they are
    # not valuable for our analysis.
    filtered_data = full_data | 'FilterMissingPositive' >> beam.Filter(
        lambda x: 'positive' in x)

    # For each state, get an iterable of its data points.
    per_state_iterables = (filtered_data
                           | beam.WithKeys(lambda x: x['state'])
                           | beam.GroupByKey()
                           | beam.Values())

    # Find 7-day spikes per state
    state_spikes = (per_state_iterables | beam.ParDo(FindStateSpikesFn()))

    # Write spikes to an output
    (state_spikes
     | beam.Map(json.dumps)
     | beam.io.WriteToText(
         options.view_as(CovidTrackingPipelineOptions).spikes_output_file))

    result = p.run()
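beam.pvalue.AsSingleton above turns the one-element column-information PCollection into a side input that select_wanted_columns can read alongside the main input. A minimal sketch of a singleton side input driving a Map step (the column names and records below are invented for illustration):

import apache_beam as beam

with beam.Pipeline() as p:
    # Single-element PCollection exposed to other transforms as a side input.
    wanted_columns = beam.pvalue.AsSingleton(
        p | 'Columns' >> beam.Create([['positive', 'negative']]))

    _ = (
        p
        | 'Rows' >> beam.Create([
            {'state': 'CA', 'positive': 10, 'extra': 1},
            {'state': 'NY', 'negative': 3},
        ])
        # The side input arrives as the extra argument to the lambda.
        | beam.Map(
            lambda row, cols: {k: v for k, v in row.items() if k in cols},
            wanted_columns)
        | beam.Map(print))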
Example #4
def run(
    input_subscription: str,
    output_table: str,
    window_interval_sec: int = 60,
    beam_args: List[str] = None,
) -> None:
    """Build and run the pipeline."""
    options = PipelineOptions(beam_args,
                              save_main_session=True,
                              streaming=True)

    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | "Read from Pub/Sub" >> beam.io.ReadFromPubSub(
                subscription=input_subscription).with_output_types(bytes)
            | "UTF-8 bytes to string" >>
            beam.Map(lambda msg: msg.decode("utf-8"))
            | "Parse JSON messages" >> beam.Map(parse_json_message)
            | "Fixed-size windows" >> beam.WindowInto(
                window.FixedWindows(window_interval_sec, 0))
            | "Add URL keys" >> beam.WithKeys(lambda msg: msg["url"])
            | "Group by URLs" >> beam.GroupByKey()
            | "Get statistics" >> beam.MapTuple(
                lambda url, messages: {
                    "url": url,
                    "num_reviews": len(messages),
                    "score": sum(msg["score"]
                                 for msg in messages) / len(messages),
                    "first_date": min(msg["processing_time"]
                                      for msg in messages),
                    "last_date": max(msg["processing_time"]
                                     for msg in messages),
                }))

        # Output the results into BigQuery table.
        _ = messages | "Write to Big Query" >> beam.io.WriteToBigQuery(
            output_table, schema=SCHEMA)
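Example #4's core is a streaming aggregation: window the messages, key them by URL, group, and reduce each group to one statistics dict. The same shape can be exercised on bounded toy data by attaching timestamps explicitly (the records and timestamps below are made up):

import apache_beam as beam
from apache_beam.transforms import window

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('a.com', 4, 0), ('a.com', 2, 30), ('b.com', 5, 70)])
        # Attach event-time timestamps so FixedWindows has something to act on.
        | beam.Map(lambda t: window.TimestampedValue(
            {'url': t[0], 'score': t[1]}, t[2]))
        | beam.WindowInto(window.FixedWindows(60))
        | beam.WithKeys(lambda msg: msg['url'])
        | beam.GroupByKey()
        | beam.MapTuple(lambda url, msgs: {
            'url': url,
            'num_reviews': len(list(msgs)),
        })
        | beam.Map(print))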
Example #5
    def expand(self, entities):
        """Writes the given job results to the NDB datastore.

        This overrides expand from parent class.

        Args:
            entities: PCollection. Models, can also contain just one model.

        Returns:
            PCollection. An empty PCollection.
        """
        return (
            entities
            # NOTE: Pylint is wrong. WithKeys() is a decorated function with a
            # different signature than the one it's defined with.
            | beam.WithKeys(None)  # pylint: disable=no-value-for-parameter
            # GroupIntoBatches() requires (key, value) pairs as input, so we
            # give everything None keys and then immediately discard them.
            | beam.GroupIntoBatches(self._MAX_RESULT_INSTANCES_PER_MODEL)
            | beam.Values()
            | beam.FlatMap(job_run_result.JobRunResult.accumulate)
            | beam.Map(self.create_beam_job_run_result_model,
                       entities.pipeline.options.namespace)
            | ndb_io.PutModels())
Example #6
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the skill migration.

        Returns:
            PCollection. A PCollection of results from the skill migration.
        """
        unmigrated_skill_models = (
            self.pipeline
            | 'Get all non-deleted skill models' >>
            (ndb_io.GetModels(skill_models.SkillModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill model ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_model: skill_model.id))
        skill_summary_models = (
            self.pipeline
            | 'Get all non-deleted skill summary models' >>
            (ndb_io.GetModels(skill_models.SkillSummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add skill summary ID' >> beam.WithKeys(  # pylint: disable=no-value-for-parameter
                lambda skill_summary_model: skill_summary_model.id))

        migrated_skill_results = (unmigrated_skill_models
                                  | 'Transform and migrate model' >>
                                  beam.MapTuple(self._migrate_skill))
        migrated_skills = (
            migrated_skill_results
            | 'Filter oks' >> beam.Filter(
                lambda result_item: result_item.is_ok())
            | 'Unwrap ok' >> beam.Map(
                lambda result_item: result_item.unwrap()))
        migrated_skill_job_run_results = (
            migrated_skill_results
            | 'Generate results for migration' >>
            (job_result_transforms.ResultsToJobRunResults('SKILL PROCESSED')))

        skill_changes = (unmigrated_skill_models
                         | 'Generate skill changes' >> beam.FlatMapTuple(
                             self._generate_skill_changes))

        skill_objects_list = (
            {
                'skill_model': unmigrated_skill_models,
                'skill_summary_model': skill_summary_models,
                'skill': migrated_skills,
                'skill_changes': skill_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated skills' >> beam.Filter(
                lambda x: len(x['skill_changes']) > 0 and len(x['skill']) > 0)
            | 'Reorganize the skill objects' >> beam.Map(
                lambda objects: {
                    'skill_model': objects['skill_model'][0],
                    'skill_summary_model': objects['skill_summary_model'][0],
                    'skill': objects['skill'][0],
                    'skill_changes': objects['skill_changes']
                }))

        skill_objects_list_job_run_results = (
            skill_objects_list
            | 'Transform skill objects into job run results' >>
            (job_result_transforms.CountObjectsToJobRunResult('SKILL MIGRATED')
             ))

        cache_deletion_job_run_results = (
            skill_objects_list
            | 'Delete skill from cache' >>
            beam.Map(lambda skill_object: self._delete_skill_from_cache(
                skill_object['skill']))
            | 'Generate results for cache deletion' >>
            (job_result_transforms.ResultsToJobRunResults('CACHE DELETION')))

        skill_models_to_put = (
            skill_objects_list
            | 'Generate skill models to put' >>
            beam.FlatMap(lambda skill_objects: self._update_skill(
                skill_objects['skill_model'],
                skill_objects['skill'],
                skill_objects['skill_changes'],
            )))

        skill_summary_models_to_put = (
            skill_objects_list
            | 'Generate skill summary models to put' >>
            beam.Map(lambda skill_objects: self._update_skill_summary(
                skill_objects['skill'], skill_objects['skill_summary_model'])))

        unused_put_results = (
            (skill_models_to_put, skill_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels())

        return (
            (cache_deletion_job_run_results, migrated_skill_job_run_results,
             skill_objects_list_job_run_results)
            | beam.Flatten())
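The 'Merge objects' step above relies on the dict form of CoGroupByKey: each input PCollection of (id, value) pairs is tagged with a name, and the grouped result maps every tag to the list of values that shared the id. A toy sketch of that form (the tags and values below are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
    models = p | 'Models' >> beam.Create([('id1', 'model-1')])
    summaries = p | 'Summaries' >> beam.Create([('id1', 'summary-1')])

    _ = (
        {'model': models, 'summary': summaries}
        | beam.CoGroupByKey()
        # -> ('id1', {'model': ['model-1'], 'summary': ['summary-1']})
        | beam.Values()
        | beam.Map(lambda grouped: {
            'model': grouped['model'][0],
            'summary': grouped['summary'][0],
        })
        | beam.Map(print))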
Example #7

# beam-playground:
#   name: WithKeys
#   description: Task from katas to convert each fruit name into a KV of its first letter and itself.
#   multifile: false
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:

    (p | beam.Create(['apple', 'banana', 'cherry', 'durian', 'guava', 'melon'])
     | beam.WithKeys(lambda word: word[0:1])
     | LogElements())
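A quick way to sanity-check the kata is Beam's testing utilities; the sketch below (not part of the original playground file) asserts the expected (first letter, fruit) pairs for a shorter input:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
    keyed_fruits = (
        p
        | beam.Create(['apple', 'banana', 'cherry'])
        | beam.WithKeys(lambda word: word[0:1]))
    assert_that(
        keyed_fruits,
        equal_to([('a', 'apple'), ('b', 'banana'), ('c', 'cherry')]))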
Example #8
def run(
    project: str,
    region: str,
    cloud_storage_path: str,
    bigquery_dataset: str,
    bigquery_table: str,
    ai_platform_name_prefix: str,
    min_images_per_class: int,
    max_images_per_class: int,
    budget_milli_node_hours: int,
    pipeline_options: Optional[PipelineOptions] = None,
) -> None:
    """Creates a balanced dataset and signals AI Platform to train a model.

    Args:
        project: Google Cloud Project ID.
        region: Location for AI Platform resources.
        cloud_storage_path: Cloud Storage path for the dataset CSV and image files.
        bigquery_dataset: Dataset ID for the images database, the dataset must exist.
        bigquery_table: Table ID for the images database, the table must exist.
        ai_platform_name_prefix: Name prefix for AI Platform resources.
        min_images_per_class: Minimum number of images required per class for training.
        max_images_per_class: Maximum number of images allowed per class for training.
        budget_milli_node_hours: Training budget.
        pipeline_options: PipelineOptions for Apache Beam.

    """
    with beam.Pipeline(options=pipeline_options) as pipeline:
        images = (
            pipeline
            | "Read images info" >> beam.io.ReadFromBigQuery(
                dataset=bigquery_dataset, table=bigquery_table)
            | "Key by category" >> beam.WithKeys(lambda x: x["category"])
            | "Random samples" >>
            beam.combiners.Sample.FixedSizePerKey(max_images_per_class)
            | "Remove key" >> beam.Values()
            | "Discard small samples" >>
            beam.Filter(lambda sample: len(sample) >= min_images_per_class)
            | "Flatten elements" >> beam.FlatMap(lambda sample: sample)
            | "Get image" >> beam.FlatMap(get_image, cloud_storage_path))

        dataset_csv_filename = f"{cloud_storage_path}/dataset.csv"
        dataset_csv_file = (
            pipeline
            | "Dataset filename" >> beam.Create([dataset_csv_filename])
            | "Write dataset file" >> beam.Map(
                write_dataset_csv_file, images=beam.pvalue.AsIter(images)))

        if ai_platform_name_prefix:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            (dataset_csv_file
             | "Create dataset" >> beam.Map(
                 create_dataset,
                 project=project,
                 region=region,
                 dataset_name=f"{ai_platform_name_prefix}_{timestamp}",
             )
             | "Import images" >> beam.MapTuple(import_images_to_dataset)
             | "Train model" >> beam.Map(
                 train_model,
                 project=project,
                 region=region,
                 model_name=f"{ai_platform_name_prefix}_{timestamp}",
                 budget_milli_node_hours=budget_milli_node_hours,
             ))
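The 'Key by category' / 'Random samples' / 'Remove key' / 'Discard small samples' steps above implement per-class downsampling with a minimum-size cutoff. The same idea in isolation (the categories, records, and thresholds below are invented for illustration):

import apache_beam as beam

MAX_PER_CLASS = 2
MIN_PER_CLASS = 2

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([
            {'category': 'cat', 'id': 1},
            {'category': 'cat', 'id': 2},
            {'category': 'cat', 'id': 3},
            {'category': 'dog', 'id': 4},
        ])
        | beam.WithKeys(lambda x: x['category'])
        # Keep at most MAX_PER_CLASS randomly chosen elements per category.
        | beam.combiners.Sample.FixedSizePerKey(MAX_PER_CLASS)
        | beam.Values()
        # Drop categories that ended up with too few elements to be useful.
        | beam.Filter(lambda sample: len(sample) >= MIN_PER_CLASS)
        # Unpack the surviving samples back into individual elements.
        | beam.FlatMap(lambda sample: sample)
        | beam.Map(print))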
Example #9
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')
    row_conflicts = Metrics.counter('main', 'row_conflicts')
    multiple_histograms_for_row = Metrics.counter(
        'main', 'multiple_histograms_for_row')
    orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   master STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING,
   test STRING NOT NULL,
   properties STRING,
   sample_values ARRAY<FLOAT64>)
  PARTITION BY DATE(`timestamp`)
  CLUSTER BY master, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_row_schema = {
        'fields': [
            {
                'name': 'revision',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'value',
                'type': 'FLOAT',
                'mode': 'REQUIRED'
            },
            {
                'name': 'std_error',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            },
            {
                'name': 'timestamp',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'master',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'properties',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'sample_values',
                'type': 'FLOAT',
                'mode': 'REPEATED'
            },
        ]
    }

    def RowEntityToRowDict(entity):
        entities_read.inc()
        try:
            d = {
                'revision': entity.key.id,
                'value': FloatHack(entity['value']),
                'std_error': FloatHack(entity.get('error')),
                'timestamp': entity['timestamp'].isoformat(),
                'test': entity.key.parent.name,
            }
            # Add the expando properties as a JSON-encoded dict.
            properties = {}
            for key, value in entity.items():
                if key in d or key in ['parent_test', 'error']:
                    # skip properties with dedicated columns.
                    continue
                if isinstance(value, float):
                    value = FloatHack(value)
                properties[key] = value
            d['properties'] = json.dumps(properties) if properties else None
            # Add columns derived from test: master, bot.
            test_path_parts = d['test'].split('/', 2)
            if len(test_path_parts) >= 3:
                d['master'] = test_path_parts[0]
                d['bot'] = test_path_parts[1]
                d['measurement'] = '/'.join(test_path_parts[2:])
            return [d]
        except KeyError:
            logging.getLogger().exception('Failed to convert Row')
            failed_entity_transforms.inc()
            return []

    row_query_params = dict(project=project, kind='Row')
    row_entities = (
        p
        | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
            row_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    row_dicts = (row_entities
                 | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

    # The sample_values are not found in the Row entity.  So we have to fetch all
    # the corresponding Histogram entities and join them with our collection of
    # Rows (by using test + revision as the join key).  We also need to unpack the
    # sample values arrays out of the zlib-compressed JSON stored in the
    # Histogram's "data" property.
    def HistogramEntityToDict(entity):
        """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
        entities_read.inc()
        try:
            data = entity['data']
        except KeyError:
            logging.getLogger().exception('Histogram missing "data" field')
            failed_entity_transforms.inc()
            return []
        try:
            json_str = zlib.decompress(data)
        except zlib.error:
            logging.getLogger().exception('Histogram data not valid zlib: %r',
                                          data)
            failed_entity_transforms.inc()
            return []
        try:
            data_dict = json.loads(json_str)
        except json.JSONDecodeError:
            logging.getLogger().exception('Histogram data not valid json.')
            failed_entity_transforms.inc()
            return []
        sample_values = data_dict.get('sampleValues', [])
        if not isinstance(sample_values, list):
            logging.getLogger().exception(
                'Histogram data.sampleValues not valid list.')
            failed_entity_transforms.inc()
            return []
        count = len(sample_values)
        sample_values = [v for v in sample_values if v is not None]
        if len(sample_values) != count:
            logging.getLogger().warning(
                'Histogram data.sampleValues contains null: %r', entity.key)
        for v in sample_values:
            if not isinstance(v, (int, float)):
                logging.getLogger().exception(
                    'Histogram data.sampleValues contains non-numeric: %r', v)
                failed_entity_transforms.inc()
                return []
        try:
            return [{
                'test': entity['test'].name,
                'revision': entity['revision'],
                'sample_values': sample_values,
            }]
        except KeyError:
            logging.getLogger().exception(
                'Histogram missing test or revision field/s')
            failed_entity_transforms.inc()
            return []

    histogram_query_params = dict(project=project, kind='Histogram')
    histogram_entities = (
        p
        | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
            histogram_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    histogram_dicts = (
        histogram_entities
        | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

    def TestRevision(element):
        return (element['test'], element['revision'])

    rows_with_key = (row_dicts
                     | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
    histograms_with_key = (
        histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

    def MergeRowAndSampleValues(element):
        group_key, join_values = element
        rows, histograms = join_values
        if len(rows) == 0:
            orphaned_histogram.inc()
            logging.getLogger().error("No Row for Histogram(s) (%r)",
                                      group_key)
            return []
        elif len(rows) > 1:
            row_conflicts.inc()
            logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                      group_key)
            return rows
        row = rows[0]
        if len(histograms) > 1:
            # We'll merge these, so this isn't an error.
            multiple_histograms_for_row.inc()
        elif len(histograms) == 0:
            # No sample values to annotate the row with.  This is common.
            return [row]
        # Merge multiple histogram's values into a single row.
        row['sample_values'] = list(
            itertools.chain.from_iterable(h['sample_values']
                                          for h in histograms))
        return [row]

    joined_and_annotated = ((rows_with_key, histograms_with_key)
                            | beam.CoGroupByKey()
                            | beam.FlatMap(MergeRowAndSampleValues))

    def TableNameFn(unused_element):
        return '{project}:{dataset}.rows{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (joined_and_annotated
         | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
             TableNameFn,
             bq_row_schema,
             additional_bq_parameters={
                 'clustering': {
                     'fields': ['master', 'bot', 'measurement']
                 }
             }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
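The join above hinges on CoGroupByKey over a tuple of PCollections: both collections are keyed by (test, revision), and each grouped element carries one list per input, which MergeRowAndSampleValues then reconciles. A minimal sketch of that join shape (the keys and fields below are toy values, not the chromeperf schema):

import apache_beam as beam

def merge(element):
    """Attaches sample_values from the histograms to the matching row, if any."""
    unused_key, (row_list, histogram_list) = element
    if not row_list:
        return []  # Orphaned histogram: nothing to annotate.
    row = dict(row_list[0])
    row['sample_values'] = [
        v for h in histogram_list for v in h['sample_values']]
    return [row]

with beam.Pipeline() as p:
    rows = p | 'Rows' >> beam.Create([
        (('suite/a', 1), {'value': 1.0}),
        (('suite/b', 2), {'value': 2.0}),
    ])
    histograms = p | 'Histograms' >> beam.Create([
        (('suite/a', 1), {'sample_values': [1.0, 2.0]}),
    ])

    _ = ((rows, histograms)
         | beam.CoGroupByKey()
         | beam.FlatMap(merge)
         | beam.Map(print))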
Example #10
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        generating ExplorationOpportunitySummaryModel.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            generating ExplorationOpportunitySummaryModel.
        """

        topics = (
            self.pipeline
            | 'Get all non-deleted topic models' >> (
                ndb_io.GetModels(
                    topic_models.TopicModel.get_all(include_deleted=False)))
            | 'Get topic from model' >> beam.Map(
                topic_fetchers.get_topic_from_model)
        )

        story_ids_to_story = (
            self.pipeline
            | 'Get all non-deleted story models' >> ndb_io.GetModels(
                story_models.StoryModel.get_all(include_deleted=False))
            | 'Get story from model' >> beam.Map(
                story_fetchers.get_story_from_model)
            | 'Combine stories and ids' >> beam.Map(
                lambda story: (story.id, story))
        )

        exp_ids_to_exp = (
            self.pipeline
            | 'Get all non-deleted exp models' >> ndb_io.GetModels(
                exp_models.ExplorationModel.get_all(include_deleted=False))
            | 'Get exploration from model' >> beam.Map(
                exp_fetchers.get_exploration_from_model)
            | 'Combine exploration and ids' >> beam.Map(
                lambda exp: (exp.id, exp))
        )

        stories_dict = beam.pvalue.AsDict(story_ids_to_story)
        exps_dict = beam.pvalue.AsDict(exp_ids_to_exp)

        opportunities_results = (
            topics
            | beam.Map(
                self._generate_opportunities_related_to_topic,
                stories_dict=stories_dict,
                exps_dict=exps_dict)
        )

        unused_put_result = (
            opportunities_results
            | 'Filter the results with SUCCESS status' >> beam.Filter(
                lambda result: result.is_ok())
            | 'Fetch the models to be put' >> beam.FlatMap(
                lambda result: result.unwrap())
            | 'Add ID as a key' >> beam.WithKeys(lambda model: model.id)  # pylint: disable=no-value-for-parameter
            | 'Allow only one item per key' >> (
                beam.combiners.Sample.FixedSizePerKey(1))
            | 'Remove the IDs' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Flatten the list of lists of models' >> beam.FlatMap(lambda x: x)
            | 'Put models into the datastore' >> ndb_io.PutModels()
        )

        return (
            opportunities_results
            | 'Count the output' >> (
                job_result_transforms.ResultsToJobRunResults())
        )
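beam.pvalue.AsDict above materialises the (id, story) and (id, exploration) pairs as dict side inputs, so each topic can look related objects up directly instead of joining. A toy sketch of AsDict as a lookup table (the names below are invented):

import apache_beam as beam

with beam.Pipeline() as p:
    story_lookup = beam.pvalue.AsDict(
        p | 'Stories' >> beam.Create([
            ('story1', 'A story'), ('story2', 'Another story')]))

    _ = (
        p
        | 'Topics' >> beam.Create([{'id': 'topic1', 'story_ids': ['story1']}])
        # The side input is resolved into a plain dict at execution time.
        | beam.Map(
            lambda topic, stories: [stories[sid] for sid in topic['story_ids']],
            stories=story_lookup)
        | beam.Map(print))  # ['A story']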
Example #11
def main(argv=None):
    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args, _ = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (p
            | 'ReadData' >>
            beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                                    late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >>
     beam.Map(lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
Example #12
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of results from the story migration.

        Returns:
            PCollection. A PCollection of results from the story migration.
        """

        unmigrated_story_models = (
            self.pipeline
            | 'Get all non-deleted story models' >> (
                ndb_io.GetModels(story_models.StoryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_model: story_model.id)
        )
        story_summary_models = (
            self.pipeline
            | 'Get all non-deleted story summary models' >> (
                ndb_io.GetModels(story_models.StorySummaryModel.get_all()))
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add story summary keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda story_summary_model: story_summary_model.id)
        )
        topics = (
            self.pipeline
            | 'Get all non-deleted topic models' >> (
                ndb_io.GetModels(topic_models.TopicModel.get_all()))
            | 'Transform model into domain object' >> beam.Map(
                topic_fetchers.get_topic_from_model)
            # Pylint disable is needed because pylint is not able to correctly
            # detect that the value is passed through the pipe.
            | 'Add topic keys' >> beam.WithKeys( # pylint: disable=no-value-for-parameter
                lambda topic: topic.id)
        )
        topic_id_to_topic = beam.pvalue.AsDict(topics)

        migrated_story_results = (
            unmigrated_story_models
            | 'Transform and migrate model' >> beam.MapTuple(
                self._migrate_story, topic_id_to_topic=topic_id_to_topic)
        )
        migrated_stories = (
            migrated_story_results
            | 'Filter oks' >> beam.Filter(
                lambda result_item: result_item.is_ok())
            | 'Unwrap ok' >> beam.Map(
                lambda result_item: result_item.unwrap())
        )
        migrated_story_job_run_results = (
            migrated_story_results
            | 'Generate results for migration' >> (
                job_result_transforms.ResultsToJobRunResults('STORY PROCESSED'))
        )

        story_changes = (
            unmigrated_story_models
            | 'Generate story changes' >> beam.FlatMapTuple(
                self._generate_story_changes)
        )

        story_objects_list = (
            {
                'story_model': unmigrated_story_models,
                'story_summary_model': story_summary_models,
                'story': migrated_stories,
                'story_change': story_changes
            }
            | 'Merge objects' >> beam.CoGroupByKey()
            | 'Get rid of ID' >> beam.Values()  # pylint: disable=no-value-for-parameter
            | 'Remove unmigrated stories' >> beam.Filter(
                lambda x: len(x['story_change']) > 0 and len(x['story']) > 0)
            | 'Reorganize the story objects' >> beam.Map(lambda objects: {
                    'story_model': objects['story_model'][0],
                    'story_summary_model': objects['story_summary_model'][0],
                    'story': objects['story'][0],
                    'story_change': objects['story_change'][0]
                })
        )

        story_objects_list_job_run_results = (
            story_objects_list
            | 'Transform story objects into job run results' >> (
                job_result_transforms.CountObjectsToJobRunResult(
                    'STORY MIGRATED'))
        )

        cache_deletion_job_run_results = (
            story_objects_list
            | 'Delete story from cache' >> beam.Map(
                lambda story_objects: self._delete_story_from_cache(
                    story_objects['story']))
            | 'Generate results for cache deletion' >> (
                job_result_transforms.ResultsToJobRunResults('CACHE DELETION'))
        )

        story_models_to_put = (
            story_objects_list
            | 'Generate story models to put' >> beam.FlatMap(
                lambda story_objects: self._update_story(
                    story_objects['story_model'],
                    story_objects['story'],
                    story_objects['story_change'],
                ))
        )

        story_summary_models_to_put = (
            story_objects_list
            | 'Generate story summary models to put' >> beam.Map(
                lambda story_objects: self._update_story_summary(
                    story_objects['story'],
                    story_objects['story_summary_model']
                ))
        )

        unused_put_results = (
            (story_models_to_put, story_summary_models_to_put)
            | 'Merge models' >> beam.Flatten()
            | 'Put models into the datastore' >> ndb_io.PutModels()
        )

        return (
            (
                cache_deletion_job_run_results,
                migrated_story_job_run_results,
                story_objects_list_job_run_results
            )
            | beam.Flatten()
        )