Example #1
  def test_basic(self):
    examples = [
        text_format.Parse(
            """
              features {
                feature { key: "x" value { float_list { value: 0 }}}
              }
              """, tf.train.Example()),
        text_format.Parse(
            """
              features {
                feature { key: "x" value { float_list { value: 1 }}}
              }
              """, tf.train.Example())
    ]
    model_paths = [self._get_output_data_dir(m) for m in ('model1', 'model2')]
    for model_path in model_paths:
      self._build_predict_model(model_path)
    specs = [
        model_spec_pb2.InferenceSpecType(
            saved_model_spec=model_spec_pb2.SavedModelSpec(model_path=p))
        for p in model_paths
    ]
    with self._make_beam_pipeline() as pipeline:
      predictions = (
          pipeline
          | beam.Create(examples)
          | run_inference.RunInferencePerModelImpl(specs)
          | beam.MapTuple(
              lambda _, p2: p2.predict_log.response.outputs['y'].float_val[0]))
      assert_that(predictions, equal_to([0.0, 2.0]))

      predictions_table = (
          pipeline
          |
          'CreateTable' >> beam.Create([(i, e) for i, e in enumerate(examples)])
          | 'RunInferencePerModelTable' >>
          run_inference.RunInferencePerModelImpl(specs)
          | beam.MapTuple(lambda k, v:  # pylint: disable=g-long-lambda
                          (k, v[1].predict_log.response.outputs['y'].float_val[
                              0])))
      assert_that(
          predictions_table,
          equal_to([(0, 0.0), (1, 2.0)]),
          label='AssertTable')
Example #2
def run(
    scenes: List[str],
    output_path_prefix: str,
    vis_params: Dict[str, Any],
    beam_args: Optional[List[str]] = None,
) -> None:
    """Load multiple Landsat scenes and render them as JPEG files.

    Args:
        scenes: List of Landsat 8 scene IDs.
        output_path_prefix: Path prefix to save the output files.
        vis_params: Visualization parameters including {rgb_band_names, min, max, gamma}.
        beam_args: Optional list of arguments for Beam pipeline options.
    """
    rgb_band_names = vis_params["rgb_band_names"]
    min_value = vis_params["min"]
    max_value = vis_params["max"]
    gamma = vis_params["gamma"]

    beam_options = PipelineOptions(beam_args, save_main_session=True)
    pipeline = beam.Pipeline(options=beam_options)
    (
        pipeline
        | "Create scene IDs" >> beam.Create(scenes)
        | "Check GPU availability"
        >> beam.Map(
            lambda x, unused_side_input: x,
            unused_side_input=beam.pvalue.AsSingleton(
                pipeline | beam.Create([None]) | beam.Map(check_gpus)
            ),
        )
        | "Get RGB band paths" >> beam.Map(get_band_paths, rgb_band_names)
        | "Load RGB band values" >> beam.MapTuple(load_values)
        | "Preprocess pixels"
        >> beam.MapTuple(preprocess_pixels, min_value, max_value, gamma)
        | "Convert to image"
        >> beam.MapTuple(
            lambda scene, rgb_pixels: (
                scene,
                Image.fromarray(rgb_pixels.numpy(), mode="RGB"),
            )
        )
        | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs, output_path_prefix)
    )
    pipeline.run()
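The "Check GPU availability" step above threads a one-element check into the main branch as an unused side input, so the check must complete before the main Map can run. A minimal sketch of that gating pattern in isolation (hypothetical names, assuming only apache_beam):

import apache_beam as beam

with beam.Pipeline() as p:
    # Runs exactly once; the main branch depends on its result as an unused
    # AsSingleton side input, so it must complete before the Map can start.
    check = p | 'CheckOnce' >> beam.Create([None]) | beam.Map(lambda _: 'ok')
    _ = (p
         | beam.Create([1, 2, 3])
         | beam.Map(lambda x, unused: x, unused=beam.pvalue.AsSingleton(check))
         | beam.Map(print))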
Example #3
 def run_ex(self, pipeline, context, words_to_count, words_to_filter):
     self.pipeline = pipeline
     self.context = context
     return (pipeline
             | 'LoadingWordsInput' >> beam.Create(words_to_count)
             | 'FilterWords' >> beam.Filter(lambda w: w in words_to_filter)
             | 'MapToCount' >> beam.Map(lambda w: (w, 1))
             | 'GroupWords' >> beam.GroupByKey()
             | 'Count' >> beam.MapTuple(self.count))
Example #4
 def expand(self, pcoll):
     return (
         pcoll
         | 'Add Timestamps' >>
         beam.Map(lambda x: beam.window.TimestampedValue(x, time.time()))
         | "Window into Fixed Intervals" >> beam.WindowInto(
             window.FixedWindows(self.window_size))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
Example #5
    def test_map_tuple(self):
        # TODO(https://github.com/apache/beam/issues/19961): Also test with a fn
        # that accepts default arguments.
        def tuple_map_fn(a: str, b: str, c: str) -> str:
            return a + b + c

        th = beam.MapTuple(tuple_map_fn).get_type_hints()
        self.assertEqual(th.input_types, ((str, str, str), {}))
        self.assertEqual(th.output_types, ((str, ), {}))
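For context, a minimal sketch (not part of the test above) of what those hints describe: MapTuple unpacks each tuple element into separate function arguments, whereas Map passes the whole tuple as a single argument.

import apache_beam as beam

with beam.Pipeline() as p:
    pairs = p | beam.Create([('a', 1), ('b', 2)])
    # Both branches produce 'a=1' and 'b=2'.
    via_map = pairs | 'Map' >> beam.Map(lambda kv: '%s=%d' % kv)
    via_map_tuple = pairs | 'MapTuple' >> beam.MapTuple(lambda k, v: '%s=%d' % (k, v))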
Example #6
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p
      | beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.MapTuple(lambda word, count: '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
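As the snippet notes, the local test run simply resets PipelineOptions(); the runner can also be selected explicitly (a minimal sketch, assuming the DirectRunner bundled with apache_beam):

from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DirectRunner'  # run locally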
Example #7
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each Pub/Sub message based on its publish timestamp.
         | "Window into Fixed Intervals" >> beam.WindowInto(window.FixedWindows(self.window_size))
         # If the windowed elements do not fit into memory, consider using `beam.util.BatchElements`.
         | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     )
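The comment above points to `beam.util.BatchElements` for windows that do not fit into memory; a hedged sketch of that alternative (batch sizes are illustrative):

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(100))
         | 'Batch' >> beam.BatchElements(min_batch_size=10, max_batch_size=50)
         | beam.Map(len)    # each element is now a list of up to 50 ints
         | beam.Map(print))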
Example #8
 def expand(self, pcoll):
     if has_any_weight:
         return pcoll | beam.CombinePerKey(_sum_pairwise)
     else:
         # For non-weighted case, use sum combine fn over integers to allow
         # Beam to use Cython combiner.
         return (pcoll
                 | 'RemoveWeights' >> beam.MapTuple(lambda k, v:
                                                    (k, v[0]))
                 | beam.CombinePerKey(sum))
Example #9
    def expand(self, inputs):
        def _encode_values(k, v):
            return (k,
                    tf.compat.as_str_any(','.join(map(tf.compat.as_str_any,
                                                      v))))

        pcoll, = inputs
        return (pcoll
                | 'EncodeValues' >> beam.MapTuple(_encode_values)
                | 'SwapKeysAndValues' >> beam.KvSwap())
Example #10
    def run(self) -> beam.PCollection[job_run_result.JobRunResult]:
        """Returns a PCollection of 'SUCCESS' or 'FAILURE' results from
        the Elastic Search.

        Returns:
            PCollection. A PCollection of 'SUCCESS' or 'FAILURE' results from
            the Elastic Search.
        """

        exp_summary_models = (
            self.pipeline
            | 'Get all non-deleted models' >> (
                ndb_io.GetModels(exp_models.ExpSummaryModel.get_all()))
        )

        exp_summary_iter = beam.pvalue.AsIter(exp_summary_models)

        exp_recommendations_models = (
            exp_summary_models
            | 'Compute similarity' >> beam.ParDo(
                ComputeSimilarity(), exp_summary_iter)
            | 'Group similarities per exploration ID' >> beam.GroupByKey()
            | 'Sort and slice similarities' >> beam.MapTuple(
                lambda exp_id, similarities: (
                    exp_id, self._sort_and_slice_similarities(similarities)))
            | 'Create recommendation models' >> beam.MapTuple(
                self._create_recommendation)
        )

        unused_put_result = (
            exp_recommendations_models
            | 'Put models into the datastore' >> ndb_io.PutModels()
        )

        return (
            exp_recommendations_models
            | 'Count all new models' >> beam.combiners.Count.Globally()
            | 'Only create result for new models when > 0' >> (
                beam.Filter(lambda x: x > 0))
            | 'Create result for new models' >> beam.Map(
                lambda x: job_run_result.JobRunResult(
                    stdout='SUCCESS %s' % x))
        )
Example #11
    def expand(
        self, sliced_record_batchs_and_ys: Tuple[
            beam.PCollection[types.SlicedRecordBatch],
            beam.PCollection[_SlicedYKey]]
    ) -> beam.PCollection[Tuple[_SlicedYKey, _ConditionalYRate]]:
        sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

        # _SlicedXYKey(slice, x_path, x, y), xy_count
        partial_copresence_counts = (
            sliced_record_batchs
            | 'ToPartialCopresenceCounts' >> beam.FlatMap(
                _to_partial_copresence_counts, self._y_path, self._x_paths,
                self._y_boundaries, self._example_weight_map,
                self._num_xy_pairs_batch_copresent))

        # Compute placeholder copresence counts.
        # partial_copresence_counts will only include x-y pairs that are present,
        # but we would also like to keep track of x-y pairs that never appear, as
        # long as x and y independently occur in the slice.

        # _SlicedXKey(slice, x_path, x), x_count
        x_counts = (
            sliced_record_batchs
            | 'ToPartialXCounts' >> beam.FlatMap(
                _to_partial_x_counts, self._x_paths, self._example_weight_map)
            | 'SumXCounts' >> beam.CombinePerKey(sum))
        if self._min_x_count:
            x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
                lambda kv: kv[1] > self._min_x_count)

        # _SlicedXYKey(slice, x_path, x, y), 0
        placeholder_copresence_counts = (
            (x_counts, y_keys)
            | 'GetPlaceholderCopresenceCounts' >>
            _GetPlaceholderCopresenceCounts(self._x_paths, self._min_x_count))

        def move_y_to_value(key, xy_count):
            return _SlicedXKey(key.slice_key, key.x_path,
                               key.x), (key.y, xy_count)

        # _SlicedXKey(slice, x_path, x), (y, xy_count)
        copresence_counts = (
            (placeholder_copresence_counts, partial_copresence_counts)
            | 'FlattenCopresenceCounts' >> beam.Flatten()
            | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
            | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

        # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
        return (copresence_counts
                | 'JoinXCounts' >> beam.ParDo(
                    _LookupInnerJoinDoFn(),
                    right_iterable=beam.pvalue.AsIter(x_counts))
                | 'MakeConditionalYRates' >> beam.Map(
                    _make_conditional_y_rates,
                    num_xy_pairs_distinct=self._num_xy_pairs_distinct))
Example #12
    def test_map_tuple(self):
        def f(a, b, y=None):
            return a, b, y

        expected = [(1, 2), (3, 4)] | beam.MapTuple(f, y=5)
        actual = [(1, 2), (3, 4)] | threadmap.ThreadMapTuple(f, y=5)
        self.assertEqual(expected, actual)

        actual = [(1, 2),
                  (3, 4)] | threadmap.ThreadMapTuple(f, y=5, num_threads=None)
        self.assertEqual(expected, actual)
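The test above relies on a transform applied to a plain Python list running eagerly and returning a list; a minimal sketch of that convenience:

import apache_beam as beam

result = [(1, 2), (3, 4)] | beam.MapTuple(lambda a, b: a + b)
# result == [3, 7] for this simple element-wise case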
Example #13
def run(beam_options):
    suffixes = get_suffixes()
    print(f"Processing {len(suffixes)} files")

    with beam.Pipeline(options=beam_options) as p:
        (p
         | beam.Create(suffixes)
         | "Download Data" >> beam.ParDo(download_subtile)
         | "Coarsen" >> beam.MapTuple(coarsen)
         | common.CombineSubtilesByKey()
         | common.WriteToNetCDFs(_name))
Example #14
 def expand(self, pcoll):
     return (
         pcoll
         | "Start" >> beam.FlatMap(_start_stage, self.specs_by_target)
         | "CreateTasks" >> beam.FlatMapTuple(_copy_tasks)
         # prevent undesirable fusion
         # https://stackoverflow.com/a/54131856/809705
         | "Reshuffle" >> beam.Reshuffle()
         | "CopyChunks" >> beam.MapTuple(_copy_chunk)
         # prepare inputs for the next stage (if any)
         | "Finish" >> beam.Distinct())
Example #15
 def test_timestamped_value(self):
   with TestPipeline() as p:
     result = (p
               | 'start' >> Create([(k, k) for k in range(10)])
               | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
               | 'w' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey()
               | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                   ('key', [5, 6, 7, 8, 9])]))
Example #16
    def expand(self,
               sliced_record_batchs_and_ys: Tuple[types.SlicedRecordBatch,
                                                  _SlicedYKey]):
        sliced_record_batchs, y_keys = sliced_record_batchs_and_ys

        # _SlicedXYKey(slice, x_path, x, y), xy_count
        partial_copresence_counts = (
            sliced_record_batchs
            | 'ToPartialCopresenceCounts' >> beam.FlatMap(
                _to_partial_copresence_counts, self._y_path, self._x_paths,
                self._y_boundaries, self._weight_column_name,
                self._num_xy_pairs_batch_copresent))

        # Compute placeholder copresence counts.
        # partial_copresence_counts will only include x-y pairs that are present,
        # but we would also like to keep track of x-y pairs that never appear, as
        # long as x and y independently occur in the slice.

        # _SlicedXKey(slice, x_path, x), x_count
        x_counts = (
            sliced_record_batchs
            | 'ToPartialXCounts' >> beam.FlatMap(
                _to_partial_x_counts, self._x_paths, self._weight_column_name)
            | 'SumXCounts' >> beam.CombinePerKey(sum))
        if self._min_x_count:
            x_counts = x_counts | 'FilterXCounts' >> beam.Filter(
                lambda kv: kv[1] > self._min_x_count)

        # _SlicedXYKey(slice, x_path, x, y), 0
        placeholder_copresence_counts = (
            (x_counts, y_keys)
            | 'GetPlaceholderCopresenceCounts' >>
            _GetPlaceholderCopresenceCounts(self._x_paths, self._min_x_count))

        def move_y_to_value(key, xy_count):
            return _SlicedXKey(key.slice_key, key.x_path,
                               key.x), (key.y, xy_count)

        # _SlicedXKey(slice, x_path, x), (y, xy_count)
        copresence_counts = (
            (placeholder_copresence_counts, partial_copresence_counts)
            | 'FlattenCopresenceCounts' >> beam.Flatten()
            | 'SumCopresencePairs' >> beam.CombinePerKey(sum)
            | 'MoveYToValue' >> beam.MapTuple(move_y_to_value))

        # _SlicedYKey(slice, y), _ConditionalYRate(x_path, x, xy_count, x_count)
        return ({
            'x_count': x_counts,
            'xy_counts': copresence_counts
        }
                | 'CoGroupByForConditionalYRates' >> beam.CoGroupByKey()
                | 'JoinXCounts' >>
                beam.FlatMap(_join_x_counts, self._num_xy_pairs_distinct,
                             self._num_x_values_distinct))
Example #17
def run(
    scenes: List[str],
    output_path_prefix: str,
    vis_params: Dict[str, Any],
    gpus_optional: bool,
    beam_args: Optional[List[str]] = None,
) -> None:
    """Load multiple Landsat scenes and render them as JPEG files.

    Args:
        scenes: List of Landsat 8 scene IDs.
        output_path_prefix: Path prefix to save the output files.
        vis_params: Visualization parameters including {rgb_band_names, min, max, gamma}.
        gpus_optional: If True, the pipeline won't crash if GPUs are not found.
        beam_args: Optional list of arguments for Beam pipeline options.
    """
    rgb_band_names = vis_params["rgb_band_names"]
    min_value = vis_params["min"]
    max_value = vis_params["max"]
    gamma = vis_params["gamma"]

    options = PipelineOptions(beam_args, save_main_session=True)
    with beam.Pipeline(options=options) as pipeline:
        # Optionally, validate that the workers are using GPUs.
        (pipeline
         | beam.Create([None])
         | "Check GPU availability" >> beam.Map(check_gpus, gpus_optional))

        # Convert Landsat 8 scenes into images.
        (pipeline
         | "Create scene IDs" >> beam.Create(scenes)
         | "Get RGB band paths" >> beam.Map(get_band_paths, rgb_band_names)
         | "Load RGB band values" >> beam.MapTuple(load_values)
         | "Preprocess pixels" >> beam.MapTuple(preprocess_pixels, min_value,
                                                max_value, gamma)
         | "Convert to image" >> beam.MapTuple(lambda scene, rgb_pixels: (
             scene,
             Image.fromarray(rgb_pixels.numpy(), mode="RGB"),
         ))
         | "Save to Cloud Storage" >> beam.MapTuple(save_to_gcs,
                                                    output_path_prefix))
Example #18
 def expand(self, input_or_inputs):
     return (
         input_or_inputs
         | "Window into fixed timespan" >> beam.WindowInto(
             window.FixedWindows(self.window_size))
         # Add timestamps; in this case, read them directly from the message.
         | "Add timestamps to messages" >> beam.ParDo(ExposeMsgTimestamp())
         # A dummy key is added so that GroupByKey aggregates all elements within each window.
         | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
         | "Aggregate HLOC" >> beam.ParDo(AggregateData()))
Example #19
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each message. The Sessions windowing below
         # is left commented out in favor of sliding windows.
         | "Window into Sessions"
         #>> beam.WindowInto(window.Sessions(self.gap_size))
         >> beam.WindowInto(window.SlidingWindows(10, 5))
         | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
         | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
Example #20
    def define(self, pipeline: beam.Pipeline) -> None:
        def format_result(word, count):
            return self.output_format % (word, count)

        _ = (pipeline
             | 'Read' >> self.words_source
             | 'Split' >>
             (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
             | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
             | 'GroupAndSum' >> beam.CombinePerKey(sum)
             | 'Format' >> beam.MapTuple(format_result)
             | 'Write' >> self.word_count_sink)
Example #21
def WritePandasToCSV(
    pcoll: PCollection[pd.DataFrame],
    file_path_prefix: str,
    **kwargs,
):
    return (pcoll
            | "Convert DFs to tuples" >>
            beam.FlatMap(lambda df: map(list, df.values))
            | "Convert to csv lines" >>
            beam.MapTuple(lambda *args: ",".join(map(str, args)))
            | "Write results to csv" >> beam_io.WriteToText(
                file_path_prefix=file_path_prefix, **kwargs))
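A hedged usage sketch for the helper above (assumes beam_io refers to apache_beam.io and that the DataFrames are produced upstream; the output path is hypothetical):

import apache_beam as beam
import pandas as pd

with beam.Pipeline() as p:
    dfs = p | beam.Create([pd.DataFrame({'a': [1, 2], 'b': [3, 4]})])
    # Writes one CSV line per DataFrame row, e.g. "1,3" and "2,4".
    WritePandasToCSV(dfs, file_path_prefix='/tmp/rows')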
Example #22
 def test_sliding_windows(self):
     with TestPipeline() as p:
         pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
         result = (pcoll
                   | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
                   | GroupByKey()
                   | beam.MapTuple(lambda k, vs: (k, sorted(vs)))
                   | reify_windows)
         expected = [('key @ [-2.0, 2.0)', [1]),
                     ('key @ [0.0, 4.0)', [1, 2, 3]),
                     ('key @ [2.0, 6.0)', [2, 3])]
         assert_that(result, equal_to(expected))
Example #23
def run(argv=None):
    """Build and run the pipeline"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--topic", type=str, help='Pub/Sub topic to read from')
    parser.add_argument("--output_bucket", help=('Output local filemane'))
    parser.add_argument('--output_bigquery',
                        default='IoTData.engine',
                        help=('Output BigQuery table: '
                              'PROJECT:DATASET.TABLE '
                              'or DATASET.TABLE.'))
    parser.add_argument('--output_bigquery_avg',
                        default='DeviceData.engine_avr',
                        help=('Output BigQuery table for averages: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)
    pubsub_stream = (
        p | 'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.topic))
    records = (pubsub_stream
               | 'Parse JSON to Dict' >> beam.Map(lambda e: json.loads(e))
               | 'Add timestamp' >> beam.ParDo(AddTimestampToDict()))

    # stream to BigQuery
    (records | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        args.output_bigquery,
        schema=Schema.get_bigquery_schema(),
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # averages
    (records | "Window for avg" >> beam.WindowInto(window.FixedWindows(60))
     | 'Add deviceId Key' >> beam.ParDo(AddKeyToDict())
     | 'Group by Key' >> beam.GroupByKey()
     | 'Count average' >> beam.ParDo(CountAverages())
     | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery(
         args.output_bigquery_avg,
         schema=Schema.get_bigquery_avg_schema(),
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_APPEND))

    (records | "Window for bucket" >> beam.WindowInto(window.FixedWindows(60))
     | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
     | "Group by Dummy Key" >> beam.GroupByKey()
     | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     | "Write to GCS" >> beam.ParDo(WriteBatchesToGCS(args.output_bucket)))

    result = p.run()
    result.wait_until_finish()
Example #24
def run(argv=None):
    pipeline_options = PipelineOptions(argv)
    options = pipeline_options.view_as(ParkdataPipelineOptions)
    # Save the main session, which defines global imports, functions and variables.
    # Otherwise they are not preserved during serialization. For details see
    # https://cloud.google.com/dataflow/docs/resources/faq#how_do_i_handle_nameerrors
    pipeline_options.view_as(
        SetupOptions).save_main_session = options.save_session
    with beam.Pipeline(options=pipeline_options) as p:
        wikidata_data, commons_ids = (
            p
            | "wikidata_query/create" >> beam.Create(wd_queries())
            | "wikidata/query" >> wikidata.Query(
                FileSystems.join(options.base_path,
                                 "wikidata_query_cache.sqlite"),
                user_agent=options.user_agent,
            )
            | "wikidata/group" >> beam.GroupByKey()
            | "wikidata/fetch" >> wikidata.Transform(
                options.supported_languages(),
                cache_file=FileSystems.join(options.base_path,
                                            "wikidata_cache.sqlite"),
                user_agent=options.user_agent,
            ))

        commons_data = commons_ids | "commons" >> commons.Transform(
            FileSystems.join(options.base_path, "commons_cache.sqlite"),
            user_agent=options.user_agent)

        wikipedia_data = wikidata_data | "wikipedia" >> wikipedia.Transform(
            FileSystems.join(options.base_path, "wikipedia_cache.sqlite"),
            user_agent=options.user_agent)

        changed_places = (
            {
                Combine.TAG_COMMONS: commons_data,
                Combine.TAG_WIKIDATA: wikidata_data,
                Combine.TAG_WIKIPEDIA: wikipedia_data,
            }
            | "combine/group_by_key" >> beam.CoGroupByKey()
            | "combine/combine" >> beam.ParDo(Combine())
            | "combine/changed" >> beam.ParDo(
                OutputNewOrChangedEntires(
                    FileSystems.join(options.base_path, "output.sqlite"))))

        (changed_places
         |
         "firestore_output/convert_types" >> beam.MapTuple(use_firestore_types)
         | "firestore_output/write" >> beam.ParDo(
             FirestoreWrite(project=options.project_id,
                            collection="places_v4",
                            credentials="gcp-service-account.json")))
Example #25
def run(options):
    with beam.Pipeline(options=options) as p:
        lines = (p | 'Read test xml file' >> beam.io.ReadFromText(
            p.options.input, validate=False))

        counts = (lines
                  | 'Split' >>
                  (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        output = counts | 'Format' >> beam.MapTuple(format_result)

        output | 'Write' >> beam.io.WriteToText(p.options.output)
Example #26
            def expand(self, pcolls):

                scalar_inputs = [
                    expr for expr in self.stage.inputs if is_scalar(expr)
                ]
                tabular_inputs = [
                    expr for expr in self.stage.inputs if not is_scalar(expr)
                ]

                if len(tabular_inputs) == 0:
                    partitioned_pcoll = next(
                        iter(pcolls.values())).pipeline | beam.Create([{}])

                elif self.stage.partitioning != partitionings.Nothing():
                    # Arrange such that partitioned_pcoll is properly partitioned.
                    main_pcolls = {
                        expr._id: pcolls[expr._id] | 'Flat%s' % expr._id >>
                        beam.FlatMap(self.stage.partitioning.partition_fn)
                        for expr in tabular_inputs
                    } | beam.CoGroupByKey()
                    partitioned_pcoll = main_pcolls | beam.MapTuple(
                        lambda _, inputs:
                        {tag: pd.concat(vs)
                         for tag, vs in inputs.items()})

                else:
                    # Already partitioned, or no partitioning needed.
                    assert len(tabular_inputs) == 1
                    tag = tabular_inputs[0]._id
                    partitioned_pcoll = pcolls[tag] | beam.Map(
                        lambda df: {tag: df})

                side_pcolls = {
                    expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
                    for expr in scalar_inputs
                }

                # Actually evaluate the expressions.
                def evaluate(partition, stage=self.stage, **side_inputs):
                    session = expressions.Session(
                        dict([(expr, partition[expr._id])
                              for expr in tabular_inputs] +
                             [(expr, side_inputs[expr._id])
                              for expr in scalar_inputs]))
                    for expr in stage.outputs:
                        yield beam.pvalue.TaggedOutput(
                            expr._id, expr.evaluate_at(session))

                return partitioned_pcoll | beam.FlatMap(
                    evaluate, **side_pcolls).with_outputs()
Example #27
  def test_reshuffle_after_gbk_contents_unchanged(self):
    with TestPipeline() as pipeline:
      data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
      expected_result = [(1, [1, 2, 3]), (2, [1, 2]), (3, [1])]

      after_gbk = (
          pipeline
          | beam.Create(data)
          | beam.GroupByKey()
          | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
      assert_that(after_gbk, equal_to(expected_result), label='after_gbk')
      after_reshuffle = after_gbk | beam.Reshuffle()
      assert_that(
          after_reshuffle, equal_to(expected_result), label='after_reshuffle')
Example #28
 def test_rewindow(self):
   with TestPipeline() as p:
     result = (p
               | Create([(k, k) for k in range(10)])
               | Map(lambda x_t1: TimestampedValue(x_t1[0], x_t1[1]))
               | 'window' >> WindowInto(SlidingWindows(period=2, size=6))
               # Per the model, each element is now duplicated across
               # three windows. Rewindowing must preserve this duplication.
               | 'rewindow' >> WindowInto(FixedWindows(5))
               | 'rewindow2' >> WindowInto(FixedWindows(5))
               | Map(lambda v: ('key', v))
               | GroupByKey()
               | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(result, equal_to([('key', sorted([0, 1, 2, 3, 4] * 3)),
                                   ('key', sorted([5, 6, 7, 8, 9] * 3))]))
Example #29
        def CheckAggregation(inputs_and_expected, aggregation):
            # Split the test stream into a branch of to-be-processed elements, and
            # a branch of expected results.
            inputs, expected = (
                inputs_and_expected
                | beam.FlatMapTuple(lambda tag, value: [
                    beam.pvalue.TaggedOutput(tag, ('key1', value)),
                    beam.pvalue.TaggedOutput(tag, ('key2', value)),
                ]).with_outputs('input', 'expect'))

            # Process the inputs with the given windowing to produce actual outputs.
            outputs = (
                inputs
                | beam.MapTuple(lambda key, value: TimestampedValue(
                    (key, value), value))
                | beam.WindowInto(window_fn,
                                  trigger=trigger_fn,
                                  accumulation_mode=accumulation_mode,
                                  timestamp_combiner=timestamp_combiner)
                | aggregation
                | beam.MapTuple(_windowed_value_info_map_fn)
                # Place outputs back into the global window to allow flattening
                # and share a single state in Check.
                | 'Global' >> beam.WindowInto(
                    beam.transforms.window.GlobalWindows()))
            # Feed both the expected and actual outputs to Check() for comparison.
            tagged_expected = (
                expected
                | beam.MapTuple(lambda key, value: (key, ('expect', value))))
            tagged_outputs = (
                outputs
                | beam.MapTuple(lambda key, value: (key, ('actual', value))))
            # pylint: disable=expression-not-assigned
            ([tagged_expected, tagged_outputs]
             | beam.Flatten()
             | beam.ParDo(Check(self.allow_out_of_order)))
Example #30
 def test_reshuffle_global_window(self):
   with TestPipeline() as pipeline:
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
     before_reshuffle = (
         pipeline
         | beam.Create(data)
         | beam.WindowInto(GlobalWindows())
         | beam.GroupByKey()
         | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(
         before_reshuffle, equal_to(expected_data), label='before_reshuffle')
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     assert_that(
         after_reshuffle, equal_to(expected_data), label='after reshuffle')