Example #1
def model_textio_compressed(renames, expected):
  """Using a Read Transform to read compressed text files."""
  p = TestPipeline()

  # [START model_textio_read_compressed]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText(
      '/path/to/input-*.csv.gz',
      compression_type=beam.io.filesystem.CompressionTypes.GZIP)
  # [END model_textio_read_compressed]

  beam.assert_that(lines, beam.equal_to(expected))
  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()
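
A complementary sketch of the write side, using the same era's API: WriteToText accepts the same compression_type argument, so GZIP handling works symmetrically on output. The path and in-memory elements below are hypothetical.

def model_textio_write_compressed_sketch():
  """Hypothetical sketch: writing GZIP-compressed text output."""
  import apache_beam as beam

  p = beam.Pipeline()
  (p
   | beam.Create(['line one', 'line two'])  # hypothetical in-memory input
   | beam.io.WriteToText(
       '/path/to/output',
       file_name_suffix='.csv.gz',
       compression_type=beam.io.filesystem.CompressionTypes.GZIP))
  p.run().wait_until_finish()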
Example #2
def construct_pipeline(renames):
  """A reverse words snippet as an example for constructing a pipeline."""
  import re

  class ReverseWords(beam.PTransform):
    """A PTransform that reverses individual elements in a PCollection."""

    def expand(self, pcoll):
      return pcoll | beam.Map(lambda e: e[::-1])

  def filter_words(unused_x):
    """Pass through filter to select everything."""
    return True

  # [START pipelines_constructing_creating]
  from apache_beam.utils.pipeline_options import PipelineOptions

  p = beam.Pipeline(options=PipelineOptions())
  # [END pipelines_constructing_creating]

  p = TestPipeline() # Use TestPipeline for testing.

  # [START pipelines_constructing_reading]
  lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt')
  # [END pipelines_constructing_reading]

  # [START pipelines_constructing_applying]
  words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
  reversed_words = words | ReverseWords()
  # [END pipelines_constructing_applying]

  # [START pipelines_constructing_writing]
  filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words)
  filtered_words | 'WriteMyFile' >> beam.io.WriteToText(
      'gs://some/outputData.txt')
  # [END pipelines_constructing_writing]

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START pipelines_constructing_running]
  p.run()
  # [END pipelines_constructing_running]
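
p.run() returns a PipelineResult; a minimal follow-up sketch (assuming a runner whose result supports blocking, such as the DirectRunner) for waiting on completion:

result = p.run()
result.wait_until_finish()  # Blocks until the pipeline reaches a terminal state.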
Example #3
def examples_wordcount_debugging(renames):
  """DebuggingWordCount example snippets."""
  import re

  import apache_beam as beam
  from apache_beam.metrics import Metrics

  # [START example_wordcount_debugging_logging]
  # [START example_wordcount_debugging_aggregators]
  import logging

  class FilterTextFn(beam.DoFn):
    """A DoFn that filters for a specific key based on a regular expression."""

    def __init__(self, pattern):
      self.pattern = pattern
      # A custom metric can track values in your pipeline as it runs. Create
      # the custom metrics matched_words and unmatched_words.
      self.matched_words = Metrics.counter(self.__class__, 'matched_words')
      self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

    def process(self, element):
      word, _ = element
      if re.match(self.pattern, word):
        # Log at INFO level each element we match. When executing this pipeline
        # using the Dataflow service, these log lines will appear in the Cloud
        # Logging UI.
        logging.info('Matched %s', word)

        # Add 1 to the custom metric counter matched_words
        self.matched_words.inc()
        yield element
      else:
        # Log at the "DEBUG" level each element that is not matched. Different
        # log levels can be used to control the verbosity of logging providing
        # an effective mechanism to filter less important information. Note
        # currently only "INFO" and higher level logs are emitted to the Cloud
        # Logger. This log message will not be visible in the Cloud Logger.
        logging.debug('Did not match %s', word)

        # Add 1 to the custom metric counter unmatched_words
        self.unmatched_words.inc()
  # [END example_wordcount_debugging_logging]
  # [END example_wordcount_debugging_aggregators]

  p = TestPipeline()  # Use TestPipeline for testing.
  filtered_words = (
      p
      | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      | beam.combiners.Count.PerElement()
      | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

  # [START example_wordcount_debugging_assert]
  beam.assert_that(
      filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)]))
  # [END example_wordcount_debugging_assert]

  output = (filtered_words
            | 'format' >> beam.Map(lambda word_count: '%s: %s' % word_count)
            | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
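
The counters above are only incremented inside the DoFn; a hedged sketch of reading them back from the pipeline result, assuming a runner that reports metrics (the DirectRunner does):

from apache_beam.metrics.metric import MetricsFilter

result = p.run()
result.wait_until_finish()
matched = result.metrics().query(MetricsFilter().with_name('matched_words'))
for counter in matched['counters']:
  # counter.committed holds the final value reported by the runner.
  logging.info('%s = %d', counter.key.metric.name, counter.committed)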
Example #4
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='Output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  p = TestPipeline()  # Use TestPipeline for testing.

  # [START pipeline_monitoring_execution]
  (p
   # Read the lines of the input text.
   | 'ReadLines' >> beam.io.ReadFromText(options.input)
   # Count the words.
   | CountWords()
   # Write the formatted word counts to output.
   | 'WriteCounts' >> beam.io.WriteToText(options.output))
  # [END pipeline_monitoring_execution]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run()
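
A hedged sketch of how the custom options defined above pick up command-line flags; the flag values here are hypothetical, and they are parsed through the _add_argparse_args hook:

from apache_beam.utils.pipeline_options import PipelineOptions

options = PipelineOptions(
    ['--input', 'gs://my-bucket/kinglear.txt',
     '--output', 'gs://my-bucket/counts']).view_as(WordCountOptions)
# options.input and options.output now carry the parsed values.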