def model_textio_compressed(renames, expected): """Using a Read Transform to read compressed text files.""" p = TestPipeline() # [START model_textio_write_compressed] lines = p | 'ReadFromText' >> beam.io.ReadFromText( '/path/to/input-*.csv.gz', compression_type=beam.io.filesystem.CompressionTypes.GZIP) # [END model_textio_write_compressed] beam.assert_that(lines, beam.equal_to(expected)) p.visit(SnippetUtils.RenameFiles(renames)) p.run().wait_until_finish()
def construct_pipeline(renames): """A reverse words snippet as an example for constructing a pipeline.""" import re class ReverseWords(beam.PTransform): """A PTransform that reverses individual elements in a PCollection.""" def expand(self, pcoll): return pcoll | beam.Map(lambda e: e[::-1]) def filter_words(unused_x): """Pass through filter to select everything.""" return True # [START pipelines_constructing_creating] from apache_beam.utils.pipeline_options import PipelineOptions p = beam.Pipeline(options=PipelineOptions()) # [END pipelines_constructing_creating] p = TestPipeline() # Use TestPipeline for testing. # [START pipelines_constructing_reading] lines = p | 'ReadMyFile' >> beam.io.ReadFromText('gs://some/inputData.txt') # [END pipelines_constructing_reading] # [START pipelines_constructing_applying] words = lines | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) reversed_words = words | ReverseWords() # [END pipelines_constructing_applying] # [START pipelines_constructing_writing] filtered_words = reversed_words | 'FilterWords' >> beam.Filter(filter_words) filtered_words | 'WriteMyFile' >> beam.io.WriteToText( 'gs://some/outputData.txt') # [END pipelines_constructing_writing] p.visit(SnippetUtils.RenameFiles(renames)) # [START pipelines_constructing_running] p.run()
def examples_wordcount_debugging(renames): """DebuggingWordCount example snippets.""" import re import apache_beam as beam # [START example_wordcount_debugging_logging] # [START example_wordcount_debugging_aggregators] import logging class FilterTextFn(beam.DoFn): """A DoFn that filters for a specific key based on a regular expression.""" def __init__(self, pattern): self.pattern = pattern # A custom metric can track values in your pipeline as it runs. Create # custom metrics matched_word and unmatched_words. self.matched_words = Metrics.counter(self.__class__, 'matched_words') self.umatched_words = Metrics.counter(self.__class__, 'umatched_words') def process(self, element): word, _ = element if re.match(self.pattern, word): # Log at INFO level each element we match. When executing this pipeline # using the Dataflow service, these log lines will appear in the Cloud # Logging UI. logging.info('Matched %s', word) # Add 1 to the custom metric counter matched_words self.matched_words.inc() yield element else: # Log at the "DEBUG" level each element that is not matched. Different # log levels can be used to control the verbosity of logging providing # an effective mechanism to filter less important information. Note # currently only "INFO" and higher level logs are emitted to the Cloud # Logger. This log message will not be visible in the Cloud Logger. logging.debug('Did not match %s', word) # Add 1 to the custom metric counter umatched_words self.umatched_words.inc() # [END example_wordcount_debugging_logging] # [END example_wordcount_debugging_aggregators] p = TestPipeline() # Use TestPipeline for testing. filtered_words = ( p | beam.io.ReadFromText( 'gs://dataflow-samples/shakespeare/kinglear.txt') | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) | beam.combiners.Count.PerElement() | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) # [START example_wordcount_debugging_assert] beam.assert_that( filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)])) # [END example_wordcount_debugging_assert] output = (filtered_words | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt')) p.visit(SnippetUtils.RenameFiles(renames)) p.run()
def pipeline_monitoring(renames): """Using monitoring interface snippets.""" import re import apache_beam as beam from apache_beam.utils.pipeline_options import PipelineOptions class WordCountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input', help='Input for the pipeline', default='gs://my-bucket/input') parser.add_argument('--output', help='output for the pipeline', default='gs://my-bucket/output') class ExtractWordsFn(beam.DoFn): def process(self, element): words = re.findall(r'[A-Za-z\']+', element) for word in words: yield word class FormatCountsFn(beam.DoFn): def process(self, element): word, count = element yield '%s: %s' % (word, count) # [START pipeline_monitoring_composite] # The CountWords Composite Transform inside the WordCount pipeline. class CountWords(beam.PTransform): def expand(self, pcoll): return (pcoll # Convert lines of text into individual words. | 'ExtractWords' >> beam.ParDo(ExtractWordsFn()) # Count the number of times each word occurs. | beam.combiners.Count.PerElement() # Format each word and count into a printable string. | 'FormatCounts' >> beam.ParDo(FormatCountsFn())) # [END pipeline_monitoring_composite] pipeline_options = PipelineOptions() options = pipeline_options.view_as(WordCountOptions) p = TestPipeline() # Use TestPipeline for testing. # [START pipeline_monitoring_execution] (p # Read the lines of the input text. | 'ReadLines' >> beam.io.ReadFromText(options.input) # Count the words. | CountWords() # Write the formatted word counts to output. | 'WriteCounts' >> beam.io.WriteToText(options.output)) # [END pipeline_monitoring_execution] p.visit(SnippetUtils.RenameFiles(renames)) p.run()
def pipeline_monitoring(renames): """Using monitoring interface snippets.""" import re import apache_beam as beam from apache_beam.utils.pipeline_options import PipelineOptions class WordCountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_argument('--input', help='Input for the pipeline', default='gs://my-bucket/input') parser.add_argument('--output', help='output for the pipeline', default='gs://my-bucket/output') class ExtractWordsFn(beam.DoFn): def process(self, context): words = re.findall(r'[A-Za-z\']+', context.element) for word in words: yield word class FormatCountsFn(beam.DoFn): def process(self, context): word, count = context.element yield '%s: %s' % (word, count) # [START pipeline_monitoring_composite] # The CountWords Composite Transform inside the WordCount pipeline. class CountWords(beam.PTransform): def expand(self, pcoll): return (pcoll # Convert lines of text into individual words. | 'ExtractWords' >> beam.ParDo(ExtractWordsFn()) # Count the number of times each word occurs. | beam.combiners.Count.PerElement() # Format each word and count into a printable string. | 'FormatCounts' >> beam.ParDo(FormatCountsFn())) # [END pipeline_monitoring_composite] pipeline_options = PipelineOptions() options = pipeline_options.view_as(WordCountOptions) p = TestPipeline() # Use TestPipeline for testing. # [START pipeline_monitoring_execution] (p # Read the lines of the input text. | 'ReadLines' >> beam.io.ReadFromText(options.input) # Count the words. | CountWords() # Write the formatted word counts to output. | 'WriteCounts' >> beam.io.WriteToText(options.output)) # [END pipeline_monitoring_execution] p.visit(SnippetUtils.RenameFiles(renames)) p.run()
def examples_wordcount_debugging(renames): """DebuggingWordCount example snippets.""" import re import apache_beam as beam # [START example_wordcount_debugging_logging] # [START example_wordcount_debugging_aggregators] import logging class FilterTextFn(beam.DoFn): """A DoFn that filters for a specific key based on a regular expression.""" def __init__(self, pattern): self.pattern = pattern # A custom metric can track values in your pipeline as it runs. Create # custom metrics matched_word and unmatched_words. self.matched_words = Metrics.counter(self.__class__, 'matched_words') self.umatched_words = Metrics.counter(self.__class__, 'umatched_words') def process(self, element): word, _ = element if re.match(self.pattern, word): # Log at INFO level each element we match. When executing this pipeline # using the Dataflow service, these log lines will appear in the Cloud # Logging UI. logging.info('Matched %s', word) # Add 1 to the custom metric counter matched_words self.matched_words.inc() yield element else: # Log at the "DEBUG" level each element that is not matched. Different # log levels can be used to control the verbosity of logging providing # an effective mechanism to filter less important information. Note # currently only "INFO" and higher level logs are emitted to the Cloud # Logger. This log message will not be visible in the Cloud Logger. logging.debug('Did not match %s', word) # Add 1 to the custom metric counter umatched_words self.umatched_words.inc() # [END example_wordcount_debugging_logging] # [END example_wordcount_debugging_aggregators] p = TestPipeline() # Use TestPipeline for testing. filtered_words = ( p | beam.io.ReadFromText( 'gs://dataflow-samples/shakespeare/kinglear.txt') | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) | beam.combiners.Count.PerElement() | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) # [START example_wordcount_debugging_assert] beam.assert_that( filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)])) # [END example_wordcount_debugging_assert] output = (filtered_words | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt')) p.visit(SnippetUtils.RenameFiles(renames)) p.run()