def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.utils.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  p = beam.Pipeline(options=pipeline_options)

  (p
   | beam.Create([
       'To be, or not to be: that is the question: ',
       'Whether \'tis nobler in the mind to suffer ',
       'The slings and arrows of outrageous fortune, ',
       'Or to take arms against a sea of troubles, '])
   | beam.io.WriteToText(my_options.output))

  result = p.run()
  # [END model_pcollection]
  result.wait_until_finish()
def test_with_requirements_file(self):
  try:
    staging_dir = tempfile.mkdtemp()
    requirements_cache_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
  finally:
    shutil.rmtree(staging_dir)
    shutil.rmtree(requirements_cache_dir)
    shutil.rmtree(source_dir)
def test_with_setup_file(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'setup.py'), 'notused')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = os.path.join(
      source_dir, 'setup.py')

  self.assertEqual(
      [dependency.WORKFLOW_TARBALL_FILE],
      dependency.stage_job_resources(
          options,
          # We replace the build setup command because a realistic one would
          # require the setuptools package to be installed. Note that we can't
          # use "touch" here to create the expected output tarball file, since
          # touch is not available on Windows, so we invoke python to produce
          # equivalent behavior.
          build_setup_args=[
              'python', '-c', 'open(__import__("sys").argv[1], "a")',
              os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
          temp_dir=source_dir))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def run(argv=None): """Runs the Wikipedia top edits pipeline. Args: argv: Pipeline options as a list of arguments. """ parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/wikipedia_edits/*.json', help='Input specified as a GCS path containing a BigQuery table exported ' 'as json.') parser.add_argument('--output', required=True, help='Output file to write results to.') parser.add_argument('--sampling_threshold', type=float, default=0.1, help='Fraction of entries used for session tracking') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | ComputeTopSessions(known_args.sampling_threshold) | WriteToText(known_args.output)) p.run()
def run():
  parser = argparse.ArgumentParser()
  parser.add_argument('--run_locally',
                      dest='run_locally',
                      default='',
                      help='Run data subset and do not save.')
  known_args, pipeline_args = parser.parse_known_args()
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
  delete_from_datastore('dancedeets-hrd', gcloud_options,
                        known_args.run_locally)
def test_default_resources(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). p.run().wait_until_finish()
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda (word, count): '%s: %s' % (word, count))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]

  result.wait_until_finish()
def test_extra_package(self):
  options = PipelineOptions(['--extra_package', 'abc',
                             '--extra_packages', 'def',
                             '--extra_packages', 'ghi'])
  self.assertEqual(
      sorted(options.get_all_options()['extra_packages']),
      ['abc', 'def', 'ghi'])

  options = PipelineOptions(flags=[''])
  self.assertEqual(options.get_all_options()['extra_packages'], None)
def test_get_all_options(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'], options.get_all_options())
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_flag,
                     case['expected']['mock_flag'])
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_option,
                     case['expected']['mock_option'])
def test_option_with_space(self):
  options = PipelineOptions(flags=['--option with space= value with space'])
  self.assertEqual(
      getattr(options.view_as(PipelineOptionsTest.MockOptions),
              'option with space'), ' value with space')
  options_from_dict = PipelineOptions.from_dictionary(
      options.get_all_options())
  self.assertEqual(
      getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
              'option with space'), ' value with space')
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', dest='input', default='gs://dataflow-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--dataset', dest='dataset', required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument('--table_name', dest='table_name', default='hourly_team_score', help='The BigQuery table name. Should not already exist.') parser.add_argument('--window_duration', type=int, default=60, help='Numeric value of fixed window duration, in minutes') parser.add_argument('--start_min', dest='start_min', default='1970-01-01-00-00', help='String representation of the first minute after ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'prior to that minute won\'t be included in the ' 'sums.') parser.add_argument('--stop_min', dest='stop_min', default='2100-01-01-00-00', help='String representation of the first minute for ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'after to that minute won\'t be included in the ' 'sums.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) p = beam.Pipeline(options=pipeline_options) pipeline_options.view_as(SetupOptions).save_main_session = True (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | HourlyTeamScore( known_args.start_min, known_args.stop_min, known_args.window_duration) | WriteWindowedToBigQuery( known_args.table_name, known_args.dataset, configure_bigquery_write())) result = p.run() result.wait_until_finish()
def test_no_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = False
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))
def test_from_dictionary(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    all_options_dict = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
    self.assertEqual(options_from_dict.view_as(
        PipelineOptionsTest.MockOptions).mock_flag,
                     case['expected']['mock_flag'])
    self.assertEqual(options.view_as(
        PipelineOptionsTest.MockOptions).mock_option,
                     case['expected']['mock_option'])
def test_no_temp_location(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.staging_location = staging_dir
  self.update_options(options)
  google_cloud_options.temp_location = None
  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertEqual('The --temp_location option must be specified.',
                   cm.exception.message)
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--kind', dest='kind', required=True, help='Datastore Kind') parser.add_argument('--namespace', dest='namespace', help='Datastore Namespace') parser.add_argument('--ancestor', dest='ancestor', default='root', help='The ancestor key name for all entities.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--read_only', action='store_true', help='Read an existing dataset, do not write first') parser.add_argument('--num_shards', dest='num_shards', type=int, # If the system should choose automatically. default=0, help='Number of output shards') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) # Write to Datastore if `read_only` options is not specified. if not known_args.read_only: write_to_datastore(gcloud_options.project, known_args, pipeline_options) # Read entities from Datastore. result = read_from_datastore(gcloud_options.project, known_args, pipeline_options) empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def run(argv=None):
  # pylint: disable=expression-not-assigned
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file pattern to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file pattern to write results to.')
  parser.add_argument('--checksum_output',
                      help='Checksum output file pattern.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | ReadFromText(known_args.input, coder=beam.coders.BytesCoder())

  # Count the occurrences of each word.
  output = (lines
            | 'split' >> beam.Map(
                lambda x: (x[:10], x[10:99])).with_output_types(
                    beam.typehints.KV[str, str])
            | 'group' >> beam.GroupByKey()
            | 'format' >> beam.FlatMap(
                lambda (key, vals): ['%s%s' % (key, val) for val in vals]))

  # Write the output using a "Write" transform that has side effects.
  output | WriteToText(known_args.output)

  # Optionally write the input and output checksums.
  if known_args.checksum_output:
    input_csum = (lines
                  | 'input-csum' >> beam.Map(crc32line)
                  | 'combine-input-csum' >> beam.CombineGlobally(sum)
                  | 'hex-format' >> beam.Map(lambda x: '%x' % x))
    input_csum | 'write-input-csum' >> WriteToText(
        known_args.checksum_output + '-input')

    output_csum = (output
                   | 'output-csum' >> beam.Map(crc32line)
                   | 'combine-output-csum' >> beam.CombineGlobally(sum)
                   | 'hex-format-output' >> beam.Map(lambda x: '%x' % x))
    output_csum | 'write-output-csum' >> WriteToText(
        known_args.checksum_output + '-output')

  # Actually run the pipeline (all operations above are deferred).
  return p.run()
def run(argv=None): """Runs the workflow counting the long words and short words separately.""" parser = argparse.ArgumentParser() parser.add_argument('--input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', required=True, help='Output prefix for files to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) lines = p | ReadFromText(known_args.input) # with_outputs allows accessing the side outputs of a DoFn. split_lines_result = (lines | beam.ParDo(SplitLinesToWordsFn()).with_outputs( SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS, SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT, main='words')) # split_lines_result is an object of type DoOutputsTuple. It supports # accessing result in alternative ways. words, _, _ = split_lines_result short_words = split_lines_result[ SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS] character_count = split_lines_result.tag_character_count # pylint: disable=expression-not-assigned (character_count | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x)) | beam.GroupByKey() | 'count chars' >> beam.Map(lambda (_, counts): sum(counts)) | 'write chars' >> WriteToText(known_args.output + '-chars')) # pylint: disable=expression-not-assigned (short_words | 'count short words' >> CountWords() | 'write short words' >> WriteToText( known_args.output + '-short-words')) # pylint: disable=expression-not-assigned (words | 'count words' >> CountWords() | 'write words' >> WriteToText(known_args.output + '-words')) return p.run()
def test_with_main_session(self):
  staging_dir = tempfile.mkdtemp()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = True
  self.update_options(options)

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
def test_requirements_file_not_present(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'
    dependency.stage_job_resources(
        options, populate_requirements_cache=self.populate_requirements_cache)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--requirements_file command line option.' % 'nosuchfile')
def test_sdk_location_gcs(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
  self.override_file_copy(sdk_location, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_with_extra_packages_missing_files(self):
  staging_dir = tempfile.mkdtemp()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def test_sdk_location_local_not_present(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = 'nosuchdir'
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    dependency.stage_job_resources(options)
  self.assertEqual(
      'The file "%s" cannot be found. Its '
      'location was specified by the --sdk_location command-line option.' %
      sdk_location,
      cm.exception.message)
def test_setup_file_not_present(self):
  staging_dir = tempfile.mkdtemp()

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = 'nosuchfile'

  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--setup_file command line option.' % 'nosuchfile')
def test_experiments(self):
  options = PipelineOptions(['--experiment', 'abc', '--experiment', 'def'])
  self.assertEqual(
      sorted(options.get_all_options()['experiments']), ['abc', 'def'])

  options = PipelineOptions(['--experiments', 'abc', '--experiments', 'def'])
  self.assertEqual(
      sorted(options.get_all_options()['experiments']), ['abc', 'def'])

  options = PipelineOptions(flags=[''])
  self.assertEqual(options.get_all_options()['experiments'], None)
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--kind', dest='kind', required=True, help='Datastore Kind') parser.add_argument('--namespace', dest='namespace', help='Datastore Namespace') parser.add_argument('--ancestor', dest='ancestor', default='root', help='The ancestor key name for all entities.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--read_only', action='store_true', help='Read an existing dataset, do not write first') parser.add_argument('--num_shards', dest='num_shards', type=int, # If the system should choose automatically. default=0, help='Number of output shards') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) # Write to Datastore if `read_only` options is not specified. if not known_args.read_only: write_to_datastore(gcloud_options.project, known_args, pipeline_options) # Read entities from Datastore. result = read_from_datastore(gcloud_options.project, known_args, pipeline_options) result.metrics().query()
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.utils.pipeline_options import GoogleCloudOptions
  from apache_beam.utils.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" class WordcountOptions(PipelineOptions): @classmethod def _add_argparse_args(cls, parser): parser.add_value_provider_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_value_provider_argument( '--output', dest='output', required=True, help='Output file to write results to.') pipeline_options = PipelineOptions(argv) wordcount_options = pipeline_options.view_as(WordcountOptions) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(wordcount_options.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(wordcount_options.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed)
def test_redefine_options(self):

  class TestRedefinedOptions(PipelineOptions):  # pylint: disable=unused-variable

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--redefined_flag', action='store_true')

  class TestRedefinedOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--redefined_flag', action='store_true')

  options = PipelineOptions(['--redefined_flag'])
  self.assertTrue(options.get_all_options()['redefined_flag'])
def test_full_completion(self):
  # Create dummy file and close it. Note that we need to do this because
  # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
  # before the temporary file is closed.
  dummy_file = tempfile.NamedTemporaryFile(delete=False)
  dummy_file_name = dummy_file.name
  dummy_file.close()

  dummy_dir = tempfile.mkdtemp()

  remote_runner = DataflowRunner()
  pipeline = Pipeline(remote_runner,
                      options=PipelineOptions([
                          '--dataflow_endpoint=ignored',
                          '--sdk_location=' + dummy_file_name,
                          '--job_name=test-job',
                          '--project=test-project',
                          '--staging_location=' + dummy_dir,
                          '--temp_location=/dev/null',
                          '--template_location=' + dummy_file_name,
                          '--no_auth=True']))

  pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
  pipeline.run().wait_until_finish()
  with open(dummy_file_name) as template_file:
    saved_job_dict = json.load(template_file)
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['project'], 'test-project')
    self.assertEqual(
        saved_job_dict['environment']['sdkPipelineOptions']['options']
        ['job_name'], 'test-job')
def test_dataflow_job_file_and_template_location_mutually_exclusive(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions(
      ['--template_location', 'abc', '--dataflow_job_file', 'def'])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertTrue(errors)
def run(in_args=None): """Runs the pre-processing pipeline.""" pipeline_options = PipelineOptions.from_dictionary(vars(in_args)) p = beam.Pipeline(options=pipeline_options) configure_pipeline(p, in_args) p.run()
def test_streaming(self):
  pipeline_options = PipelineOptions(['--streaming'])
  runner = MockRunners.TestDataflowRunner()
  validator = PipelineOptionsValidator(pipeline_options, runner)
  errors = validator.validate()
  self.assertIn('Streaming pipelines are not supported.', errors)
def test_redefine_options(self):

  class TestRedefinedOptions(PipelineOptions):  # pylint: disable=unused-variable

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--redefined_flag', action='store_true')

  class TestRedefinedOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--redefined_flag', action='store_true')

  options = PipelineOptions(['--redefined_flag'])
  self.assertEqual(options.get_all_options()['redefined_flag'], True)
def run(argv=None): """Main entry point; defines and runs the user_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', dest='input', default='gs://dataflow-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--dataset', dest='dataset', required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument( '--table_name', dest='table_name', default='user_score', help='The BigQuery table name. Should not already exist.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) p = beam.Pipeline(options=pipeline_options) (p # pylint: disable=expression-not-assigned | ReadFromText( known_args.input) # Read events from a file and parse them. | UserScore() | WriteToBigQuery(known_args.table_name, known_args.dataset, configure_bigquery_write())) result = p.run() result.wait_until_finish()
def test_with_extra_packages_invalid_file_name(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'abc.tgz'), 'nothing')
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tgz')]
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The --extra_package option expects a full path ending with ".tar" or '
      '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
def test_sdk_location_default(self):
  staging_dir = tempfile.mkdtemp()
  expected_from_url = 'pypi'
  expected_from_path = self.override_pypi_download(
      expected_from_url, staging_dir)
  self.override_file_copy(expected_from_path, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = 'default'

  self.assertEqual(
      [names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(
          options,
          file_copy=dependency._dependency_file_copy))
def __init__(self,
             runner=None,
             options=None,
             argv=None,
             is_integration_test=False,
             blocking=True):
  """Initialize a pipeline object for test.

  Args:
    runner: An object of type 'PipelineRunner' that will be used to execute
      the pipeline. For registered runners, the runner name can be specified,
      otherwise a runner object must be supplied.
    options: A configured 'PipelineOptions' object containing arguments that
      should be used for running the pipeline job.
    argv: A list of arguments (such as sys.argv) to be used for building a
      'PipelineOptions' object. This will only be used if argument 'options'
      is None.
    is_integration_test: True if the test is an integration test, False
      otherwise.
    blocking: Run method will wait until pipeline execution is completed.

  Raises:
    ValueError: if either the runner or options argument is not of the
      expected type.
  """
  self.is_integration_test = is_integration_test
  self.options_list = self._parse_test_option_args(argv)
  self.blocking = blocking
  if options is None:
    options = PipelineOptions(self.options_list)
  super(TestPipeline, self).__init__(runner, options)
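As the docstring above describes, a TestPipeline can be handed a pre-built PipelineOptions object, or it can fall back to options built from parsed test arguments. A minimal usage sketch follows; the import path and the DirectRunner flag reflect this SDK vintage and are assumptions (in later releases TestPipeline moved to apache_beam.testing.test_pipeline).

# Sketch: constructing and using a TestPipeline with explicit options.
import apache_beam as beam
from apache_beam.test_pipeline import TestPipeline  # path may differ by SDK version
from apache_beam.utils.pipeline_options import PipelineOptions

# Pass a pre-built PipelineOptions object; without options/argv the
# constructor builds PipelineOptions from the parsed test arguments instead.
p = TestPipeline(options=PipelineOptions(['--runner=DirectRunner']))

# The object is then used like a regular beam.Pipeline.
p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2)  # pylint: disable=expression-not-assigned
p.run()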
def model_textio(renames):
  """Using a Read and Write transform to read/write text files."""
  def filter_words(x):
    import re
    return re.findall(r'[A-Za-z\']+', x)

  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START model_textio_read]
  p = beam.Pipeline(options=PipelineOptions())
  # [START model_pipelineio_read]
  lines = p | 'ReadFromText' >> beam.io.ReadFromText('path/to/input-*.csv')
  # [END model_pipelineio_read]
  # [END model_textio_read]

  # [START model_textio_write]
  filtered_words = lines | 'FilterWords' >> beam.FlatMap(filter_words)
  # [START model_pipelineio_write]
  filtered_words | 'WriteToText' >> beam.io.WriteToText(
      '/path/to/numbers', file_name_suffix='.csv')
  # [END model_pipelineio_write]
  # [END model_textio_write]

  p.visit(SnippetUtils.RenameFiles(renames))
  p.run().wait_until_finish()
def test_table_schema_without_project(self):
  # Writer should pick executing project by default.
  sink = beam.io.BigQuerySink(table='mydataset.mytable')
  options = PipelineOptions(flags=['--project', 'myproject'])
  sink.pipeline_options = options
  writer = sink.writer()
  self.assertEquals('myproject', writer.project_id)
def test_bad_types(self):
  # [START type_hints_missing_define_numbers]
  p = TestPipeline(options=PipelineOptions(pipeline_type_check=True))

  numbers = p | beam.Create(['1', '2', '3'])
  # [END type_hints_missing_define_numbers]

  # Consider the following code.
  # pylint: disable=expression-not-assigned
  # pylint: disable=unused-variable
  # [START type_hints_missing_apply]
  evens = numbers | beam.Filter(lambda x: x % 2 == 0)
  # [END type_hints_missing_apply]

  # Now suppose numbers was defined as [snippet above].
  # When running this pipeline, you'd get a runtime error,
  # possibly on a remote machine, possibly very late.
  with self.assertRaises(TypeError):
    p.run()

  # To catch this early, we can assert what types we expect.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_takes]
    evens = numbers | beam.Filter(
        lambda x: x % 2 == 0).with_input_types(int)
    # [END type_hints_takes]

  # Type hints can be declared on DoFns and callables as well, rather
  # than where they're used, to be more self contained.
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_do_fn]
    @beam.typehints.with_input_types(int)
    class FilterEvensDoFn(beam.DoFn):
      def process(self, element):
        if element % 2 == 0:
          yield element
    evens = numbers | beam.ParDo(FilterEvensDoFn())
    # [END type_hints_do_fn]

  words = p | 'words' >> beam.Create(['a', 'bb', 'c'])

  # One can assert outputs and apply them to transforms as well.
  # Helps document the contract and checks it at pipeline construction time.
  # [START type_hints_transform]
  T = beam.typehints.TypeVariable('T')

  @beam.typehints.with_input_types(T)
  @beam.typehints.with_output_types(beam.typehints.Tuple[int, T])
  class MyTransform(beam.PTransform):
    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: (len(x), x))

  words_with_lens = words | MyTransform()
  # [END type_hints_transform]

  # pylint: disable=expression-not-assigned
  with self.assertRaises(typehints.TypeCheckError):
    words_with_lens | beam.Map(lambda x: x).with_input_types(
        beam.typehints.Tuple[int, int])
def test_create_list_display_data(self):
  flags = ['--extra_package', 'package1', '--extra_package', 'package2']
  pipeline_options = PipelineOptions(flags=flags)
  items = DisplayData.create_from_options(pipeline_options).items
  hc.assert_that(items, hc.contains_inanyorder(
      DisplayDataItemMatcher(
          'extra_packages',
          str(['package1', 'package2']))))
def run(argv=None):
  known_args, pipeline_args = get_args(argv)
  options = PipelineOptions(pipeline_args)

  run_count1(known_args, options)
  run_count2(known_args, options)
  run_count3(known_args, options)
def run(argv=None): """Runs the debugging wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection, count the occurrences of # each word and filter by a list of words. filtered_words = ( p | 'read' >> ReadFromText(known_args.input) | CountWords() | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) # assert_that is a convenient PTransform that checks a PCollection has an # expected value. Asserts are best used in unit tests with small data sets but # is demonstrated here as a teaching tool. # # Note assert_that does not provide any output and that successful completion # of the Pipeline implies that the expectations were met. Learn more at # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to # test your pipeline. beam.assert_that(filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)])) # Format the counts into a PCollection of strings and write the output using a # "Write" transform that has side effects. # pylint: disable=unused-variable output = (filtered_words | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) | 'write' >> WriteToText(known_args.output)) # Actually run the pipeline (all operations above are deferred). p.run().wait_until_finish()
def test_with_extra_packages(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'xyz2.tar'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'whl.whl'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).extra_packages = [
      os.path.join(source_dir, 'abc.tar.gz'),
      os.path.join(source_dir, 'xyz.tar.gz'),
      os.path.join(source_dir, 'xyz2.tar'),
      os.path.join(source_dir, 'whl.whl'),
      'gs://my-gcs-bucket/gcs.tar.gz']

  gcs_copied_files = []

  def file_copy(from_path, to_path):
    if from_path.startswith('gs://'):
      gcs_copied_files.append(from_path)
      _, from_name = os.path.split(from_path)
      self.create_temp_file(os.path.join(to_path, from_name), 'nothing')
      logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
    elif to_path.startswith('gs://'):
      logging.info('Faking file_copy(%s, %s)', from_path, to_path)
    else:
      shutil.copyfile(from_path, to_path)

  dependency._dependency_file_copy = file_copy

  self.assertEqual(
      ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
       dependency.EXTRA_PACKAGES_FILE],
      dependency.stage_job_resources(options))
  with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
    self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                      'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
  self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
def run(argv=None): """Runs the debugging wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection, count the occurrences of # each word and filter by a list of words. filtered_words = ( p | 'read' >> ReadFromText(known_args.input) | CountWords() | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach'))) # assert_that is a convenient PTransform that checks a PCollection has an # expected value. Asserts are best used in unit tests with small data sets but # is demonstrated here as a teaching tool. # # Note assert_that does not provide any output and that successful completion # of the Pipeline implies that the expectations were met. Learn more at # https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to # test your pipeline. beam.assert_that( filtered_words, beam.equal_to([('Flourish', 3), ('stomach', 1)])) # Format the counts into a PCollection of strings and write the output using a # "Write" transform that has side effects. # pylint: disable=unused-variable output = (filtered_words | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) | 'write' >> WriteToText(known_args.output)) # Actually run the pipeline (all operations above are deferred). p.run().wait_until_finish()
def test_validate_dataflow_job_file(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions(['--dataflow_job_file', 'abc'])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertFalse(errors)
def test_table_schema_without_project(self):
  # Reader should pick executing project by default.
  source = beam.io.BigQuerySource(table='mydataset.mytable')
  options = PipelineOptions(flags=['--project', 'myproject'])
  source.pipeline_options = options
  reader = source.reader()
  self.assertEquals('SELECT * FROM [myproject:mydataset.mytable];',
                    reader.query)
def test_validate_template_location(self):
  runner = MockRunners.OtherRunner()
  options = PipelineOptions(['--template_location', 'abc'])
  validator = PipelineOptionsValidator(options, runner)
  errors = validator.validate()
  self.assertFalse(errors)
def test_runtime_checks_on(self):
  # pylint: disable=expression-not-assigned
  p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
  with self.assertRaises(typehints.TypeCheckError):
    # [START type_hints_runtime_on]
    p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
    p.run()
def model_bigqueryio():
  """Using a Read and Write transform to read/write to BigQuery."""
  import apache_beam as beam
  from apache_beam.utils.pipeline_options import PipelineOptions

  # [START model_bigqueryio_read]
  p = beam.Pipeline(options=PipelineOptions())
  weather_data = p | beam.io.Read(
      'ReadWeatherStations',
      beam.io.BigQuerySource(
          'clouddataflow-readonly:samples.weather_stations'))
  # [END model_bigqueryio_read]

  # [START model_bigqueryio_query]
  p = beam.Pipeline(options=PipelineOptions())
  weather_data = p | beam.io.Read(
      'ReadYearAndTemp',
      beam.io.BigQuerySource(
          query='SELECT year, mean_temp FROM samples.weather_stations'))
  # [END model_bigqueryio_query]

  # [START model_bigqueryio_query_standard_sql]
  p = beam.Pipeline(options=PipelineOptions())
  weather_data = p | beam.io.Read(
      'ReadYearAndTemp',
      beam.io.BigQuerySource(
          query='SELECT year, mean_temp FROM `samples.weather_stations`',
          use_standard_sql=True))
  # [END model_bigqueryio_query_standard_sql]

  # [START model_bigqueryio_schema]
  schema = 'source:STRING, quote:STRING'
  # [END model_bigqueryio_schema]

  # [START model_bigqueryio_write]
  quotes = p | beam.Create(
      [{'source': 'Mahatma Gandhi', 'quote': 'My life is my message.'}])
  quotes | beam.io.Write(
      'Write', beam.io.BigQuerySink(
          'my-project:output.output_table',
          schema=schema,
          write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
          create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))) # Format the counts into a PCollection of strings. output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c)) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) # Actually run the pipeline (all operations above are deferred). result = p.run() result.wait_until_finish() empty_line_values = result.aggregated_values(empty_line_aggregator) logging.info('number of empty lines: %d', sum(empty_line_values.values())) word_length_values = result.aggregated_values(average_word_size_aggregator) logging.info('average word lengths: %s', word_length_values.values())
def test_sdk_location_local(self):
  staging_dir = tempfile.mkdtemp()
  sdk_location = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
      'contents')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
  tarball_path = os.path.join(
      staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
  with open(tarball_path) as f:
    self.assertEqual(f.read(), 'contents')