def test_with_setup_file(self):
  staging_dir = self.make_temp_dir()
  source_dir = self.make_temp_dir()
  self.create_temp_file(
      os.path.join(source_dir, 'setup.py'), 'notused')
  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = os.path.join(
      source_dir, 'setup.py')

  self.assertEqual(
      [dependency.WORKFLOW_TARBALL_FILE],
      dependency.stage_job_resources(
          options,
          # We replace the build setup command because a realistic one would
          # require the setuptools package to be installed. Note that we can't
          # use "touch" here to create the expected output tarball file, since
          # touch is not available on Windows, so we invoke python to produce
          # equivalent behavior.
          build_setup_args=[
              'python', '-c', 'open(__import__("sys").argv[1], "a")',
              os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
          temp_dir=source_dir))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
def test_write_messages_unsupported_features(self, mock_pubsub):
  data = b'data'
  attributes = {'key': 'value'}
  payloads = [PubsubMessage(data, attributes)]

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  _ = (p
       | Create(payloads)
       | WriteToPubSub('projects/fakeprj/topics/a_topic',
                       id_label='a_label'))
  with self.assertRaisesRegexp(NotImplementedError,
                               r'id_label is not supported'):
    p.run()

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  _ = (p
       | Create(payloads)
       | WriteToPubSub('projects/fakeprj/topics/a_topic',
                       timestamp_attribute='timestamp'))
  with self.assertRaisesRegexp(NotImplementedError,
                               r'timestamp_attribute is not supported'):
    p.run()
def test_with_requirements_file(self):
  try:
    staging_dir = tempfile.mkdtemp()
    requirements_cache_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
    options.view_as(SetupOptions).requirements_file = os.path.join(
        source_dir, dependency.REQUIREMENTS_FILE)
    self.create_temp_file(
        os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
    self.assertEqual(
        sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']),
        sorted(dependency.stage_job_resources(
            options,
            populate_requirements_cache=self.populate_requirements_cache)))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
    self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
  finally:
    shutil.rmtree(staging_dir)
    shutil.rmtree(requirements_cache_dir)
    shutil.rmtree(source_dir)
def test_sdk_location_http(self):
  staging_dir = self.make_temp_dir()
  sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  def file_download(_, to_folder):
    tarball_path = os.path.join(to_folder, 'sdk-tarball')
    with open(tarball_path, 'w') as f:
      f.write('Package content.')
    return tarball_path

  with mock.patch('apache_beam.runners.dataflow.internal.'
                  'dependency._dependency_file_download', file_download):
    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))

  tarball_path = os.path.join(
      staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
  with open(tarball_path) as f:
    self.assertEqual(f.read(), 'Package content.')
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument( '--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True p = beam.Pipeline(options=pipeline_options) # Read from PubSub into a PCollection. if known_args.input_subscription: lines = p | beam.io.ReadStringsFromPubSub( subscription=known_args.input_subscription) else: lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()) .with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(15, 0)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = counts | 'format' >> beam.Map(format_result) # Write to PubSub. # pylint: disable=expression-not-assigned output | beam.io.WriteStringsToPubSub(known_args.output_topic) result = p.run() result.wait_until_finish()
def examples_wordcount_minimal(renames):
  """MinimalWordCount example snippets."""
  import re

  import apache_beam as beam
  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START examples_wordcount_minimal_options]
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
  google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'
  # [END examples_wordcount_minimal_options]

  # Run it locally for testing.
  options = PipelineOptions()

  # [START examples_wordcount_minimal_create]
  p = beam.Pipeline(options=options)
  # [END examples_wordcount_minimal_create]

  (
      # [START examples_wordcount_minimal_read]
      p | beam.io.ReadFromText(
          'gs://dataflow-samples/shakespeare/kinglear.txt')
      # [END examples_wordcount_minimal_read]

      # [START examples_wordcount_minimal_pardo]
      | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
      # [END examples_wordcount_minimal_pardo]

      # [START examples_wordcount_minimal_count]
      | beam.combiners.Count.PerElement()
      # [END examples_wordcount_minimal_count]

      # [START examples_wordcount_minimal_map]
      | beam.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
      # [END examples_wordcount_minimal_map]

      # [START examples_wordcount_minimal_write]
      | beam.io.WriteToText('gs://my-bucket/counts.txt')
      # [END examples_wordcount_minimal_write]
  )

  p.visit(SnippetUtils.RenameFiles(renames))

  # [START examples_wordcount_minimal_run]
  result = p.run()
  # [END examples_wordcount_minimal_run]
  result.wait_until_finish()
def test_get_all_options(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    self.assertDictContainsSubset(case['expected'],
                                  options.get_all_options())
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
        case['expected']['mock_flag'])
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_option,
        case['expected']['mock_option'])
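# The test above is driven by a TEST_CASES table defined elsewhere in the test
# module: each entry pairs a list of command-line flags with the option values
# they should produce. The entry below is a hypothetical sketch for
# illustration only, not the actual table.
TEST_CASES_EXAMPLE = [
    {'flags': ['--mock_flag', '--mock_option', 'abc'],
     'expected': {'mock_flag': True, 'mock_option': 'abc'}},
]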
def test_unknown_option_prefix(self):
  # Test that the "ambiguous option" error is suppressed.
  options = PipelineOptions(['--profi', 'val1'])
  options.view_as(ProfilingOptions)

  # Test that valid errors are not suppressed.
  with self.assertRaises(SystemExit):
    # Invalid option choice.
    options = PipelineOptions(['--type_check_strictness', 'blahblah'])
    options.view_as(TypeOptions)
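# For context on the "ambiguous option" case above: argparse accepts
# abbreviated long options, and an abbreviation that matches more than one
# flag normally triggers an "ambiguous option" error (a SystemExit). The
# standalone sketch below reproduces that behavior with hypothetical flag
# names; it is not part of the test suite.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--profile_cpu', action='store_true')
parser.add_argument('--profile_memory', action='store_true')
try:
  parser.parse_args(['--profi'])  # abbreviation matches both flags
except SystemExit:
  print('argparse rejected the ambiguous abbreviation "--profi"')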
def test_no_main_session(self):
  staging_dir = self.make_temp_dir()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = False
  self.update_options(options)

  self.assertEqual(
      [],
      dependency.stage_job_resources(options))
def test_sdk_location_gcs(self):
  staging_dir = self.make_temp_dir()
  sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
  self.override_file_copy(sdk_location, staging_dir)

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  self.assertEqual(
      [names.DATAFLOW_SDK_TARBALL_FILE],
      dependency.stage_job_resources(options))
def test_requirements_file_not_present(self):
  staging_dir = self.make_temp_dir()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).requirements_file = 'nosuchfile'
    dependency.stage_job_resources(
        options, populate_requirements_cache=self.populate_requirements_cache)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--requirements_file command line option.' % 'nosuchfile')
def run_pipeline(argv, with_attributes, id_label, timestamp_attribute):
  """Build and run the pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topic/<TOPIC>".'))
  parser.add_argument(
      '--input_subscription', required=True,
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Read from PubSub into a PCollection.
  messages = p | beam.io.ReadFromPubSub(
      subscription=known_args.input_subscription,
      id_label=id_label,
      with_attributes=with_attributes,
      timestamp_attribute=timestamp_attribute)

  def add_attribute(msg, timestamp=beam.DoFn.TimestampParam):
    msg.data += '-seen'
    msg.attributes['processed'] = 'IT'
    if timestamp_attribute in msg.attributes:
      msg.attributes[timestamp_attribute + '_out'] = timestamp.to_rfc3339()
    return msg

  def modify_data(data):
    return data + '-seen'

  if with_attributes:
    output = messages | 'add_attribute' >> beam.Map(add_attribute)
  else:
    output = messages | 'modify_data' >> beam.Map(modify_data)

  # Write to PubSub.
  _ = output | beam.io.WriteToPubSub(known_args.output_topic,
                                     id_label=id_label,
                                     with_attributes=with_attributes,
                                     timestamp_attribute=timestamp_attribute)

  result = p.run()
  result.wait_until_finish()
def test_with_main_session(self):
  staging_dir = self.make_temp_dir()
  options = PipelineOptions()

  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  options.view_as(SetupOptions).save_main_session = True
  self.update_options(options)

  self.assertEqual(
      [names.PICKLED_MAIN_SESSION_FILE],
      dependency.stage_job_resources(options))
  self.assertTrue(
      os.path.isfile(
          os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
def test_setup_file_not_present(self):
  staging_dir = tempfile.mkdtemp()

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).setup_file = 'nosuchfile'

  with self.assertRaises(RuntimeError) as cm:
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--setup_file command line option.' % 'nosuchfile')
def test_with_extra_packages_missing_files(self):
  staging_dir = self.make_temp_dir()
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The file %s cannot be found. It was specified in the '
      '--extra_packages command line option.' % 'nosuchfile.tar.gz')
def test_sdk_location_gcs_source_file(self):
  staging_dir = self.make_temp_dir()
  sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  with mock.patch('apache_beam.runners.dataflow.internal.'
                  'dependency._dependency_file_copy'):
    self.assertEqual(
        [names.DATAFLOW_SDK_TARBALL_FILE],
        dependency.stage_job_resources(options))
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')
  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
def test_with_extra_packages(self):
  staging_dir = self.make_temp_dir()
  source_dir = self.make_temp_dir()
  self.create_temp_file(
      os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'xyz2.tar'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, 'whl.whl'), 'nothing')
  self.create_temp_file(
      os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).extra_packages = [
      os.path.join(source_dir, 'abc.tar.gz'),
      os.path.join(source_dir, 'xyz.tar.gz'),
      os.path.join(source_dir, 'xyz2.tar'),
      os.path.join(source_dir, 'whl.whl'),
      'gs://my-gcs-bucket/gcs.tar.gz']

  gcs_copied_files = []

  def file_copy(from_path, to_path):
    if from_path.startswith('gs://'):
      gcs_copied_files.append(from_path)
      _, from_name = os.path.split(from_path)
      if os.path.isdir(to_path):
        to_path = os.path.join(to_path, from_name)
      self.create_temp_file(to_path, 'nothing')
      logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
    elif to_path.startswith('gs://'):
      logging.info('Faking file_copy(%s, %s)', from_path, to_path)
    else:
      shutil.copyfile(from_path, to_path)

  dependency._dependency_file_copy = file_copy

  self.assertEqual(
      ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
       dependency.EXTRA_PACKAGES_FILE],
      dependency.stage_job_resources(options))
  with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
    self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                      'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
  self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
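# Note: the test above swaps in file_copy by assigning directly to
# dependency._dependency_file_copy, which leaks the fake into later tests.
# A sketch of the same substitution scoped with mock.patch (assuming the
# module path already used by the other tests in this file) could look like:
with mock.patch('apache_beam.runners.dataflow.internal.'
                'dependency._dependency_file_copy', file_copy):
  staged = dependency.stage_job_resources(options)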
def test_sdk_location_local_not_present(self):
  staging_dir = self.make_temp_dir()
  sdk_location = 'nosuchdir'
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    dependency.stage_job_resources(options)
  self.assertEqual(
      'The file "%s" cannot be found. Its '
      'location was specified by the --sdk_location command-line option.' %
      sdk_location,
      cm.exception.message)
def test_sdk_location_gcs_wheel_file(self):
  staging_dir = self.make_temp_dir()
  sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
  sdk_location = 'gs://my-gcs-bucket/' + sdk_filename

  options = PipelineOptions()
  options.view_as(GoogleCloudOptions).staging_location = staging_dir
  self.update_options(options)
  options.view_as(SetupOptions).sdk_location = sdk_location

  with mock.patch('apache_beam.runners.dataflow.internal.'
                  'dependency._dependency_file_copy'):
    self.assertEqual(
        [sdk_filename],
        dependency.stage_job_resources(options))
def run(argv=None): """Main entry point; defines and runs the user_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', type=str, default='gs://apache-beam-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--output', type=str, required=True, help='Path to the output file(s).') args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=options) as p: def format_user_score_sums(user_score): (user, score) = user_score return 'user: %s, total_score: %s' % (user, score) (p # pylint: disable=expression-not-assigned | 'ReadInputText' >> beam.io.ReadFromText(args.input) | 'UserScore' >> UserScore() | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums) | 'WriteUserScoreSums' >> beam.io.WriteToText(args.output))
def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
  data = b'data'
  attributes = {}
  publish_time_secs = 1520861821
  publish_time_nanos = 234567000
  publish_time = '2018-03-12T13:37:01.234567Z'
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response([
      test_utils.PullResponseMessage(
          data, attributes, publish_time_secs, publish_time_nanos, ack_id)
  ])
  expected_elements = [
      TestWindowedValue(
          PubsubMessage(data, attributes),
          timestamp.Timestamp.from_rfc3339(publish_time),
          [window.GlobalWindow()]),
  ]
  mock_pubsub.return_value.pull.return_value = pull_response

  options = PipelineOptions([])
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  pcoll = (p
           | ReadFromPubSub(
               'projects/fakeprj/topics/a_topic', None, None,
               with_attributes=True, timestamp_attribute='nonexistent'))
  assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls([
      mock.call(mock.ANY, [ack_id])])
def test_model_composite_triggers(self):
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'a', 'a', 'b', 'b'])
                   .advance_watermark_to(70)
                   .add_elements([TimestampedValue('a', 10),
                                  TimestampedValue('a', 10),
                                  TimestampedValue('c', 10),
                                  TimestampedValue('c', 10)])
                   .advance_processing_time(600))
    pcollection = (p
                   | test_stream
                   | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

    counts = (
        # [START model_composite_triggers]
        pcollection | WindowInto(
            FixedWindows(1 * 60),
            trigger=AfterWatermark(
                late=AfterProcessingTime(10 * 60)),
            accumulation_mode=AccumulationMode.DISCARDING)
        # [END model_composite_triggers]
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 2), ('c', 2)]))
def test_with_extra_packages_invalid_file_name(self):
  staging_dir = tempfile.mkdtemp()
  source_dir = tempfile.mkdtemp()
  self.create_temp_file(
      os.path.join(source_dir, 'abc.tgz'), 'nothing')
  with self.assertRaises(RuntimeError) as cm:
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tgz')]
    dependency.stage_job_resources(options)
  self.assertEqual(
      cm.exception.message,
      'The --extra_package option expects a full path ending with ".tar" or '
      '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
def run(pipeline_args, input_file, output_file):
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(input_file)

  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(bytes))
            | 'count' >> beam.ExternalTransform(
                'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(output_file)

  result = p.run()
  result.wait_until_finish()
def run(argv=None): """Runs the Wikipedia top edits pipeline. Args: argv: Pipeline options as a list of arguments. """ parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/wikipedia_edits/*.json', help='Input specified as a GCS path containing a BigQuery table exported ' 'as json.') parser.add_argument('--output', required=True, help='Output file to write results to.') parser.add_argument('--sampling_threshold', type=float, default=0.1, help='Fraction of entries used for session tracking') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: (p # pylint: disable=expression-not-assigned | ReadFromText(known_args.input) | ComputeTopSessions(known_args.sampling_threshold) | WriteToText(known_args.output))
def model_pipelines(argv):
  """A wordcount snippet as a simple pipeline example."""
  # [START model_pipelines]
  import re

  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          dest='input',
                          default='gs://dataflow-samples/shakespeare/kinglear'
                          '.txt',
                          help='Input file to process.')
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  with beam.Pipeline(options=pipeline_options) as p:
    (p
     | beam.io.ReadFromText(my_options.input)
     | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | beam.Map(lambda x: (x, 1))
     | beam.combiners.Count.PerKey()
     | beam.io.WriteToText(my_options.output))
def model_pcollection(argv):
  """Creating a PCollection from data in local memory."""
  from apache_beam.options.pipeline_options import PipelineOptions

  class MyOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--output',
                          dest='output',
                          required=True,
                          help='Output file to write results to.')

  pipeline_options = PipelineOptions(argv)
  my_options = pipeline_options.view_as(MyOptions)

  # [START model_pcollection]
  with beam.Pipeline(options=pipeline_options) as p:
    lines = (p
             | beam.Create([
                 'To be, or not to be: that is the question: ',
                 'Whether \'tis nobler in the mind to suffer ',
                 'The slings and arrows of outrageous fortune, ',
                 'Or to take arms against a sea of troubles, ']))
    # [END model_pcollection]
    (lines
     | beam.io.WriteToText(my_options.output))
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""
  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):

    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input',
                          help='Input for the pipeline',
                          default='gs://my-bucket/input')
      parser.add_argument('--output',
                          help='output for the pipeline',
                          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):

    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):

    def expand(self, pcoll):
      return (pcoll
              # Convert lines of text into individual words.
              | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
              # Count the number of times each word occurs.
              | beam.combiners.Count.PerElement()
              # Format each word and count into a printable string.
              | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))
  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  with TestPipeline() as p:  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (p
     # Read the lines of the input text.
     | 'ReadLines' >> beam.io.ReadFromText(options.input)
     # Count the words.
     | CountWords()
     # Write the formatted word counts to output.
     | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input_topic', required=True, help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) parser.add_argument( '--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) known_args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=options) as p: # Read from PubSub into a PCollection. lines = p | beam.io.ReadStringsFromPubSub(known_args.input_topic) # Capitalize the characters in each line. transformed = (lines # Use a pre-defined function that imports the re package. | 'Split' >> ( beam.FlatMap(split_fn).with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(15, 0)) | 'Group' >> beam.GroupByKey() | 'Count' >> beam.Map(lambda (word, ones): (word, sum(ones))) | 'Format' >> beam.Map(lambda tup: '%s: %d' % tup)) # Write to PubSub. # pylint: disable=expression-not-assigned transformed | beam.io.WriteStringsToPubSub(known_args.output_topic)
class NexmarkLauncher(object):

  def __init__(self):
    self.parse_args()
    self.uuid = str(uuid.uuid4())
    self.topic_name = self.args.topic_name + self.uuid
    self.subscription_name = self.args.subscription_name + self.uuid

  def parse_args(self):
    parser = argparse.ArgumentParser()

    parser.add_argument('--query', '-q',
                        type=int,
                        action='append',
                        required=True,
                        choices=[0, 1, 2],
                        help='Query to run')

    parser.add_argument('--subscription_name',
                        type=str,
                        help='Pub/Sub subscription to read from')

    parser.add_argument('--topic_name',
                        type=str,
                        help='Pub/Sub topic to read from')

    parser.add_argument('--loglevel',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR',
                                 'CRITICAL'],
                        default='INFO',
                        help='Set logging level to debug')

    parser.add_argument('--input',
                        type=str,
                        required=True,
                        help='Path to the data file containing nexmark events.')

    self.args, self.pipeline_args = parser.parse_known_args()
    logging.basicConfig(level=getattr(logging, self.args.loglevel, None),
                        format='(%(threadName)-10s) %(message)s')

    self.pipeline_options = PipelineOptions(self.pipeline_args)
    logging.debug('args, pipeline_args: %s, %s', self.args, self.pipeline_args)

    # Usage with Dataflow requires a project to be supplied.
    self.project = self.pipeline_options.view_as(GoogleCloudOptions).project
    if self.project is None:
      parser.print_usage()
      print(sys.argv[0] + ': error: argument --project is required')
      sys.exit(1)

    # Pub/Sub is currently available for use only in streaming pipelines.
    self.streaming = self.pipeline_options.view_as(StandardOptions).streaming
    if self.streaming is None:
      parser.print_usage()
      print(sys.argv[0] + ': error: argument --streaming is required')
      sys.exit(1)

    # wait_until_finish ensures that the streaming job is canceled.
    self.wait_until_finish_duration = (
        self.pipeline_options.view_as(TestOptions).wait_until_finish_duration)
    if self.wait_until_finish_duration is None:
      parser.print_usage()
      print(sys.argv[0] + ': error: argument --wait_until_finish_duration is required')  # pylint: disable=line-too-long
      sys.exit(1)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    self.pipeline_options.view_as(SetupOptions).save_main_session = True

  def generate_events(self):
    publish_client = pubsub.Client(project=self.project)
    topic = publish_client.topic(self.topic_name)
    if topic.exists():
      topic.delete()
    topic.create()
    sub = topic.subscription(self.subscription_name)
    if sub.exists():
      sub.delete()
    sub.create()

    logging.info('Generating auction events to topic %s', topic.name)

    if self.args.input.startswith('gs://'):
      from apache_beam.io.gcp.gcsfilesystem import GCSFileSystem
      fs = GCSFileSystem(self.pipeline_options)
      with fs.open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)
    else:
      with open(self.args.input) as infile:
        for line in infile:
          topic.publish(line)

    logging.info('Finished event generation.')

    # Read from PubSub into a PCollection.
    if self.args.subscription_name:
      raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=sub.full_name)
    else:
      raw_events = self.pipeline | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          topic=topic.full_name)

    return raw_events

  def run_query(self, query, query_errors):
    try:
      self.pipeline = beam.Pipeline(options=self.pipeline_options)
      raw_events = self.generate_events()
      query.load(raw_events)
      result = self.pipeline.run()
      job_duration = (
          self.pipeline_options.view_as(TestOptions).wait_until_finish_duration)
      if self.pipeline_options.view_as(StandardOptions).runner == 'DataflowRunner':  # pylint: disable=line-too-long
        result.wait_until_finish(duration=job_duration)
        result.cancel()
      else:
        result.wait_until_finish()
    except Exception as exc:
      query_errors.append(str(exc))
      raise

  def cleanup(self):
    publish_client = pubsub.Client(project=self.project)
    topic = publish_client.topic(self.topic_name)
    if topic.exists():
      topic.delete()
    sub = topic.subscription(self.subscription_name)
    if sub.exists():
      sub.delete()

  def run(self):
    queries = {
        0: query0,
        # TODO(mariagh): Add more queries.
    }

    query_errors = []
    for i in self.args.query:
      self.parse_args()
      logging.info('Running query %d', i)

      # The DirectRunner is the default runner, and it needs
      # special handling to cancel streaming jobs.
      launch_from_direct_runner = self.pipeline_options.view_as(
          StandardOptions).runner in [None, 'DirectRunner']

      if launch_from_direct_runner:
        command = Command(self.run_query, args=[queries[i], query_errors])
        query_duration = self.pipeline_options.view_as(TestOptions).wait_until_finish_duration  # pylint: disable=line-too-long
        command.run(timeout=query_duration // 1000)
      else:
        try:
          self.run_query(queries[i], query_errors=None)
        except Exception as exc:
          query_errors.append(exc)

    if query_errors:
      logging.error('Query failed with %s', ', '.join(query_errors))
    else:
      logging.info('Queries run: %s', self.args.query)
class Pipeline(object):
  """A pipeline object that manages a DAG of PValues and their PTransforms.

  Conceptually the PValues are the DAG's nodes and the PTransforms computing
  the PValues are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If same transform instance needs to be applied then a clone should be
  created with a new label (e.g., transform.clone('new label')).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner: An object of type 'PipelineRunner' that will be used to execute
        the pipeline. For registered runners, the runner name can be
        specified, otherwise a runner object must be supplied.
      options: A configured 'PipelineOptions' object containing arguments
        that should be used for running the Dataflow job.
      argv: a list of arguments (such as sys.argv) to be used for building a
        'PipelineOptions' object. This will only be used if argument 'options'
        is None.

    Raises:
      ValueError: if either the runner or options argument is not of the
        expected type.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r', options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r',
            argv)
    else:
      self._options = PipelineOptions([])

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner must be a PipelineRunner object or the '
                      'name of a registered runner.')

    # Validate pipeline options
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validations errors: \n' + '\n'.join(errors))

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()

  @property
  @deprecated(since='First stable release',
              extra_message='References to <pipeline>.options'
              ' will not be supported')
  def options(self):
    return self._options

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def run(self, test_runner_api=True):
    """Runs the pipeline. Returns whatever our runner returns after running."""

    # When possible, invoke a round trip through the runner API.
    if test_runner_api and self._verify_runner_api_compatible():
      return Pipeline.from_runner_api(
          self.to_runner_api(), self.runner, self._options).run(False)

    if self._options.view_as(SetupOptions).save_main_session:
      # If this option is chosen, verify we can pickle the main session early.
      tmpdir = tempfile.mkdtemp()
      try:
        pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
      finally:
        shutil.rmtree(tmpdir)
    return self.runner.run(self)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    if not exc_type:
      self.run().wait_until_finish()

  def visit(self, visitor):
    """Visits depth-first every node of a pipeline's DAG.

    Args:
      visitor: PipelineVisitor object whose callbacks will be called for each
        node visited. See PipelineVisitor comments.

    Raises:
      TypeError: if node is specified and is not a PValue.
      pipeline.PipelineError: if node is specified and does not belong to
        this pipeline instance.
    """
    visited = set()
    self._root_transform().visit(visitor, self, visited)

  def apply(self, transform, pvalueish=None, label=None):
    """Applies a custom transform using the pvalueish specified.

    Args:
      transform: the PTranform to apply.
      pvalueish: the input for the PTransform (typically a PCollection).
      label: label of the PTransform.

    Raises:
      TypeError: if the transform object extracted from the argument list is
        not a PTransform.
      RuntimeError: if the transform object was already applied to this
        pipeline and needs to be cloned in order to apply again.
    """
    if isinstance(transform, ptransform._NamedPTransform):
      return self.apply(transform.transform, pvalueish,
                        label or transform.label)

    if not isinstance(transform, ptransform.PTransform):
      raise TypeError("Expected a PTransform object, got %s" % transform)

    if label:
      # Fix self.label as it is inspected by some PTransform operations
      # (e.g. to produce error messages for type hint violations).
      try:
        old_label, transform.label = transform.label, label
        return self.apply(transform, pvalueish)
      finally:
        transform.label = old_label

    full_label = '/'.join([self._current_transform().full_label,
                           label or transform.label]).lstrip('/')
    if full_label in self.applied_labels:
      raise RuntimeError(
          'Transform "%s" does not have a stable unique label. '
          'This will prevent updating of pipelines. '
          'To apply a transform with a specified label write '
          'pvalue | "label" >> transform' % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      inputs = tuple(inputs)
      for leaf_input in inputs:
        if not isinstance(leaf_input, pvalue.PValue):
          raise TypeError
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)
    self.transforms_stack.append(current)

    type_options = self._options.view_as(TypeOptions)
    if type_options.pipeline_type_check:
      transform.type_check_inputs(pvalueish)

    pvalueish_result = self.runner.apply(transform, pvalueish)

    if type_options is not None and type_options.pipeline_type_check:
      transform.type_check_outputs(pvalueish_result)

    for result in ptransform.GetPValues().visit(pvalueish_result):
      assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

      # Make sure we set the producer only for a leaf node in the transform
      # DAG. This way we preserve the last transform of a composite transform
      # as being the real producer of the result.
      if result.producer is None:
        result.producer = current
      # TODO(robertwb): Multi-input, multi-output inference.
      # TODO(robertwb): Ideally we'd do intersection here.
      if (type_options is not None and type_options.pipeline_type_check
          and isinstance(result, pvalue.PCollection)
          and not result.element_type):
        input_element_type = (
            inputs[0].element_type
            if len(inputs) == 1
            else typehints.Any)
        type_hints = transform.get_type_hints()
        declared_output_type = type_hints.simple_output_type(transform.label)
        if declared_output_type:
          input_types = type_hints.input_types
          if input_types and input_types[0]:
            declared_input_type = input_types[0][0]
            result.element_type = typehints.bind_type_variables(
                declared_output_type,
                typehints.match_type_variables(declared_input_type,
                                               input_element_type))
          else:
            result.element_type = declared_output_type
        else:
          result.element_type = transform.infer_output_type(input_element_type)

      assert isinstance(result.producer.inputs, tuple)
      current.add_output(result)

    if (type_options is not None and
        type_options.type_check_strictness == 'ALL_REQUIRED' and
        transform.get_type_hints().output_types is None):
      ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
      raise TypeCheckError('Pipeline type checking is enabled, however no '
                           'output type-hint was found for the '
                           'PTransform %s' % ptransform_name)

    current.update_input_refcounts()
    self.transforms_stack.pop()
    return pvalueish_result

  def _verify_runner_api_compatible(self):
    class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
      ok = True  # Really a nonlocal.

      def visit_transform(self, transform_node):
        if transform_node.side_inputs:
          # No side inputs (yet).
          Visitor.ok = False
        try:
          # Transforms must be picklable.
          pickler.loads(pickler.dumps(transform_node.transform,
                                      enable_trace=False),
                        enable_trace=False)
        except Exception:
          Visitor.ok = False

      def visit_value(self, value, _):
        if isinstance(value, pvalue.PDone):
          Visitor.ok = False

    self.visit(Visitor())
    return Visitor.ok

  def to_runner_api(self):
    from apache_beam.runners import pipeline_context
    from apache_beam.runners.api import beam_runner_api_pb2
    context = pipeline_context.PipelineContext()
    # Mutates context; placing inline would force dependence on
    # argument evaluation order.
    root_transform_id = context.transforms.get_id(self._root_transform())
    proto = beam_runner_api_pb2.Pipeline(
        root_transform_ids=[root_transform_id],
        components=context.to_runner_api())
    return proto

  @staticmethod
  def from_runner_api(proto, runner, options):
    p = Pipeline(runner=runner, options=options)
    from apache_beam.runners import pipeline_context
    context = pipeline_context.PipelineContext(proto.components)
    root_transform_id, = proto.root_transform_ids
    p.transforms_stack = [
        context.transforms.get_by_id(root_transform_id)]
    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = set([
        t.unique_name for t in proto.components.transforms.values()])
    for id in proto.components.pcollections:
      context.pcollections.get_by_id(id).pipeline = p
    return p
def preprocess():
  """
  Arguments:
    -RUNNER: "DirectRunner" or "DataflowRunner". Specify to run the pipeline
     locally or on Google Cloud respectively.
  Side-effects:
    -Creates and executes dataflow pipeline.
  See https://beam.apache.org/documentation/programming-guide/#creating-a-pipeline
  """
  job_name = 'stackoverflow-raphael' + '-' + datetime.datetime.now().strftime(
      '%y%m%d-%H%M%S')
  project = os.environ['PROJECT_ID']
  region = os.environ['REGION']
  output_dir = "gs://{0}/".format(os.environ['BUCKET_NAME'])

  # options
  options = PipelineOptions()
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = project
  google_cloud_options.region = region
  google_cloud_options.job_name = job_name
  google_cloud_options.staging_location = os.path.join(
      output_dir, 'beam', 'stage')
  google_cloud_options.temp_location = os.path.join(output_dir, 'beam', 'temp')
  worker_options = options.view_as(WorkerOptions)
  worker_options.max_num_workers = 100
  worker_options.zone = 'europe-west6-b'
  worker_options.use_public_ips = False
  worker_options.network = 'default'
  # worker_options.disk_size_gb = 50
  # options.view_as(StandardOptions).runner = RUNNER
  options.view_as(SetupOptions).setup_file = os.environ['DIR_PROJ'] + '/setup.py'

  # instantiate Pipeline object using PipelineOptions
  print('Launching Dataflow job {} ... hang on'.format(job_name))

  # table reference
  new_table = beam.io.gcp.internal.clients.bigquery.TableReference(
      projectId='nlp-text-classification',
      datasetId='stackoverflow',
      tableId='posts_preprocessed')

  with beam.Pipeline(options=options) as p:
    post_table = p | "Read Posts from BigQuery" >> beam.io.Read(
        beam.io.BigQuerySource(query=data_query(), use_standard_sql=True))
    # tag_table = p | "Read Tags from BigQuery" >> beam.io.Read(
    #     beam.io.BigQuerySource(query=tag_query(), use_standard_sql=True))
    clean_text = post_table | "Preprocessing" >> beam.ParDo(pp.NLP())
    clean_text | "Write Posts to BigQuery" >> beam.io.WriteToBigQuery(
        new_table,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    str_values = clean_text | "Post Records to Text" >> beam.ParDo(pp.CSV())
    str_values | "Write Posts to GCS" >> beam.io.WriteToText(
        output_dir + 'results/posts',
        file_name_suffix='.csv',
        header='id, title, text_body, code_body, tags')

    if options.view_as(StandardOptions).runner == 'DataflowRunner':
      print('DataflowRunner')
      p.run()
    else:
      print('Default: DirectRunner')
      result = p.run()
      result.wait_until_finish()
  print('Done')
class UserOptions(PipelineOptions):

  @classmethod
  def _add_argparse_args(cls, parser):
    # These parameters are used when the template is executed; they must not
    # be supplied when the template is created.
    parser.add_value_provider_argument("--url_raw", type=str)
    parser.add_value_provider_argument("--url_trn", type=str)
    # These parameters are used when the template is created and correspond
    # to data that stays static inside the template.
    parser.add_value_provider_argument("--rename_columns", type=str)
    parser.add_value_provider_argument("--schema_source", type=str)


pipeline_options = PipelineOptions()

with beam.Pipeline(options=pipeline_options) as p:
  print("Start Pipeline")
  user_options = pipeline_options.view_as(UserOptions)

  # Renames the "columns" and returns the renamed data.
  def reColumns(row, rename_cols=None):
    for col in rename_cols:
      dict_rename = {
          value: row[key]
          for (key, value) in ast.literal_eval(col).items()
      }
    return dict_rename

  # Receives the rename_columns parameter and computes a dictionary with the
  # pairs of column names.
  def mapRenameCols(row,
                    rename_cols=ast.literal_eval(
                        user_options.rename_columns.get())):
    cols_before = list(row)
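# Note: calling user_options.rename_columns.get() in a default argument, as in
# mapRenameCols above, evaluates the ValueProvider at graph-construction time,
# which fails for templates whose value is only known at run time. The sketch
# below defers the .get() call to element processing time by passing the
# ValueProvider itself into beam.Map; the `rows` collection named in the
# commented usage is hypothetical.
def map_rename_cols_deferred(row, rename_columns_vp):
  # .get() is called per element, i.e. at run time, when the template value
  # is available.
  rename_cols = ast.literal_eval(rename_columns_vp.get())
  return {new: row[old] for old, new in rename_cols.items()}

# renamed = rows | 'RenameColumns' >> beam.Map(
#     map_rename_cols_deferred, user_options.rename_columns)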
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic',
                      type=str,
                      help='Pub/Sub topic to read from')
  parser.add_argument('--output_team',
                      type=str,
                      required=True,
                      help='Pub/Sub topic to write team score')
  parser.add_argument('--output_user',
                      type=str,
                      required=True,
                      help='Pub/Sub topic to write user score.')
  parser.add_argument('--subscription',
                      type=str,
                      help='Pub/Sub subscription to read from')
  parser.add_argument('--team_window_duration',
                      type=int,
                      default=3,
                      help='Numeric value of fixed window duration for team '
                           'analysis, in minutes')
  parser.add_argument('--allowed_lateness',
                      type=int,
                      default=6,
                      help='Numeric value of allowed data lateness, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Enforce that this pipeline is always run in streaming mode.
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which are
    # extracted from the pubsub data elements, and parse the data.

    # Read from PubSub into a PCollection.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          topic=args.topic)

    events = (scores
              | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
              | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
              | 'AddEventTimestamps' >> beam.Map(
                  lambda elem: beam.window.TimestampedValue(
                      elem, elem['timestamp'])))

    def format_team_score_sums(team_score):
      team = team_score['team']
      score = team_score['total_score']
      print(team_score)
      return '%s: %d' % (team, score)

    # Get team scores and write the results to the topic output_team.
    (  # pylint: disable=expression-not-assigned
        events
        | 'CalculateTeamScores' >> CalculateTeamScores(
            args.team_window_duration, args.allowed_lateness)
        | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
        | 'FormatTeamScoreSums' >> beam.Map(format_team_score_sums)
        | 'EncodeTeamScoreSums' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
        | 'WriteTeamScoreSums' >> beam.io.WriteToPubSub(args.output_team))

    def format_user_score_sums(user_score):
      (user, score) = user_score
      print(user_score)
      return '%s: %d' % (user, score)

    # Get user scores and write the results to the topic output_user.
    (  # pylint: disable=expression-not-assigned
        events
        | 'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
        | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
        | 'EncodeUserScoreSums' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
        | 'WriteUserScoreSums' >> beam.io.WriteToPubSub(args.output_user))
required=True, help="Month for input data") parser.add_argument("--input.day", dest="input_day", required=True, help="Day for input data") parser.add_argument("--input.hour", dest="input_hour", required=True, help="Hour for input data") parser.add_argument("--bq.project", dest="bq_project", required=True, help="Project Name for Bigquery") parser.add_argument("--bq.dataset", dest="bq_dataset", required=True, help="Dataset Name for Bigquery") parser.add_argument("--bq.table", dest="bq_table", required=True, help="Table Name for Bigquery") app_args, pipeline_args = parser.parse_known_args() pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True logging.getLogger().setLevel(logging.INFO) main(pipeline_options, app_args)
def run(argv=None): """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline that transforms bitcoin transactions""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://beam-avro-test/bitcoin/txns/*', help='Input file(s) to process.') parser.add_argument( '--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument( '--compress', dest='compress', required=False, action='store_true', help='When set, compress the output data') parser.add_argument( '--fastavro', dest='use_fastavro', required=False, action='store_true', help='When set, use fastavro for Avro I/O') opts, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) # Read the avro file[pattern] into a PCollection. records = \ p | 'read' >> ReadFromAvro(opts.input) measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn()) # pylint: disable=expression-not-assigned measured | 'write' >> \ WriteToAvro( opts.output, schema=SCHEMA, codec=('deflate' if opts.compress else 'null'), ) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation metrics = result.metrics().query() for counter in metrics['counters']: logging.info("Counter: %s", counter) for dist in metrics['distributions']: logging.info("Distribution: %s", dist)
class FilteringDoFn(beam.DoFn):

  def __init__(self, filter_val):
    self.filter_val = filter_val

  def process(self, element):
    if element['gender'] == self.filter_val.get():
      yield element
    else:
      return  # Return nothing


logging.getLogger().setLevel(logging.INFO)
pipeline_options = PipelineOptions()

# Create pipeline.
with beam.Pipeline(options=pipeline_options) as p:

  def print_row(element):
    # Use a format placeholder so the element value is actually logged.
    logging.info("the count is %s", element)

  my_options = pipeline_options.view_as(DataflowExample)
  select_query = (p
                  | 'QueryTableStdSQL' >> beam.io.Read(beam.io.BigQuerySource(
                      query='SELECT gender FROM '
                            '`startgcp-268623.lake.usa_names`',
                      use_standard_sql=True)))

  (select_query
   | beam.ParDo(FilteringDoFn(my_options.filter_val))
   | beam.combiners.Count.Globally()
   | 'Print result' >> beam.Map(print_row))

p.run().wait_until_finish()
from __future__ import absolute_import

import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions

output_table = 'automatic-asset-253215:CORE.IM_CUSTOMER_ATTRIBUTE_REF'

# Flags are passed to PipelineOptions as a list of strings.
dataflow_options = [
    '--project=automatic-asset-253215',
    '--job_name=xfm-vustgclic-custsorgext-to-imcustomerattributeref',
    '--temp_location=gs://raw_source_files/Customers/temp',
    '--staging_location=gs://raw_source_files/Customers/temp/stg']
options = PipelineOptions(dataflow_options)
gcloud_options = options.view_as(GoogleCloudOptions)

options.view_as(StandardOptions).runner = 'dataflow'


class LeftJoin(beam.PTransform):
  """This PTransform performs a left join given source_pipeline_name,
  source_data, join_pipeline_name, join_data, common_key constructors"""

  def __init__(self, src_pipeline, CustOrg_ID, join_pipeline, IMCust_ID,
               common_key):
    self.join_pipeline = join_pipeline
    self.CustOrg_ID = CustOrg_ID
    self.src_pipeline = src_pipeline
    self.IMCust_ID = IMCust_ID
    self.common_key = common_key

  def expand(self, pcolls):
    def _format_as_common_key_tuple(data_dict, common_key):
class Pipeline(object):
  """A pipeline object that manages a DAG of
  :class:`~apache_beam.pvalue.PValue` s and their
  :class:`~apache_beam.transforms.ptransform.PTransform` s.

  Conceptually the :class:`~apache_beam.pvalue.PValue` s are the DAG's nodes
  and the :class:`~apache_beam.transforms.ptransform.PTransform` s computing
  the :class:`~apache_beam.pvalue.PValue` s are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If same transform instance needs to be applied then the right shift operator
  should be used to designate new names
  (e.g. ``input | "label" >> my_tranform``).
  """

  def __init__(self, runner=None, options=None, argv=None):
    """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified, otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions): A
        configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ~exceptions.ValueError: if either the runner or options argument is not
        of the expected type.
    """
    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received : %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r'
            % argv)
    else:
      self._options = PipelineOptions([])

    FileSystems.set_options(self._options)

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(('Missing pipeline option (runner). Executing pipeline '
                      'using the default runner: %s.'), runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError('Runner %s is not a PipelineRunner object or the '
                      'name of a registered runner.' % runner)

    # Validate pipeline options
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validations errors: \n' + '\n'.join(errors))

    # set default experiments for portable runners
    # (needs to occur prior to pipeline construction)
    if runner.is_fnapi_compatible():
      experiments = (self._options.view_as(DebugOptions).experiments or [])
      if not 'beam_fn_api' in experiments:
        experiments.append('beam_fn_api')
      self._options.view_as(DebugOptions).experiments = experiments

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()

  @property
  @deprecated(since='First stable release',
              extra_message='References to <pipeline>.options'
              ' will not be supported')
  def options(self):
    return self._options

  def _current_transform(self):
    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def _remove_labels_recursively(self, applied_transform):
    for part in applied_transform.parts:
      if part.full_label in self.applied_labels:
        self.applied_labels.remove(part.full_label)
        self._remove_labels_recursively(part)

  def _replace(self, override):
    assert isinstance(override, PTransformOverride)

    output_map = {}
    output_replacements = {}
    input_replacements = {}
    side_input_replacements = {}

    class TransformUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that replaces the matching PTransforms."""

      def __init__(self, pipeline):
        self.pipeline = pipeline

      def _replace_if_needed(self, original_transform_node):
        if override.matches(original_transform_node):
          assert isinstance(original_transform_node, AppliedPTransform)
          replacement_transform = override.get_replacement_transform(
              original_transform_node.transform)
          if replacement_transform is original_transform_node.transform:
            return

          replacement_transform_node = AppliedPTransform(
              original_transform_node.parent,
              replacement_transform,
              original_transform_node.full_label,
              original_transform_node.inputs)

          # Transform execution could depend on order in which nodes are
          # considered. Hence we insert the replacement transform node to same
          # index as the original transform node. Note that this operation
          # removes the original transform node.
          if original_transform_node.parent:
            assert isinstance(original_transform_node.parent,
                              AppliedPTransform)
            parent_parts = original_transform_node.parent.parts
            parent_parts[parent_parts.index(original_transform_node)] = (
                replacement_transform_node)
          else:
            # Original transform has to be a root.
            roots = self.pipeline.transforms_stack[0].parts
            assert original_transform_node in roots
            roots[roots.index(original_transform_node)] = (
                replacement_transform_node)

          inputs = replacement_transform_node.inputs
          # TODO: Support replacing PTransforms with multiple inputs.
          if len(inputs) > 1:
            raise NotImplementedError(
                'PTransform overriding is only supported for PTransforms that '
                'have a single input. Tried to replace input of '
                'AppliedPTransform %r that has %d inputs'
                % (original_transform_node, len(inputs)))
          elif len(inputs) == 1:
            input_node = inputs[0]
          elif len(inputs) == 0:
            input_node = pvalue.PBegin(self)

          # We have to add the new AppliedTransform to the stack before
          # expand() and pop it out later to make sure that parts get added
          # correctly.
          self.pipeline.transforms_stack.append(replacement_transform_node)

          # Keeping the same label for the replaced node but recursively
          # removing labels of child transforms of original transform since
          # they will be replaced during the expand below. This is needed in
          # case the replacement contains children that have labels that
          # conflicts with labels of the children of the original.
          self.pipeline._remove_labels_recursively(original_transform_node)

          new_output = replacement_transform.expand(input_node)

          new_output.element_type = None
          self.pipeline._infer_result_type(replacement_transform, inputs,
                                           new_output)

          replacement_transform_node.add_output(new_output)
          if not new_output.producer:
            new_output.producer = replacement_transform_node

          # We only support replacing transforms with a single output with
          # another transform that produces a single output.
          # TODO: Support replacing PTransforms with multiple outputs.
          if (len(original_transform_node.outputs) > 1 or
              not isinstance(original_transform_node.outputs[None],
                             (PCollection, PDone)) or
              not isinstance(new_output, (PCollection, PDone))):
            raise NotImplementedError(
                'PTransform overriding is only supported for PTransforms that '
                'have a single output. Tried to replace output of '
                'AppliedPTransform %r with %r.'
                % (original_transform_node, new_output))

          # Recording updated outputs. This cannot be done in the same visitor
          # since if we dynamically update output type here, we'll run into
          # errors when visiting child nodes.
          output_map[original_transform_node.outputs[None]] = new_output

          self.pipeline.transforms_stack.pop()

      def enter_composite_transform(self, transform_node):
        self._replace_if_needed(transform_node)

      def visit_transform(self, transform_node):
        self._replace_if_needed(transform_node)

    self.visit(TransformUpdater(self))

    # Adjusting inputs and outputs
    class InputOutputUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that records input and output values to be replaced.

      Input and output values that should be updated are recorded in maps
      input_replacements and output_replacements respectively.

      We cannot update input and output values while visiting since that
      results in validation errors.
      """

      def __init__(self, pipeline):
        self.pipeline = pipeline

      def enter_composite_transform(self, transform_node):
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        if (None in transform_node.outputs and
            transform_node.outputs[None] in output_map):
          output_replacements[transform_node] = (
              output_map[transform_node.outputs[None]])

        replace_input = False
        for input in transform_node.inputs:
          if input in output_map:
            replace_input = True
            break

        replace_side_inputs = False
        for side_input in transform_node.side_inputs:
          if side_input.pvalue in output_map:
            replace_side_inputs = True
            break

        if replace_input:
          new_input = [
              input if not input in output_map else output_map[input]
              for input in transform_node.inputs]
          input_replacements[transform_node] = new_input

        if replace_side_inputs:
          new_side_inputs = []
          for side_input in transform_node.side_inputs:
            if side_input.pvalue in output_map:
              side_input.pvalue = output_map[side_input.pvalue]
              new_side_inputs.append(side_input)
            else:
              new_side_inputs.append(side_input)
          side_input_replacements[transform_node] = new_side_inputs

    self.visit(InputOutputUpdater(self))

    for transform in output_replacements:
      transform.replace_output(output_replacements[transform])

    for transform in input_replacements:
      transform.inputs = input_replacements[transform]

    for transform in side_input_replacements:
      transform.side_inputs = side_input_replacements[transform]

  def _check_replacement(self, override):
    class ReplacementValidator(PipelineVisitor):
      def visit_transform(self, transform_node):
        if override.matches(transform_node):
          raise RuntimeError(
              'Transform node %r was not replaced as expected.'
% transform_node) self.visit(ReplacementValidator()) def replace_all(self, replacements): """ Dynamically replaces PTransforms in the currently populated hierarchy. Currently this only works for replacements where input and output types are exactly the same. TODO: Update this to also work for transform overrides where input and output types are different. Args: replacements (List[~apache_beam.pipeline.PTransformOverride]): a list of :class:`~apache_beam.pipeline.PTransformOverride` objects. """ for override in replacements: assert isinstance(override, PTransformOverride) self._replace(override) # Checking if the PTransforms have been successfully replaced. This will # result in a failure if a PTransform that was replaced in a given override # gets re-added in a subsequent override. This is not allowed and ordering # of PTransformOverride objects in 'replacements' is important. for override in replacements: self._check_replacement(override) def run(self, test_runner_api=True): """Runs the pipeline. Returns whatever our runner returns after running.""" # When possible, invoke a round trip through the runner API. if test_runner_api and self._verify_runner_api_compatible(): return Pipeline.from_runner_api( self.to_runner_api(use_fake_coders=True), self.runner, self._options).run(False) if self._options.view_as(TypeOptions).runtime_type_check: from apache_beam.typehints import typecheck self.visit(typecheck.TypeCheckVisitor()) if self._options.view_as(SetupOptions).save_main_session: # If this option is chosen, verify we can pickle the main session early. tmpdir = tempfile.mkdtemp() try: pickler.dump_session( os.path.join(tmpdir, 'main_session.pickle')) finally: shutil.rmtree(tmpdir) return self.runner.run_pipeline(self, self._options) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): if not exc_type: self.run().wait_until_finish() def visit(self, visitor): """Visits depth-first every node of a pipeline's DAG. Runner-internal implementation detail; no backwards-compatibility guarantees Args: visitor (~apache_beam.pipeline.PipelineVisitor): :class:`~apache_beam.pipeline.PipelineVisitor` object whose callbacks will be called for each node visited. See :class:`~apache_beam.pipeline.PipelineVisitor` comments. Raises: ~exceptions.TypeError: if node is specified and is not a :class:`~apache_beam.pvalue.PValue`. ~apache_beam.error.PipelineError: if node is specified and does not belong to this pipeline instance. """ visited = set() self._root_transform().visit(visitor, self, visited) def apply(self, transform, pvalueish=None, label=None): """Applies a custom transform using the pvalueish specified. Args: transform (~apache_beam.transforms.ptransform.PTransform): the :class:`~apache_beam.transforms.ptransform.PTransform` to apply. pvalueish (~apache_beam.pvalue.PCollection): the input for the :class:`~apache_beam.transforms.ptransform.PTransform` (typically a :class:`~apache_beam.pvalue.PCollection`). label (str): label of the :class:`~apache_beam.transforms.ptransform.PTransform`. Raises: ~exceptions.TypeError: if the transform object extracted from the argument list is not a :class:`~apache_beam.transforms.ptransform.PTransform`. ~exceptions.RuntimeError: if the transform object was already applied to this pipeline and needs to be cloned in order to apply again. 
""" if isinstance(transform, ptransform._NamedPTransform): return self.apply(transform.transform, pvalueish, label or transform.label) if not isinstance(transform, ptransform.PTransform): raise TypeError("Expected a PTransform object, got %s" % transform) if label: # Fix self.label as it is inspected by some PTransform operations # (e.g. to produce error messages for type hint violations). try: old_label, transform.label = transform.label, label return self.apply(transform, pvalueish) finally: transform.label = old_label full_label = '/'.join( [self._current_transform().full_label, label or transform.label]).lstrip('/') if full_label in self.applied_labels: raise RuntimeError( 'A transform with label "%s" already exists in the pipeline. ' 'To apply a transform with a specified label write ' 'pvalue | "label" >> transform' % full_label) self.applied_labels.add(full_label) pvalueish, inputs = transform._extract_input_pvalues(pvalueish) try: inputs = tuple(inputs) for leaf_input in inputs: if not isinstance(leaf_input, pvalue.PValue): raise TypeError except TypeError: raise NotImplementedError( 'Unable to extract PValue inputs from %s; either %s does not accept ' 'inputs of this format, or it does not properly override ' '_extract_input_pvalues' % (pvalueish, transform)) current = AppliedPTransform(self._current_transform(), transform, full_label, inputs) self._current_transform().add_part(current) self.transforms_stack.append(current) type_options = self._options.view_as(TypeOptions) if type_options.pipeline_type_check: transform.type_check_inputs(pvalueish) pvalueish_result = self.runner.apply(transform, pvalueish, self._options) if type_options is not None and type_options.pipeline_type_check: transform.type_check_outputs(pvalueish_result) for result in ptransform.get_nested_pvalues(pvalueish_result): assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple)) # Make sure we set the producer only for a leaf node in the transform DAG. # This way we preserve the last transform of a composite transform as # being the real producer of the result. if result.producer is None: result.producer = current self._infer_result_type(transform, inputs, result) assert isinstance(result.producer.inputs, tuple) current.add_output(result) if (type_options is not None and type_options.type_check_strictness == 'ALL_REQUIRED' and transform.get_type_hints().output_types is None): ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label) raise TypeCheckError( 'Pipeline type checking is enabled, however no ' 'output type-hint was found for the ' 'PTransform %s' % ptransform_name) self.transforms_stack.pop() return pvalueish_result def _infer_result_type(self, transform, inputs, result_pcollection): # TODO(robertwb): Multi-input, multi-output inference. type_options = self._options.view_as(TypeOptions) if (type_options is not None and type_options.pipeline_type_check and isinstance(result_pcollection, pvalue.PCollection) and (not result_pcollection.element_type # TODO(robertwb): Ideally we'd do intersection here. 
or result_pcollection.element_type == typehints.Any)): input_element_type = (inputs[0].element_type if len(inputs) == 1 else typehints.Any) type_hints = transform.get_type_hints() declared_output_type = type_hints.simple_output_type( transform.label) if declared_output_type: input_types = type_hints.input_types if input_types and input_types[0]: declared_input_type = input_types[0][0] result_pcollection.element_type = typehints.bind_type_variables( declared_output_type, typehints.match_type_variables(declared_input_type, input_element_type)) else: result_pcollection.element_type = declared_output_type else: result_pcollection.element_type = transform.infer_output_type( input_element_type) def __reduce__(self): # Some transforms contain a reference to their enclosing pipeline, # which in turn reference all other transforms (resulting in quadratic # time/space to pickle each transform individually). As we don't # require pickled pipelines to be executable, break the chain here. return str, ('Pickled pipeline stub.', ) def _verify_runner_api_compatible(self): if self._options.view_as(TypeOptions).runtime_type_check: # This option is incompatible with the runner API as it requires # the runner to inspect non-serialized hints on the transform # itself. return False class Visitor(PipelineVisitor): # pylint: disable=used-before-assignment ok = True # Really a nonlocal. def enter_composite_transform(self, transform_node): pass def visit_transform(self, transform_node): try: # Transforms must be picklable. pickler.loads(pickler.dumps(transform_node.transform, enable_trace=False), enable_trace=False) except Exception: Visitor.ok = False def visit_value(self, value, _): if isinstance(value, pvalue.PDone): Visitor.ok = False self.visit(Visitor()) return Visitor.ok def to_runner_api(self, return_context=False, context=None, use_fake_coders=False, default_environment=None): """For internal use only; no backwards-compatibility guarantees.""" from apache_beam.runners import pipeline_context from apache_beam.portability.api import beam_runner_api_pb2 if context is None: context = pipeline_context.PipelineContext( use_fake_coders=use_fake_coders, default_environment=default_environment) elif default_environment is not None: raise ValueError( 'Only one of context or default_environment may be specified.') # The RunnerAPI spec requires certain transforms and side-inputs to have KV # inputs (and corresponding outputs). # Currently we only upgrade to KV pairs. If there is a need for more # general shapes, potential conflicts will have to be resolved. # We also only handle single-input, and (for fixing the output) single # output, which is sufficient. class ForceKvInputTypes(PipelineVisitor): def enter_composite_transform(self, transform_node): self.visit_transform(transform_node) def visit_transform(self, transform_node): if not transform_node.transform: return if transform_node.transform.runner_api_requires_keyed_input(): pcoll = transform_node.inputs[0] pcoll.element_type = typehints.coerce_to_kv_type( pcoll.element_type, transform_node.full_label) if len(transform_node.outputs) == 1: # The runner often has expectations about the output types as well. 
output, = transform_node.outputs.values() if not output.element_type: output.element_type = transform_node.transform.infer_output_type( pcoll.element_type) for side_input in transform_node.transform.side_inputs: if side_input.requires_keyed_input(): side_input.pvalue.element_type = typehints.coerce_to_kv_type( side_input.pvalue.element_type, transform_node.full_label, side_input_producer=side_input.pvalue.producer. full_label) self.visit(ForceKvInputTypes()) # Mutates context; placing inline would force dependence on # argument evaluation order. root_transform_id = context.transforms.get_id(self._root_transform()) proto = beam_runner_api_pb2.Pipeline( root_transform_ids=[root_transform_id], components=context.to_runner_api()) proto.components.transforms[root_transform_id].unique_name = ( root_transform_id) if return_context: return proto, context else: return proto @staticmethod def from_runner_api(proto, runner, options, return_context=False, allow_proto_holders=False): """For internal use only; no backwards-compatibility guarantees.""" p = Pipeline(runner=runner, options=options) from apache_beam.runners import pipeline_context context = pipeline_context.PipelineContext( proto.components, allow_proto_holders=allow_proto_holders) root_transform_id, = proto.root_transform_ids p.transforms_stack = [context.transforms.get_by_id(root_transform_id)] # TODO(robertwb): These are only needed to continue construction. Omit? p.applied_labels = set( [t.unique_name for t in proto.components.transforms.values()]) for id in proto.components.pcollections: pcollection = context.pcollections.get_by_id(id) pcollection.pipeline = p if not pcollection.producer: raise ValueError('No producer for %s' % id) # Inject PBegin input where necessary. from apache_beam.io.iobase import Read from apache_beam.transforms.core import Create has_pbegin = [Read, Create] for id in proto.components.transforms: transform = context.transforms.get_by_id(id) if not transform.inputs and transform.transform.__class__ in has_pbegin: transform.inputs = (pvalue.PBegin(p), ) if return_context: return p, context else: return p
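# A hedged usage sketch for Pipeline.replace_all(): the _replace() machinery
# above calls override.matches(node) and
# override.get_replacement_transform(transform), so an override only needs
# those two methods.  The label 'MyRead' and the transform MyReplacementRead
# are illustrative placeholders, not part of the original module.
from apache_beam.pipeline import PTransformOverride


class SwapMyRead(PTransformOverride):

  def matches(self, applied_ptransform):
    # Return True for every AppliedPTransform node that should be replaced.
    return applied_ptransform.full_label == 'MyRead'

  def get_replacement_transform(self, ptransform):
    # The replacement must keep the same input and output types as the
    # original transform (see the docstring of replace_all above).
    return MyReplacementRead()


# pipeline.replace_all([SwapMyRead()])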
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--topic', type=str, required=True, help='Pub/Sub topic to read from') parser.add_argument('--dataset', type=str, required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument('--table_name', type=str, default='game_stats', help='The BigQuery table name. Should not already exist.') parser.add_argument('--fixed_window_duration', type=int, default=60, help='Numeric value of fixed window duration for user ' 'analysis, in minutes') parser.add_argument('--session_gap', type=int, default=5, help='Numeric value of gap between user sessions, ' 'in minutes') parser.add_argument('--user_activity_window_duration', type=int, default=30, help='Numeric value of fixed window for finding mean of ' 'user session duration, in minutes') args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) # We also require the --project option to access --dataset if options.view_as(GoogleCloudOptions).project is None: parser.print_usage() print(sys.argv[0] + ': error: argument --project is required') sys.exit(1) fixed_window_duration = args.fixed_window_duration * 60 session_gap = args.session_gap * 60 user_activity_window_duration = args.user_activity_window_duration * 60 # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True # Enforce that this pipeline is always run in streaming mode options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=options) as p: # Read events from Pub/Sub using custom timestamps raw_events = ( p | 'ReadPubSub' >> beam.io.gcp.pubsub.ReadStringsFromPubSub(args.topic) | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn()) | 'AddEventTimestamps' >> beam.Map( lambda elem: beam.window.TimestampedValue(elem, elem['timestamp']))) # Extract username/score pairs from the event stream user_events = ( raw_events | 'ExtractUserScores' >> beam.Map( lambda elem: (elem['user'], elem['score']))) # Calculate the total score per user over fixed windows, and cumulative # updates for late data spammers_view = ( user_events | 'UserFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. # These might be robots/spammers. | 'CalculateSpammyUsers' >> CalculateSpammyUsers() # Derive a view from the collection of spammer users. It will be used as # a side input in calculating the team score sums, below | 'CreateSpammersView' >> beam.CombineGlobally( beam.combiners.ToDictCombineFn()).as_singleton_view()) # [START filter_and_calc] # Calculate the total score per team over fixed windows, and emit cumulative # updates for late data. Uses the side input derived above --the set of # suspected robots-- to filter out scores from those users from the sum. # Write the results to BigQuery. (raw_events # pylint: disable=expression-not-assigned | 'WindowIntoFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(fixed_window_duration)) # Filter out the detected spammer users, using the side input derived above | 'FilterOutSpammers' >> beam.Filter( lambda elem, spammers: elem['user'] not in spammers, spammers_view) # Extract and sum teamname/score pairs from the event data. 
| 'ExtractAndSumScore' >> ExtractAndSumScore('team') # [END filter_and_calc] | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict()) | 'WriteTeamScoreSums' >> WriteToBigQuery( args.table_name + '_teams', args.dataset, { 'team': 'STRING', 'total_score': 'INTEGER', 'window_start': 'STRING', 'processing_time': 'STRING', })) # [START session_calc] # Detect user sessions-- that is, a burst of activity separated by a gap # from further activity. Find and record the mean session lengths. # This information could help the game designers track the changing user # engagement as their set of game changes. (user_events # pylint: disable=expression-not-assigned | 'WindowIntoSessions' >> beam.WindowInto( beam.window.Sessions(session_gap), timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW) # For this use, we care only about the existence of the session, not any # particular information aggregated over it, so we can just group by key # and assign a "dummy value" of None. | beam.CombinePerKey(lambda _: None) # Get the duration of the session | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity()) # [END session_calc] # [START rewindow] # Re-window to process groups of session sums according to when the # sessions complete | 'WindowToExtractSessionMean' >> beam.WindowInto( beam.window.FixedWindows(user_activity_window_duration)) # Find the mean session duration in each window | beam.CombineGlobally(beam.combiners.MeanCombineFn()).without_defaults() | 'FormatAvgSessionLength' >> beam.Map( lambda elem: {'mean_duration': float(elem)}) | 'WriteAvgSessionLength' >> WriteToBigQuery( args.table_name + '_sessions', args.dataset, { 'mean_duration': 'FLOAT', }))
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument( '--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX', help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=DirectRunner', # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=SET_YOUR_PROJECT_ID_HERE', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY', '--job_name=your-wordcount-job', ]) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session with beam.Pipeline(options=pipeline_options) as p: # Read the text file[pattern] into a PCollection. lines = p | ReadFromText(known_args.input) # Count the occurrences of each word. counts = ( lines | 'Split' >> ( beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)). with_output_types(unicode)) | 'PairWithOne' >> beam.Map(lambda x: (x, 1)) | 'GroupAndSum' >> beam.CombinePerKey(sum)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %s' % (word, count) output = counts | 'Format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | WriteToText(known_args.output)
# help='Input for the pipeline', # default='gs://cxr-to-chest-ct/') # parser.add_argument('--output', # help='Output for the pipeline', # default='gs://cxr-to-chest-ct2/resampled/') # parser.add_argument('--project', # dest='project', # help='Project', # default='x-ray-reconstruction') # parser.add_argument('--temp_location', # dest='temp_location', # help='temp_location', # default='gs://cxr-to-chest-ct2/tmp/') options = PipelineOptions(flags=sys.argv) google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'x-ray-reconstruction' google_cloud_options.job_name = 'numpy-highmem-int16-with-rotation' google_cloud_options.staging_location = 'gs://cxr-to-chest-ct2/binaries' google_cloud_options.temp_location = 'gs://cxr-to-chest-ct2/temp' # google_cloud_options.machine_type = 'n1-highmem-2' options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=options) as p: # embed() # dicom_urls = p | 'read csv data' >> beam.io.Read(CsvFileSource('gs://cxr-to-chest-ct/datasets/LIDC-IDRI Dataset/ct_scan_urls.csv')) dicom_urls = p | 'read csv file' >> beam.io.textio.ReadFromText( 'gs://cxr-to-chest-ct/datasets/LIDC-IDRI Dataset/ct_scan_urls.csv' ) | 'split stuff' >> beam.ParDo(Split())
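# Split() is used above but not defined in this excerpt.  A hedged sketch of
# what such a DoFn typically does here, assuming each line of
# ct_scan_urls.csv holds comma-separated fields:
class Split(beam.DoFn):

  def process(self, element):
    # One output element per CSV line; adjust the delimiter and columns to
    # match the real ct_scan_urls.csv layout.
    yield element.strip().split(',')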
import apache_beam as beam import config import json from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import StandardOptions from apache_beam.options.pipeline_options import SetupOptions from apache_beam.io.gcp.internal.clients import bigquery from textblob import TextBlob options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = config.PROJECT_ID google_cloud_options.staging_location = 'gs://dod-mwja-project1/staging' google_cloud_options.temp_location = 'gs://dod-mwja-project1/temp' options.view_as(StandardOptions).runner = 'DataflowRunner' options.view_as(StandardOptions).streaming = True def compute_sentiment(line): import os os.system('sudo pip install textblob') from textblob import TextBlob templist = line.split('-=-') for j, item in enumerate(templist): templist[j] = item.replace(',', '') tweet = templist[1] sent = TextBlob(tweet).sentiment.polarity
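    # The snippet is truncated above.  A hedged guess at the remainder: pair
    # the tweet text with its polarity score so a downstream step can use it
    # (the dict keys are illustrative, not from the original source).  Worker
    # dependencies such as textblob are normally staged with
    # SetupOptions.requirements_file rather than os.system('sudo pip install').
    return {'tweet': tweet, 'sentiment': sent}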
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the translate pipeline.""" parser = argparse.ArgumentParser() known_args, pipeline_args = parser.parse_known_args(argv) # Define pipeline options. pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) def translate_text(text_dict, target="en"): """Translates text into the target language. Target must be an ISO 639-1 language code. See https://g.co/cloud/translate/v2/translate-reference#supported_languages Args: text_dict: Dictionary format input. """ import six from google.cloud import translate_v2 as translate text = text_dict[text_column_in] translate_client = translate.Client() if isinstance(text, six.binary_type): text = text.decode("utf-8") # Text can also be a sequence of strings, in which case this method # will return a sequence of results for each text. result = translate_client.translate(text, target_language=target) result_str = result['translatedText'] # Construct dict matching output table schema table_schema_out. return {text_column_out: result_str, text_column_in: text} # Debug. Test translate_text fn. text_dict = {} text_dict[text_column_in] = '寿司は美味しです' print(translate_text(text_dict)) translate_jp2en = ( p | 'Read table from BQ' >> beam.io.ReadFromBigQuery(table=table_spec_in) # Debug. # | 'Create dict' >> beam.Create([ # { # 'td_title': '魚も美味しいです' # }, # { # 'td_title': '寿司は美味しいです' # }, # ]) # Each row is a dictionary where the keys are the BigQuery columns | 'Translating' >> beam.Map(translate_text)) # Debug. Print translated jp texts. # translate_jp2en | 'Print' >> beam.Map(print) # Write translated data back to a new table in BigQuery. translate_jp2en | 'Write back to BQ' >> beam.io.WriteToBigQuery( table_spec_out, schema=table_schema_out, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED) p.run()
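# The pipeline above references table_spec_in, table_spec_out,
# table_schema_out, text_column_in and text_column_out without defining them.
# A hedged sketch of the module-level configuration it appears to assume;
# the table names are placeholders, and 'td_title' matches the commented-out
# debug Create above.
text_column_in = 'td_title'
text_column_out = 'td_title_en'
table_spec_in = 'my-project:my_dataset.jp_titles'
table_spec_out = 'my-project:my_dataset.jp_titles_en'
table_schema_out = '{}:STRING, {}:STRING'.format(text_column_out, text_column_in)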
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--kind', dest='kind', required=True, help='Datastore Kind') parser.add_argument('--namespace', dest='namespace', help='Datastore Namespace') parser.add_argument('--ancestor', dest='ancestor', default='root', help='The ancestor key name for all entities.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') parser.add_argument('--read_only', action='store_true', help='Read an existing dataset, do not write first') parser.add_argument( '--num_shards', dest='num_shards', type=int, # If the system should choose automatically. default=0, help='Number of output shards') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True gcloud_options = pipeline_options.view_as(GoogleCloudOptions) # Write to Datastore if `read_only` options is not specified. if not known_args.read_only: write_to_datastore(gcloud_options.project, known_args, pipeline_options) # Read entities from Datastore. result = read_from_datastore(gcloud_options.project, known_args, pipeline_options) empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.committed) else: logging.warn('unable to retrieve counter metrics from runner') word_lengths_filter = MetricsFilter().with_name('word_len_dist') query_result = result.metrics().query(word_lengths_filter) if query_result['distributions']: word_lengths_dist = query_result['distributions'][0] logging.info('average word length: %d', word_lengths_dist.committed.mean) else: logging.warn('unable to retrieve distribution metrics from runner')
def run(): import pickle import sys import math import numpy as np reload(sys) sys.setdefaultencoding('utf8') from gensim.models import KeyedVectors import apache_beam as beam from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore from google.cloud.proto.datastore.v1 import query_pb2 from apache_beam.io.textio import WriteToText import nltk.data import re import uuid import perceptron _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle") abbreviations = set() with open("./tokenizer/abbreviations-long.txt") as f: for l in f: abbreviations.add(l.split(':')[0]) _sentence_tokenizer._params.abbrev_types = abbreviations model_file = "perceptron_word2vec_stemmed_normalized.pickle" with open(model_file, 'rb') as model: w, b = pickle.load(model) def sentences_from_text(text): return _sentence_tokenizer.tokenize(text.strip()) def tokens_from_sentence(sentence): return sentence.split(" ") # nltk.word_tokenize(sentence) def ngrams(obj, n): tokens = [] sentences = ( sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"]) ) for sentence in sentences: tokens += tokens_from_sentence(sentence) pairs = nltk.ngrams(tokens, n) return [" ".join(pair) for pair in pairs] def convertToObject(jsonObj): x = jsonObj link = x.properties.get('link', None) link = link.string_value if link else "" title = x.properties.get('title', None) title = title.string_value if title else "" description = x.properties.get("description", None) description = description.string_value if description else "" content = x.properties.get("text", "") content = content.string_value if content else "" published = x.properties.get("published") published = published.string_value if published else "" obj = { "link": link, "title": title, "description": description, "content": content, "published": published } obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4()) return obj # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string def cleanhtml(raw_html): cleanr = re.compile('<.*?>') cleantext = re.sub(cleanr, '', raw_html) return cleantext def removeHTMLFromStrings(obj): for key in obj.keys(): obj[key] = cleanhtml(obj[key]) return obj def tokenize_to_sentences(obj): obj["sentences"] = ( sentences_from_text(obj["title"]) + sentences_from_text(obj["description"]) + sentences_from_text(obj["content"]) ) return obj def tokenize_to_words(obj): obj["tokens"] = [] for sentence in obj["sentences"]: obj["tokens"] += tokens_from_sentence(sentence) for token in obj["tokens"]: yield (obj["key"], token) options = PipelineOptions() google_cloud_options = options.view_as(GoogleCloudOptions) google_cloud_options.project = 'news-197916' google_cloud_options.job_name = 'sentiment-analysis' google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/' google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp' options.view_as(StandardOptions).runner = 'DataflowRunner' setup_options = options.view_as(SetupOptions) setup_options.requirements_file = "requirements.txt" setup_options.save_main_session = True p = beam.Pipeline(options=options) query = query_pb2.Query() query.kind.add().name = "News_Entry" pairs = (p | 'Read From Datastore' >> ReadFromDatastore(project = google_cloud_options.project, query=query) # | "Read From Text" >> ReadFromText("news.json", 
coder=beam.coders.coders.StrUtf8Coder()) # line by line # | "Convert to Json Object" >> beam.Map(convertToJsonObj) | "Convert to Python Object" >> beam.Map(convertToObject) | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings) ) tokens_1gram = (pairs | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences) | 'Word Tokenization' >> beam.FlatMap(tokenize_to_words) # also convert to key value pairs ) """ tokens_2gram = (pairs | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)]) ) """ tokens = tokens_1gram """ vocabulary = (tokens | "Get words only" >> beam.Values() | "Remove duplicate words" >> beam.RemoveDuplicates() ) vocabulary_size = (vocabulary | "Count Vocabulary elements" >> beam.combiners.Count.Globally() ) doc_total_words = (tokens | "Count Words of Doc" >> beam.combiners.Count.PerKey() ) """ tokens_paired_with_1 = (tokens | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1)) ) """ token_counts_per_doc = (tokens_paired_with_1 | "Group by Doc,Word" >> beam.GroupByKey() | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts)))) | "Group by Doc" >> beam.GroupByKey() ) num_docs = (token_counts_per_doc | "Get Docs" >> beam.Keys() | "Count Docs" >> beam.combiners.Count.Globally() ) word_tf_pre = ( { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc } | "CoGroup By Document" >> beam.CoGroupByKey() ) def calc_tf((doc, count)): [token_count] = count['token_counts_per_doc'] [tokens_total] = count['total_tokens'] for token, cnt in token_count: yield token, (doc, float(cnt) / tokens_total) doc_word_tf = (word_tf_pre | "Compute Term Frequencies" >> beam.FlatMap(calc_tf) ) word_occurrences = (tokens | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates() | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1)) | "Group by Word" >> beam.GroupByKey() | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts))) ) token_df = ( word_occurrences | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs))) token_tf_df = ( { 'term_frequency': doc_word_tf, 'document_frequency': token_df} | "CoGroup By Token" >> beam.CoGroupByKey()) def calc_tfidf((token, tfdf)): [df] = tfdf['document_frequency'] for doc, tf in tfdf['term_frequency']: yield (doc, token), tf * math.log(1.0 / df) token_tf_idf = (token_tf_df | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf) ) """ word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True) def get_vec(word2vec, token): if word2vec is None: word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True) try: x = word2vec.get_vector(token) x = x.reshape(400) except: x = np.zeros(400) return x def analyze_sentiment(x): res = perceptron.f(x, w, b) return res doc_sentiment = (tokens_paired_with_1 | "Create Word2Vec Vector" >> beam.Map(lambda ((doc, token), cnt): (doc, get_vec(word2vec, token))) | "Group Word2Vec Vectors By Document" >> beam.GroupByKey() | "Sum Word2Vec Vectors" >> beam.Map( lambda (doc, vecs): (doc, analyze_sentiment(np.sum(vecs, axis=0))[0])) ) result = (doc_sentiment | "Format Results" >> beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens)) ) (result | "Write Results" >> WriteToText("sentiments") ) p.run()
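# The tuple-unpacking lambdas above (e.g. lambda (doc, token): ...) are
# Python 2 only; PEP 3113 removed that syntax.  Under Python 3 the same
# steps would unpack explicitly, for example:
tokens_paired_with_1 = (
    tokens
    | 'Pair with 1' >> beam.Map(lambda kv: (kv, 1)))  # kv is (doc, token)

result = (
    doc_sentiment
    | 'Format Results' >> beam.Map(
        lambda kv: '%s %s' % (kv[0], kv[1])))  # kv is (doc, sentiment)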
def _setup_pipeline(self): options = PipelineOptions(self.pipeline.get_full_options_as_args()) options.view_as(SetupOptions).save_main_session = True options.view_as(StandardOptions).streaming = True self.pipeline = TestPipeline(options=options)
def main(): project = 'chromeperf' options = PipelineOptions() options.view_as(DebugOptions).add_experiment('use_beam_bq_sink') options.view_as(GoogleCloudOptions).project = project bq_export_options = options.view_as(BqExportOptions) p = beam.Pipeline(options=options) entities_read = Metrics.counter('main', 'entities_read') failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms') # Read 'Anomaly' entities from datastore. entities = ( p | 'ReadFromDatastore(Anomaly)' >> ReadTimestampRangeFromDatastore( {'project': project, 'kind': 'Anomaly'}, time_range_provider=bq_export_options.GetTimeRangeProvider())) def AnomalyEntityToRowDict(entity): entities_read.inc() try: # We do the iso conversion of the nullable timestamps in isolation. earliest_input_timestamp = entity.get('earliest_input_timestamp') if earliest_input_timestamp: earliest_input_timestamp = earliest_input_timestamp.isoformat() latest_input_timestamp = entity.get('latest_input_timestamp') if latest_input_timestamp: latest_input_timestamp = latest_input_timestamp.isoformat() d = { 'id': entity.key.id, # TODO: 'sheriff' # 'subscriptions' omitted; subscription_names is sufficient 'subscription_names': entity.get('subscription_names', []), 'test': TestPath(entity['test']), 'start_revision': entity['start_revision'], 'end_revision': entity['end_revision'], 'display_start': entity.get('display_start'), 'display_end': entity.get('display_end'), # TODO: 'ownership' 'statistic': entity['statistic'], 'bug_id': entity['bug_id'], 'internal_only': entity['internal_only'], 'timestamp': entity['timestamp'].isoformat(), 'segment_size_before': entity.get('segment_size_before'), 'segment_size_after': entity.get('segment_size_after'), 'median_before_anomaly': entity.get('median_before_anomaly'), 'median_after_anomaly': entity.get('median_after_anomaly'), 'std_dev_before_anomaly': entity.get('std_dev_before_anomaly'), 'window_end_revision': entity.get('window_end_revision'), 't_statistic': FloatHack(entity.get('t_statistic')), 'degrees_of_freedom': entity.get('degrees_of_freedom'), 'p_value': entity.get('p_value'), 'is_improvement': entity.get('is_improvement', False), 'recovered': entity.get('recovered', False), # TODO: 'ref_test' 'units': entity.get('units'), # TODO: 'recipe_bisects' 'pinpoint_bisects': entity.get('pinpoint_bisects', []), # These are critical to "time-to-culprit" calculations. 'earliest_input_timestamp': earliest_input_timestamp, 'latest_input_timestamp': latest_input_timestamp, } if d['statistic'] is None: # Some years-old anomalies lack this. 
raise UnconvertibleAnomalyError() return [d] except (KeyError, UnconvertibleAnomalyError): failed_entity_transforms.inc() return [] anomaly_dicts = ( entities | 'ConvertEntityToRow(Anomaly)' >> beam.FlatMap(AnomalyEntityToRowDict)) """ CREATE TABLE `chromeperf.chromeperf_dashboard_data.anomalies` (id INT64 NOT NULL, `timestamp` TIMESTAMP NOT NULL, subscription_names ARRAY<STRING>, `test` STRING NOT NULL, start_revision INT64 NOT NULL, end_revision INT64 NOT NULL, display_start INT64, display_end INT64, statistic STRING NOT NULL, bug_id INT64, internal_only BOOLEAN NOT NULL, segment_size_before INT64, segment_size_after INT64, median_before_anomaly FLOAT64, median_after_anomaly FLOAT64, std_dev_before_anomaly FLOAT64, window_end_revision INT64, t_statistic FLOAT64, degrees_of_freedom FLOAT64, p_value FLOAT64, is_improvement BOOLEAN NOT NULL, recovered BOOLEAN NOT NULL, units STRING, pinpoint_bisects ARRAY<STRING>, earliest_input_timestamp TIMESTAMP, latest_input_timestamp TIMESTAMP) PARTITION BY DATE(`timestamp`); """ # pylint: disable=pointless-string-statement bq_anomaly_schema = { 'fields': [ { 'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED' }, { 'name': 'subscription_names', 'type': 'STRING', 'mode': 'REPEATED' }, { 'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED' }, { 'name': 'start_revision', 'type': 'INT64', 'mode': 'REQUIRED' }, { 'name': 'end_revision', 'type': 'INT64', 'mode': 'REQUIRED' }, { 'name': 'display_start', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 'display_end', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 'statistic', 'type': 'STRING', 'mode': 'REQUIRED' }, { 'name': 'bug_id', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 'internal_only', 'type': 'BOOLEAN', 'mode': 'REQUIRED' }, { 'name': 'timestamp', 'type': 'TIMESTAMP', 'mode': 'REQUIRED' }, { 'name': 'segment_size_before', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 'segment_size_after', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 'median_before_anomaly', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'median_after_anomaly', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'std_dev_before_anomaly', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'window_end_revision', 'type': 'INT64', 'mode': 'NULLABLE' }, { 'name': 't_statistic', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'degrees_of_freedom', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'p_value', 'type': 'FLOAT', 'mode': 'NULLABLE' }, { 'name': 'is_improvement', 'type': 'BOOLEAN', 'mode': 'REQUIRED' }, { 'name': 'recovered', 'type': 'BOOLEAN', 'mode': 'REQUIRED' }, { 'name': 'units', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'pinpoint_bisects', 'type': 'STRING', 'mode': 'REPEATED' }, { 'name': 'earliest_input_timestamp', 'type': 'TIMESTAMP', 'mode': 'NULLABLE' }, { 'name': 'latest_input_timestamp', 'type': 'TIMESTAMP', 'mode': 'NULLABLE' }, ] } # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating # the table name until runtime. The simplest way to do this is by passing a # function for the table name rather than a string. def TableNameFn(unused_element): return '{}:{}.anomalies{}'.format(project, bq_export_options.dataset.get(), bq_export_options.table_suffix) _ = ( anomaly_dicts | 'WriteToBigQuery(anomalies)' >> WriteToPartitionedBigQuery(TableNameFn, bq_anomaly_schema)) result = p.run() result.wait_until_finish() PrintCounters(result)
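# FloatHack() is called above but not defined in this excerpt.  Its purpose
# is presumably to keep BigQuery's JSON loader happy, since JSON has no
# representation for NaN or infinity; a guessed stand-in (not the actual
# chromeperf helper) could look like:
import math


def FloatHack(value):
  # Map non-finite floats to None so they land in a NULLABLE FLOAT column.
  if value is None or math.isinf(value) or math.isnan(value):
    return None
  return value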
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--questions', dest='questions', required=True, help='Questions file.') parser.add_argument('--users', dest='users', required=True, help='Users file.') parser.add_argument('--from-ts', dest='from_ts', required=True, type=int, help='Start of the time range.') parser.add_argument('--to-ts', dest='to_ts', required=True, type=int, help='End of the time range.') parser.add_argument( '--engagement-range', dest='engagement_range', default=10, type=int, help= 'Maximum number of days from first step to the last step of an engagement.' ) parser.add_argument('--giap-es-index', dest='giap_es_index', required=True, help='GIAP ES index.') parser.add_argument('--giap-es-username', dest='giap_es_username', required=True, help='GIAP ES username.') parser.add_argument('--giap-es-password', dest='giap_es_password', required=True, help='GIAP ES password.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session with beam.Pipeline(options=pipeline_options) as p: recent_questions = get_recent_questions(p, known_args.questions) recently_active_users = get_recently_active_users(p, known_args.users) latest_engagements = get_latest_engagements( p, from_ts=known_args.from_ts, to_ts=known_args.to_ts, engagement_range=known_args.engagement_range) question_engagement_pairs = ({ 'questions': recent_questions, 'engagements': latest_engagements, 'users': recently_active_users }) | "Group by uid" >> beam.CoGroupByKey() calculateAskEngagement = CalculateAskEngagement() calculateAskEngagement.engagement_range = known_args.engagement_range calculateAskEngagement.from_ts = known_args.from_ts calculateAskEngagement.to_ts = known_args.to_ts calculateAskEngagement.giap_es_index = known_args.giap_es_index calculateAskEngagement.giap_es_username = known_args.giap_es_username calculateAskEngagement.giap_es_password = known_args.giap_es_password engagement_table_spec = bigquery.TableReference( projectId='gotit-analytics', datasetId='study_pn_campaign', tableId='engagement') new_engagements = ( question_engagement_pairs | "Calculate 'ask' engagements" >> beam.ParDo(calculateAskEngagement) | 'Write result to BQ' >> beam.io.WriteToBigQuery( engagement_table_spec, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, ))
"""Main function""" parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--topic', default='molecules-predictions', help='PubSub topic to subscribe for predictions.') args, pipeline_args = parser.parse_known_args() beam_options = PipelineOptions( pipeline_args, save_main_session=True, streaming=True, ) project = beam_options.view_as(GoogleCloudOptions).project if not project: parser.print_usage() print('error: argument --project is required') sys.exit(1) # We'll just log the results logging.basicConfig(level=logging.INFO) logging.info('Listening...') topic_path = 'projects/{}/topics/{}'.format(project, args.topic) with beam.Pipeline(options=beam_options) as p: _ = (p | 'Read predictions' >> beam.io.ReadFromPubSub(topic=topic_path) | 'Log' >> beam.Map(logging.info))
def run(argv=None): """Main entry point; defines and runs the hourly_team_score pipeline.""" parser = argparse.ArgumentParser() # The default maps to two large Google Cloud Storage files (each ~12GB) # holding two subsequent day's worth (roughly) of data. parser.add_argument('--input', type=str, default='gs://apache-beam-samples/game/gaming_data*.csv', help='Path to the data file(s) containing game data.') parser.add_argument('--dataset', type=str, required=True, help='BigQuery Dataset to write tables to. ' 'Must already exist.') parser.add_argument('--table_name', default='leader_board', help='The BigQuery table name. Should not already exist.') parser.add_argument('--window_duration', type=int, default=60, help='Numeric value of fixed window duration, in minutes') parser.add_argument('--start_min', type=str, default='1970-01-01-00-00', help='String representation of the first minute after ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'prior to that minute won\'t be included in the ' 'sums.') parser.add_argument('--stop_min', type=str, default='2100-01-01-00-00', help='String representation of the first minute for ' 'which to generate results in the format: ' 'yyyy-MM-dd-HH-mm. Any input data timestamped ' 'after to that minute won\'t be included in the ' 'sums.') args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) # We also require the --project option to access --dataset if options.view_as(GoogleCloudOptions).project is None: parser.print_usage() print(sys.argv[0] + ': error: argument --project is required') sys.exit(1) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=options) as p: (p # pylint: disable=expression-not-assigned | 'ReadInputText' >> beam.io.ReadFromText(args.input) | 'HourlyTeamScore' >> HourlyTeamScore( args.start_min, args.stop_min, args.window_duration) | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict()) | 'WriteTeamScoreSums' >> WriteToBigQuery( args.table_name, args.dataset, { 'team': 'STRING', 'total_score': 'INTEGER', 'window_start': 'STRING', }))
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the hourly_team_score pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
  parser.add_argument('--subscription', type=str,
                      help='Pub/Sub subscription to read from')
  parser.add_argument('--dataset', type=str, required=True,
                      help='BigQuery Dataset to write tables to. '
                      'Must already exist.')
  parser.add_argument('--table_name', default='leader_board',
                      help='The BigQuery table name. Should not already exist.')
  parser.add_argument('--team_window_duration', type=int, default=60,
                      help='Numeric value of fixed window duration for team '
                      'analysis, in minutes')
  parser.add_argument('--allowed_lateness', type=int, default=120,
                      help='Numeric value of allowed data lateness, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which are
    # extracted from the pubsub data elements, and parse the data.

    # Read from PubSub into a PCollection.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)

    events = (scores
              | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
              | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
              | 'AddEventTimestamps' >> beam.Map(
                  lambda elem: beam.window.TimestampedValue(
                      elem, elem['timestamp'])))

    # Get team scores and write the results to BigQuery
    (events  # pylint: disable=expression-not-assigned
     | 'CalculateTeamScores' >> CalculateTeamScores(
         args.team_window_duration, args.allowed_lateness)
     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name + '_teams', args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         }, options.view_as(GoogleCloudOptions).project))

    def format_user_score_sums(user_score):
      (user, score) = user_score
      return {'user': user, 'total_score': score}

    # Get user scores and write the results to BigQuery
    (events  # pylint: disable=expression-not-assigned
     | 'CalculateUserScores' >> CalculateUserScores(args.allowed_lateness)
     | 'FormatUserScoreSums' >> beam.Map(format_user_score_sums)
     | 'WriteUserScoreSums' >> WriteToBigQuery(
         args.table_name + '_users', args.dataset, {
             'user': 'STRING',
             'total_score': 'INTEGER',
         }, options.view_as(GoogleCloudOptions).project))
def run(argv=None): """Main entry point""" parser = argparse.ArgumentParser() # parser.add_argument('--project', type=str, required=False, help='project') parser.add_argument( '--records', dest='records', type=int, # default='gs://dataflow-samples/shakespeare/kinglear.txt', default='10', # gsutil cp gs://dataflow-samples/shakespeare/kinglear.txt help='Number of records to be generate') parser.add_argument('--output', dest='output', required=False, default='./', help='Output file to write results to.') # Parse arguments from the command line. known_args, pipeline_args = parser.parse_known_args(argv) # Store the CLI arguments to variables # project_id = known_args.project # Setup the dataflow pipeline options pipeline_options = PipelineOptions(pipeline_args) # pipeline_options.view_as(SetupOptions).save_main_session = True # google_cloud_options = pipeline_options.view_as(GoogleCloudOptions) # google_cloud_options.project = project_id save_main_session = True pipeline_options.view_as( SetupOptions).save_main_session = save_main_session # SCHEMA_STRING = ''' # {"namespace": "example.avro", # "type": "record", # "name": "User", # "fields": [ # {"name": "ACNO", "type": "int"}, # {"name": "PRIN_BAL", "type": "int"}, # {"name": "FEE_ANT", "default": null, "type": ["null", "double"]}, # {"name": "GENDER", "default": null, "type": ["null", {"logicalType": "char", "type": "string", "maxLength": 1}]} # ] # } # ''' SCHEMA = { "namespace": "example.avro", "type": "record", "name": "User", "fields": [{ "name": "ACNO", "type": [ "null", { "logicalType": "char", "type": "string", "maxLength": 20 } ] }, { "name": "FIELD_1", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }, { "name": "FIELD_2", "type": [ "null", { "logicalType": "char", "type": "float", "maxLength": 20 } ] }] } # {"name": "GENDER', "type": "string"} # {"name": "FEE_ANT", "type": "long"} # p = beam.Pipeline(options=pipeline_options) rec_cnt = known_args.records with beam.Pipeline(options=pipeline_options) as p: left_pcol_name = 'p1' file = p | 'read_source' >> beam.io.ReadFromAvro( "./data/account_id_schema_new.avro") p1 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_1': x["FIELD_1"] }) p2 = file | beam.Map(lambda x: { 'ACNO': x['ACNO'], 'FIELD_2': x["FIELD_2"] }) P1_1 = p1 | "write" >> beam.io.WriteToText('./data.csv') P2_2 = p2 | "write2" >> beam.io.WriteToText('./data2.csv') right_pcol_name = 'p2' join_keys = { left_pcol_name: [ 'ACNO' # 't1_col_B' ], right_pcol_name: [ 'ACNO' # 't2_col_B' ] } pipelines_dictionary = {left_pcol_name: p1, right_pcol_name: p2} test_pipeline = pipelines_dictionary | 'left join' >> Join( left_pcol_name=left_pcol_name, left_pcol=p1, right_pcol_name=right_pcol_name, right_pcol=p2, join_type='left', join_keys=join_keys) print(type(test_pipeline)) test_pipeline | "print" >> beam.io.WriteToText('./test.csv') compressIdc = True use_fastavro = True # test_pipeline | 'write_fastavro' >> WriteToAvro( known_args.output, # '/tmp/dataflow/{}/{}'.format( # 'demo', 'output'), # parse_schema(json.loads(SCHEMA_STRING)), parse_schema(SCHEMA), use_fastavro=use_fastavro, file_name_suffix='.avro', codec=('deflate' if compressIdc else 'null'), ) result = p.run() result.wait_until_finish()
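# For reference, a hedged, self-contained sketch of the 'write_fastavro' step
# above, whose opening line appears to be commented out while its arguments
# are not.  It assumes fastavro's parse_schema and Beam's WriteToAvro sink,
# and reuses the names defined in the snippet (test_pipeline, SCHEMA,
# known_args, use_fastavro, compressIdc):
from fastavro import parse_schema
from apache_beam.io.avroio import WriteToAvro

test_pipeline | 'write_fastavro' >> WriteToAvro(
    known_args.output,
    parse_schema(SCHEMA),
    use_fastavro=use_fastavro,
    file_name_suffix='.avro',
    codec='deflate' if compressIdc else 'null',
)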
def main():
  project = 'chromeperf'
  options = PipelineOptions()
  options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  options.view_as(GoogleCloudOptions).project = project
  bq_export_options = options.view_as(BqExportOptions)

  p = beam.Pipeline(options=options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

  # Read 'Job' entities from datastore.
  job_entities = (
      p
      | 'ReadFromDatastore(Job)' >> ReadTimestampRangeFromDatastore(
          {'project': project, 'kind': 'Job'},
          time_range_provider=bq_export_options.GetTimeRangeProvider(),
          timestamp_property='created'))

  def ConvertEntity(entity):
    entities_read.inc()
    try:
      row_dict = JobEntityToRowDict(entity)
    except UnconvertibleJobError:
      logging.getLogger().exception('Failed to convert Job')
      failed_entity_transforms.inc()
      return []
    return [row_dict]

  job_dicts = (
      job_entities | 'ConvertEntityToRow(Job)' >> beam.FlatMap(ConvertEntity))

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.jobs`
  (id INT64 NOT NULL,
   arguments STRING NOT NULL,
   bug_id INT64,
   comparison_mode STRING,
   gerrit STRUCT<server STRING, change_id STRING>,
   name STRING,
   tags STRING,
   user_email STRING,
   create_time TIMESTAMP NOT NULL,
   start_time TIMESTAMP,
   update_time TIMESTAMP NOT NULL,
   started BOOLEAN NOT NULL,
   done BOOLEAN NOT NULL,
   cancelled BOOLEAN NOT NULL,
   cancel_reason STRING,
   task STRING,
   exception STRING,
   exception_details STRING,
   difference_count INT64,
   retry_count INT64 NOT NULL,
   benchmark_arguments STRUCT<benchmark STRING, story STRING,
                              story_tags STRING, chart STRING,
                              statistic STRING>,
   use_execution_engine BOOLEAN NOT NULL,
   completed BOOLEAN NOT NULL,
   failed BOOLEAN NOT NULL,
   running BOOLEAN NOT NULL,
   configuration STRING)
  PARTITION BY DATE(`create_time`);
  """  # pylint: disable=pointless-string-statement
  bq_job_schema = {
      'fields': [
          {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
          {'name': 'arguments', 'type': 'STRING', 'mode': 'REQUIRED'},
          {'name': 'bug_id', 'type': 'INT64', 'mode': 'NULLABLE'},
          {'name': 'comparison_mode', 'type': 'STRING', 'mode': 'NULLABLE'},
          {
              'name': 'gerrit',
              'type': 'RECORD',
              'mode': 'NULLABLE',
              'fields': [
                  {'name': 'server', 'type': 'STRING', 'mode': 'NULLABLE'},
                  {'name': 'change_id', 'type': 'STRING', 'mode': 'NULLABLE'},
              ],
          },
          {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'tags', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'user_email', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'create_time', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
          {'name': 'start_time', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
          {'name': 'update_time', 'type': 'TIMESTAMP', 'mode': 'REQUIRED'},
          {'name': 'started', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'done', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'cancelled', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'cancel_reason', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'task', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'exception', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'exception_details', 'type': 'STRING', 'mode': 'NULLABLE'},
          {'name': 'difference_count', 'type': 'INT64', 'mode': 'NULLABLE'},
          {'name': 'retry_count', 'type': 'INT64', 'mode': 'REQUIRED'},
          {
              'name': 'benchmark_arguments',
              'type': 'RECORD',
              'mode': 'NULLABLE',
              'fields': [
                  {'name': 'benchmark', 'type': 'STRING', 'mode': 'NULLABLE'},
                  {'name': 'story', 'type': 'STRING', 'mode': 'NULLABLE'},
                  {'name': 'story_tags', 'type': 'STRING', 'mode': 'NULLABLE'},
                  {'name': 'chart', 'type': 'STRING', 'mode': 'NULLABLE'},
                  {'name': 'statistic', 'type': 'STRING', 'mode': 'NULLABLE'},
              ],
          },
          {'name': 'use_execution_engine', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'completed', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'failed', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'running', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
          {'name': 'configuration', 'type': 'STRING', 'mode': 'NULLABLE'},
      ]
  }

  # 'dataset' may be a RuntimeValueProvider, so we have to defer calculating
  # the table name until runtime.  The simplest way to do this is by passing a
  # function for the table name rather than a string.
  def TableNameFn(unused_element):
    return '{}:{}.jobs{}'.format(project, bq_export_options.dataset.get(),
                                 bq_export_options.table_suffix)

  _ = job_dicts | 'WriteToBigQuery(jobs)' >> WriteToPartitionedBigQuery(
      TableNameFn, bq_job_schema, element_to_yyyymmdd_fn=_JobToYYYYMMDD)

  result = p.run()
  result.wait_until_finish()
  PrintCounters(result)
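WriteToPartitionedBigQuery is a project-specific wrapper; the same deferred-table-name idea also works with the stock beam.io.WriteToBigQuery sink, whose table argument may be a callable evaluated at runtime. A minimal sketch with placeholder names (write_with_dynamic_table is not part of the original code):

import apache_beam as beam


def write_with_dynamic_table(job_dicts, project, dataset_provider,
                             table_suffix, schema):
  """Writes rows to a table whose name is resolved per element at runtime."""
  return job_dicts | 'WriteJobs' >> beam.io.WriteToBigQuery(
      # `table` may be a callable; it is invoked with each element, so the
      # dataset ValueProvider is only read once the job is running.
      table=lambda row: '{}:{}.jobs{}'.format(
          project, dataset_provider.get(), table_suffix),
      schema=schema,
      create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
      write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)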
def run(argv=[]):
  project_id = 'grass-clump-479'
  instance_id = 'python-write-2'
  DEFAULT_TABLE_PREFIX = "python-test"
  # table_id = DEFAULT_TABLE_PREFIX + "-" + str(uuid.uuid4())[:8]
  # table_id = 'testmillionb38c02c4'
  # table_id = 'testmillioned113e20'
  # table_id = 'testmillion2ee87b99'
  guid = str(uuid.uuid4())[:8]
  table_id = 'testboth' + guid
  jobname = 'testmillion-both-' + guid

  argv.extend([
      '--experiments=beam_fn_api',
      '--project={}'.format(project_id),
      '--instance={}'.format(instance_id),
      '--table={}'.format(table_id),
      '--projectId={}'.format(project_id),
      '--instanceId={}'.format(instance_id),
      '--tableId={}'.format(table_id),
      '--job_name={}'.format(jobname),
      '--requirements_file=requirements.txt',
      '--disk_size_gb=100',
      '--region=us-central1',
      '--runner=dataflow',
      # '--runner=directRunner',
      '--autoscaling_algorithm=NONE',
      '--num_workers=100',
      '--staging_location=gs://juantest/stage',
      '--temp_location=gs://juantest/temp',
      '--setup_file=C:\\Users\\Juan\\Project\\python\\example_bigtable_beam\\beam_bigtable_package\\setup.py',
      # '--setup_file=/usr/src/app/example_bigtable_beam/beam_bigtable_package/setup.py',
      '--extra_package=C:\\Users\\Juan\\Project\\python\\example_bigtable_beam\\beam_bigtable_package\\dist\\beam_bigtable-0.3.106.tar.gz'
      # '--extra_package=/usr/src/app/example_bigtable_beam/beam_bigtable_package/dist/beam_bigtable-0.3.30.tar.gz'
  ])
  parser = argparse.ArgumentParser(argv)
  parser.add_argument('--projectId')
  parser.add_argument('--instanceId')
  parser.add_argument('--tableId')
  (known_args, pipeline_args) = parser.parse_known_args(argv)

  create_table = CreateAll(project_id, instance_id, table_id)
  print('ProjectID:', project_id)
  print('InstanceID:', instance_id)
  print('TableID:', table_id)
  print('JobID:', jobname)
  create_table.create_table()

  pipeline_options = PipelineOptions(argv)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  row_count = 10000
  row_limit = 100
  row_step = row_count if row_count <= row_limit else row_count // row_limit

  with beam.Pipeline(options=pipeline_options) as p:
    second_step = (
        p
        | 'Ranges' >> beam.Create(
            [(str(i), str(i + row_step))
             for i in range(0, row_count, row_step)])
        | 'Group' >> beam.GroupByKey()
        | 'Generate' >> beam.ParDo(GenerateRow())
        | 'Write' >> WriteToBigTable(
            project_id=project_id,
            instance_id=instance_id,
            table_id=table_id)
        | 'BigtableFromRead' >> ReadFromBigTable_Read(
            project_id=project_id,
            instance_id=instance_id,
            table_id=table_id))
    count = (second_step | 'Count' >> beam.combiners.Count.Globally())
    row_count = 10000
    assert_that(count, equal_to([row_count]))
  # The `with` block runs the pipeline on exit, so no explicit p.run() is
  # needed here; the assert_that check is verified when the run completes.
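GenerateRow, CreateAll, and ReadFromBigTable_Read come from the author's beam_bigtable package. As a hedged illustration only, a DoFn feeding WriteToBigTable would emit google.cloud.bigtable DirectRow objects, roughly like this; the class name, column family, and cell contents below are hypothetical.

import datetime

import apache_beam as beam
from google.cloud.bigtable import row as bigtable_row


class GenerateRowSketch(beam.DoFn):
  """Turns each (start_key, values) group into a Bigtable DirectRow."""

  def process(self, element):
    start_key, _ = element
    direct_row = bigtable_row.DirectRow(row_key=start_key.encode('utf-8'))
    # Column family 'cf1' and the cell contents are placeholders.
    direct_row.set_cell(
        'cf1', b'field1', b'value', timestamp=datetime.datetime.utcnow())
    yield direct_row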
def run(argv=None, save_main_session=True): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session pipeline_options.view_as(StandardOptions).streaming = True with beam.Pipeline(options=pipeline_options) as p: # Read from PubSub into a PCollection. if known_args.input_subscription: messages = (p | beam.io.ReadFromPubSub( subscription=known_args.input_subscription). with_output_types(bytes)) else: messages = ( p | beam.io.ReadFromPubSub( topic=known_args.input_topic).with_output_types(bytes)) lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8')) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.FixedWindows(15, 0)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = ( counts | 'format' >> beam.Map(format_result) | 'encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)) # Write to PubSub. # pylint: disable=expression-not-assigned output | beam.io.WriteToPubSub(known_args.output_topic)
def main(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  def count_ones(word_ones):
    (word, ones) = word_ones
    return (word, sum(ones))

  counts = (
      lines
      | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
      | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
      | 'group' >> beam.GroupByKey()
      | 'count' >> beam.Map(count_ones))

  # Format the counts into a PCollection of strings.
  def format_result(word_count):
    (word, count) = word_count
    return '%s: %d' % (word, count)

  output = counts | 'format' >> beam.Map(format_result)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run.
  if (not hasattr(result, 'has_job')    # direct runner
      or result.has_job):               # not just a template creation
    empty_lines_filter = MetricsFilter().with_name('empty_lines')
    query_result = result.metrics().query(empty_lines_filter)
    if query_result['counters']:
      empty_lines_counter = query_result['counters'][0]
      logging.info('number of empty lines: %d', empty_lines_counter.result)

    word_lengths_filter = MetricsFilter().with_name('word_len_dist')
    query_result = result.metrics().query(word_lengths_filter)
    if query_result['distributions']:
      word_lengths_dist = query_result['distributions'][0]
      logging.info('average word length: %d', word_lengths_dist.result.mean)
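This variant queries the empty_lines counter and the word_len_dist distribution, so its WordExtractingDoFn presumably registers those metrics. A hedged sketch of such a DoFn; the metric names match the queries above, but the body is an assumption rather than the author's code.

import re

import apache_beam as beam
from apache_beam.metrics import Metrics


class WordExtractingDoFn(beam.DoFn):
  """Splits lines into words while recording the metrics queried above."""

  def __init__(self):
    super().__init__()
    self.empty_line_counter = Metrics.counter(self.__class__, 'empty_lines')
    self.word_len_dist = Metrics.distribution(self.__class__, 'word_len_dist')

  def process(self, element):
    text_line = element.strip()
    if not text_line:
      self.empty_line_counter.inc(1)
    words = re.findall(r"[\w\']+", text_line, re.UNICODE)
    for word in words:
      self.word_len_dist.update(len(word))
    return words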
def run(argv=None): """Runs the workflow.""" known_args, pipeline_args = parse_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True input_info = known_args.input with TestPipeline(options=pipeline_options) as p: source = SyntheticSource(input_info) # pylint: disable=expression-not-assigned barrier = known_args.barrier pc_list = [] num_roots = 2**(len(known_args.steps) - 1) if (barrier == 'merge-gbk' or barrier == 'merge-side-input') else 1 for read_no in range(num_roots): pc_list.append((p | ('Read %d' % read_no) >> beam.io.Read(source))) for step_no, steps in enumerate(known_args.steps): if step_no != 0: new_pc_list = [] for pc_no, pc in enumerate(pc_list): if barrier == 'shuffle': new_pc_list.append( (pc | ('shuffle %d.%d' % (step_no, pc_no)) >> ShuffleBarrier())) elif barrier == 'side-input': new_pc_list.append( (pc | ('side-input %d.%d' % (step_no, pc_no)) >> SideInputBarrier())) elif barrier == 'expand-gbk': new_pc_list.extend( expand_using_gbk( ('expand-gbk %d.%d' % (step_no, pc_no)), pc)) elif barrier == 'expand-second-output': new_pc_list.extend( expand_using_second_output( ('expand-second-output %d.%d' % (step_no, pc_no)), pc)) elif barrier == 'merge-gbk': if pc_no % 2 == 0: new_pc_list.append( merge_using_gbk( ('merge-gbk %d.%d' % (step_no, pc_no)), pc, pc_list[pc_no + 1])) else: continue elif barrier == 'merge-side-input': if pc_no % 2 == 0: new_pc_list.append( merge_using_side_input( ('merge-side-input %d.%d' % (step_no, pc_no)), pc, pc_list[pc_no + 1])) else: continue pc_list = new_pc_list new_pc_list = [] for pc_no, pc in enumerate(pc_list): new_pc = pc | 'SyntheticStep %d.%d' % ( step_no, pc_no) >> beam.ParDo( SyntheticStep( per_element_delay_sec=steps['per_element_delay'], per_bundle_delay_sec=steps['per_bundle_delay'], output_records_per_input_record=steps[ 'output_records_per_input_record'], output_filter_ratio=steps['output_filter_ratio'])) new_pc_list.append(new_pc) pc_list = new_pc_list if known_args.output: # If an output location is provided we format and write output. if len(pc_list) == 1: (pc_list[0] | 'FormatOutput' >> beam.Map(lambda elm: (elm[0] + elm[1])) | 'WriteOutput' >> WriteToText(known_args.output)) logging.info('Pipeline run completed.')
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        metric_types: List[str],
        state_code: Optional[str],
        person_filter_ids: Optional[List[int]]):
  """Runs the recidivism calculation pipeline."""

  # Workaround to load SQLAlchemy objects at start of pipeline. This is
  # necessary because the BuildRootEntity function tries to access attributes
  # of relationship properties on the SQLAlchemy room_schema_class before they
  # have been loaded. However, if *any* SQLAlchemy objects have been
  # instantiated, then the relationship properties are loaded and their
  # attributes can be successfully accessed.
  _ = schema.StatePerson()

  apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

  # Get pipeline job details.
  all_pipeline_options = apache_beam_pipeline_options.get_all_options()

  query_dataset = all_pipeline_options['project'] + '.' + data_input
  reference_dataset = all_pipeline_options['project'] + '.' + reference_input

  person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

  with beam.Pipeline(options=apache_beam_pipeline_options) as p:
    # Get StatePersons
    persons = (
        p | 'Load Persons' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StatePerson,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set))

    # Get StateIncarcerationPeriods
    incarceration_periods = (
        p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

    # Get StateSupervisionViolations
    supervision_violations = (
        p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

    # TODO(2769): Don't bring this in as a root entity
    # Get StateSupervisionViolationResponses
    supervision_violation_responses = (
        p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
            dataset=query_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code))

    # Group StateSupervisionViolationResponses and
    # StateSupervisionViolations by person_id
    supervision_violations_and_responses = (
        {
            'violations': supervision_violations,
            'violation_responses': supervision_violation_responses
        }
        | 'Group StateSupervisionViolationResponses to '
          'StateSupervisionViolations' >> beam.CoGroupByKey())

    # Set the fully hydrated StateSupervisionViolation entities on
    # the corresponding StateSupervisionViolationResponses
    violation_responses_with_hydrated_violations = (
        supervision_violations_and_responses
        | 'Set hydrated StateSupervisionViolations on '
          'the StateSupervisionViolationResponses' >> beam.ParDo(
              SetViolationOnViolationsResponse()))

    # Group StateIncarcerationPeriods and StateSupervisionViolationResponses
    # by person_id
    incarceration_periods_and_violation_responses = (
        {
            'incarceration_periods': incarceration_periods,
            'violation_responses': violation_responses_with_hydrated_violations
        }
        | 'Group StateIncarcerationPeriods to '
          'StateSupervisionViolationResponses' >> beam.CoGroupByKey())

    # Set the fully hydrated StateSupervisionViolationResponse entities on
    # the corresponding StateIncarcerationPeriods
    incarceration_periods_with_source_violations = (
        incarceration_periods_and_violation_responses
        | 'Set hydrated StateSupervisionViolationResponses on '
          'the StateIncarcerationPeriods' >> beam.ParDo(
              SetViolationResponseOnIncarcerationPeriod()))

    # Group each StatePerson with their StateIncarcerationPeriods
    person_and_incarceration_periods = (
        {
            'person': persons,
            'incarceration_periods': incarceration_periods_with_source_violations
        }
        | 'Group StatePerson to StateIncarcerationPeriods' >>
        beam.CoGroupByKey())

    # Bring in the table that associates people and their county of residence
    person_id_to_county_query = select_all_by_person_query(
        reference_dataset,
        PERSONS_TO_RECENT_COUNTY_OF_RESIDENCE_VIEW_NAME,
        # TODO(3602): Once we put state_code on StatePerson objects, we can
        # update the persons_to_recent_county_of_residence query to have a
        # state_code field, allowing us to also filter the output by
        # state_code.
        state_code_filter=None,
        person_id_filter_set=person_id_filter_set)

    person_id_to_county_kv = (
        p
        | "Read person_id to county associations from BigQuery" >>
        beam.io.Read(
            beam.io.BigQuerySource(
                query=person_id_to_county_query, use_standard_sql=True))
        | "Convert person_id to county association table to KV" >>
        beam.ParDo(ConvertDictToKVTuple(), 'person_id'))

    # Identify ReleaseEvents events from the StatePerson's
    # StateIncarcerationPeriods
    person_events = (
        person_and_incarceration_periods
        | "ClassifyReleaseEvents" >> beam.ParDo(
            ClassifyReleaseEvents(), AsDict(person_id_to_county_kv)))

    # Get pipeline job details for accessing job_id
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    # Add timestamp for local jobs
    job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
    all_pipeline_options['job_timestamp'] = job_timestamp

    # Get the type of metric to calculate
    metric_types_set = set(metric_types)

    # Get recidivism metrics
    recidivism_metrics = (
        person_events
        | 'Get Recidivism Metrics' >> GetRecidivismMetrics(
            pipeline_options=all_pipeline_options,
            metric_types=metric_types_set))

    if person_id_filter_set:
      logging.warning(
          "Non-empty person filter set - returning before writing metrics.")
      return

    # Convert the metrics into a format that's writable to BQ
    writable_metrics = (
        recidivism_metrics
        | 'Convert to dict to be written to BQ' >> beam.ParDo(
            RecidivismMetricWritableDict()).with_outputs('rates', 'counts'))

    # Write the recidivism metrics to the output tables in BigQuery
    rates_table_id = DATAFLOW_METRICS_TO_TABLES.get(
        ReincarcerationRecidivismRateMetric)
    counts_table_id = DATAFLOW_METRICS_TO_TABLES.get(
        ReincarcerationRecidivismCountMetric)

    _ = (writable_metrics.rates
         | f"Write rate metrics to BQ table: {rates_table_id}" >>
         beam.io.WriteToBigQuery(
             table=rates_table_id,
             dataset=output,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             method=beam.io.WriteToBigQuery.Method.FILE_LOADS))

    _ = (writable_metrics.counts
         | f"Write count metrics to BQ table: {counts_table_id}" >>
         beam.io.WriteToBigQuery(
             table=counts_table_id,
             dataset=output,
             create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             method=beam.io.WriteToBigQuery.Method.FILE_LOADS))
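ConvertDictToKVTuple is project-specific; a hedged sketch of a DoFn with that shape, which keys each BigQuery row dict by a field name passed as a ParDo argument (the class body is an assumption, not the project's implementation):

import apache_beam as beam


class ConvertDictToKVTupleSketch(beam.DoFn):
  """Keys each row dict by the field name passed as a ParDo argument."""

  def process(self, element, key_field):
    # Skip rows that lack the key field rather than emitting a None key.
    if element.get(key_field) is None:
      return
    yield (element[key_field], element)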